In [1]:
# dependencies
import pandas as pd
import tweepy
import json
import requests
import os
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import zipfile

ModuleNotFoundError: No module named 'tweepy'

## 1. Hate Speech Twitter Annotations
***

In [None]:
# url for labeled tweet ids
url = 'https://github.com/ZeerakW/hatespeech/archive/master.zip'

# fetch the archive and stop on an HTTP error, so an error page is never saved as a zip
response = requests.get(url)
response.raise_for_status()

# create folder 'data' (no-op when it already exists)
folder_name = 'data'
os.makedirs(folder_name, exist_ok=True)

# save the archive; the same path variable is reused for extraction so the
# file that is written is exactly the file that is opened
zip_path = os.path.join(folder_name, 'master.zip')
with open(zip_path, mode='wb') as file:
    file.write(response.content)

# extract the archive into the data folder
with zipfile.ZipFile(zip_path, 'r') as zipf:
    zipf.extractall(folder_name)

In [None]:
# load the labeled tweet ids, naming the two columns explicitly
label_columns = ['id', 'label']
labeled_ids = pd.read_csv(
    'data/hatespeech-master/NAACL_SRW_2016.csv',
    names=label_columns,
)

# show the first two rows as a quick check
labeled_ids.head(2)

In [None]:
# Twitter Developer credentials.
# Each value is read from the environment when the variable is set, which keeps
# real secrets out of the saved notebook; otherwise the '###' placeholder is kept
# and has to be replaced by hand before running.
consumer_key = os.environ.get('TWITTER_CONSUMER_KEY', '###')
consumer_secret = os.environ.get('TWITTER_CONSUMER_SECRET', '###')
access_token = os.environ.get('TWITTER_ACCESS_TOKEN', '###')
access_secret = os.environ.get('TWITTER_ACCESS_SECRET', '###')

# authenticate as per tweepy docs
auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_token, access_secret)

# create api object that waits when the rate limit is hit
# NOTE(review): `wait_on_rate_limit_notify` appears to have been removed in
# Tweepy 4.x -- confirm against the installed version before running
api = tweepy.API(auth, wait_on_rate_limit = True, wait_on_rate_limit_notify = True)

In [None]:
# counters for successful (i) and failed (j) lookups, plus the ids that failed
i = 0
j = 0
failed_ids = []
total = len(labeled_ids.id)

# write one JSON object per line for every tweet the api returns
with open('data/tweets.txt', 'w') as outfile:
    for tweet_id in labeled_ids.id:

        # a few tweet IDs in the archive may have been deleted, so a failed
        # lookup is recorded and skipped; `Exception` (not a bare except) is
        # caught so that interrupting the kernel still stops this long loop
        try:
            tweet = api.get_status(tweet_id, tweet_mode = 'extended')
        except Exception:
            failed_ids.append(tweet_id)
            j = j+1
            print(f'Failed: {tweet_id} | {j} of {total}')
            continue

        # count first so the progress line is 1-based, like the failure branch
        i = i+1
        print(f'Success: {tweet_id} | {i} of {total}')

        # dump the json object corresponding to the tweet collected from the api
        json.dump(tweet._json, outfile)
        outfile.write('\n')
print(f'Number of Successful Tweets Querried: {i}')
print(f'Number of Failed Queries: {j}')

In [None]:
# parse 'data/tweets.txt' line by line; every line holds one json object
with open('data/tweets.txt') as tweets_file:
    data = [json.loads(raw_line) for raw_line in tweets_file]

# turn the parsed records into a frame
df_api = pd.DataFrame(data)

# keep only the tweet id and its full text
columns_of_interest = ['id', 'full_text']
df_api = df_api[columns_of_interest]

In [None]:
# preview the first two rows of the id / full_text frame
df_api.head(2)

In [None]:
# attach the retrieved tweet text to the labeled ids; a left join keeps every labeled id
df = pd.merge(labeled_ids, df_api, on='id', how='left')

# rows with any missing value are ids whose tweet could not be retrieved; remove them
df.dropna(axis=0, how='any', inplace=True)

In [None]:
# preview the merged frame after rows with missing values were dropped
df.head()

In [None]:
# collapse the annotation labels into two classes: 'sexism' and 'racism' both become 'Offensive'
binary_classes = {
    'none': 'Non-offensive',
    'sexism': 'Offensive',
    'racism': 'Offensive',
}
df['label'] = df['label'].map(binary_classes)

In [None]:
# write the labeled tweets to disk without the index column
df.to_csv('labeled_tweets.csv', index=False)

## 2. Hate Speech and Offensive Language Detection
***

In [None]:
# url for GitHub dataset
url = 'https://github.com/t-davidson/hate-speech-and-offensive-language/raw/master/data/labeled_data.csv'

# fetch the file and stop on an HTTP error, so an error page is never saved as the csv
response = requests.get(url)
response.raise_for_status()

# create folder 'data' (no-op when it already exists)
folder_name = 'data'
os.makedirs(folder_name, exist_ok=True)

# save the downloaded csv
with open(os.path.join(folder_name, 'public_data.csv'), mode = 'wb') as file:
    file.write(response.content)

In [None]:
# read in the data
df2 = pd.read_csv('data/public_data.csv')

# keep the class label and the tweet text, selected by name rather than by
# position so a change in the file's column layout cannot silently pick other
# columns; .copy() gives an independent frame, so later column assignments do
# not operate on a slice of the full table
# NOTE(review): assumes 'class' and 'tweet' are the last two columns of the file,
# in that order (what the previous positional selection picked) -- confirm
df2 = df2[['class', 'tweet']].copy()
df2.head()

In [None]:
# translate the numeric classes into labels: 0 and 1 become 'Offensive', 2 becomes 'Non-offensive'
class_to_label = {0: 'Offensive', 1: 'Offensive', 2: 'Non-offensive'}
df2['class'] = df2['class'].map(class_to_label)

# rename the columns to 'label' and 'full_text'
new_column_names = {'class': 'label', 'tweet': 'full_text'}
df2.rename(columns=new_column_names, inplace=True)

In [None]:
# preview the frame after the class mapping and column rename
df2.head()

In [None]:
# write the labeled public dataset to disk without the index column
df2.to_csv('public_data_labeled.csv', index=False)