# Merge 5 twitter datasets and preprocess

In [149]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import os
import csv
import seaborn as sns

import nltk
import nltk.corpus # sample text for performing tokenization
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

nltk.download('all')

[nltk_data] Downloading collection 'all'
[nltk_data]    | 
[nltk_data]    | Downloading package abc to /Users/dominika-schreyer-
[nltk_data]    |     macbook-pro/nltk_data...
[nltk_data]    |   Package abc is already up-to-date!
[nltk_data]    | Downloading package alpino to /Users/dominika-
[nltk_data]    |     schreyer-macbook-pro/nltk_data...
[nltk_data]    |   Package alpino is already up-to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger to
[nltk_data]    |     /Users/dominika-schreyer-macbook-pro/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger is already up-
[nltk_data]    |       to-date!
[nltk_data]    | Downloading package averaged_perceptron_tagger_ru to
[nltk_data]    |     /Users/dominika-schreyer-macbook-pro/nltk_data...
[nltk_data]    |   Package averaged_perceptron_tagger_ru is already
[nltk_data]    |       up-to-date!
[nltk_data]    | Downloading package basque_grammars to
[nltk_data]    |     /Users/dominika-schreyer-macbook-pr

True

In [86]:
# Importing all datasets

racism_df = pd.read_csv('data/twitter_racism_parsed_dataset.csv',index_col=False)
general_df = pd.read_csv('data/twitter_parsed_dataset.csv',index_col=False)
sexism_df = pd.read_csv('data/twitter_sexism_parsed_dataset.csv',index_col=False)
toxity_df = pd.read_csv('data/toxicity_parsed_dataset.csv',index_col=False)
aggression_df = pd.read_csv('data/aggression_parsed_dataset.csv',index_col=False)
attack_df = pd.read_csv('data/attack_parsed_dataset.csv',index_col=False)

In [51]:
# Synchronize format for combination

toxity_df['Annotation'] = np.where(toxity_df['oh_label']!= 0, 'toxity', 'none')
aggression_df['Annotation'] = np.where(aggression_df['oh_label']!= 0, 'aggression', 'none')
attack_df['Annotation'] = np.where(attack_df['oh_label']!= 0, 'attack', 'none')

attack_df.head()

Unnamed: 0,index,Text,ed_label_0,ed_label_1,oh_label,Annotation
0,0,`- This is not ``creative``. Those are the di...,1.0,0.0,0,none
1,1,` :: the term ``standard model`` is itself le...,1.0,0.0,0,none
2,2,"True or false, the situation as of March 200...",1.0,0.0,0,none
3,3,"Next, maybe you could work on being less cond...",0.555556,0.444444,0,none
4,4,This page will need disambiguation.,1.0,0.0,0,none


In [69]:
# Combine all datasets
del(twitter_df)
twitterList =[]
twitterList.extend(value for name, value in locals().items() if name.endswith('_df'))

twitter_df = pd.concat(twitterList, ignore_index=True)


In [70]:
twitter_df.head()

Unnamed: 0,index,id,Text,Annotation,oh_label,ed_label_0,ed_label_1
0,5.76749336190525e+17,5.76749336190525e+17,@AAlwuhaib1977 Muslim mob violence against Hin...,racism,1.0,,
1,5.4089053338916096e+17,5.4089053338916096e+17,@Te4m_NiGhtM4Re http://t.co/5Ih7MkDbQG,none,0.0,,
2,5.678433203381249e+17,5.678433203381249e+17,@jncatron @isra_jourisra @AMPalestine Islamoph...,racism,1.0,,
3,5.76646151631327e+17,5.76646151631327e+17,"Finally I'm all caught up, and that sudden dea...",none,0.0,,
4,5.7134919561068096e+17,5.7134919561068096e+17,@carolinesinders @herecomesfran *hugs*,none,0.0,,


In [71]:
twitter_df = twitter_df[['Text','Annotation','oh_label','ed_label_0','ed_label_1']]

In [72]:
# Identify duplicates

print(len(twitter_df['Text'].unique()))
print(len(twitter_df.groupby(['Text', 'oh_label']).first()))
print(len(twitter_df.groupby(['Text', 'oh_label','Annotation']).first()))
print(len(twitter_df))

# Drop duplicates with same content, label and annotation, assuming that ambiguous annotation / label can be learned in multiple categories
#twitter_df.groupby(['Text', 'oh_label','Annotation']).first().head()
twitter_df = twitter_df.drop_duplicates(
  subset = ['Text', 'oh_label','Annotation'],
  keep = 'last').reset_index(drop = True) # use last because those have label confidence


213143
219590
244118
436617


In [76]:
twitter_df.columns= twitter_df.columns.str.lower()
twitter_df[twitter_df['text'] == ""] 

#twitter_df.dropna(subset=['text'])
twitter_df = twitter_df.dropna(how='all')
twitter_df

Unnamed: 0,text,annotation,oh_label,ed_label_0,ed_label_1
0,"@g0ssipsquirrelx Wrong, ISIS follows the examp...",racism,1.0,,
1,@SirajZarook @OdiniaInvictus @BilalIGhumman @I...,racism,1.0,,
2,"@scamp_faridxx @AbuAlbaraaSham Yeah, it's call...",racism,1.0,,
3,@Asadumarfans You are a Muslim. You are brain ...,racism,1.0,,
4,@harmlesstree2 @MaxBlumenthal If you want to u...,racism,1.0,,
...,...,...,...,...,...
244116,` These sources don't exactly exude a sense ...,none,0.0,0.888889,0.111111
244117,The Institute for Historical Review is a pee...,none,0.0,0.900000,0.100000
244118,:The way you're trying to describe it in this...,none,0.0,1.000000,0.000000
244119,== Warning == There is clearly a protection...,none,0.0,0.800000,0.200000


In [80]:
twitter_df = twitter_df.replace('&amp;','&', regex=True)

In [84]:
# Move Hashtags into a column
twitter_df['hashtags'] = twitter_df.text.apply(lambda x: [x for x in x.split(" ") if x.startswith("#")])

# replace all hashtags, ampersands, and character references with no space
twitter_df.hashtags = twitter_df.hashtags.apply(lambda x: re.sub(r'[#@]+', '', str(x)))

twitter_df.head(n=20)

Unnamed: 0,text,annotation,oh_label,ed_label_0,ed_label_1,hashtags
0,"@g0ssipsquirrelx Wrong, ISIS follows the examp...",racism,1.0,,,[]
1,@SirajZarook @OdiniaInvictus @BilalIGhumman @I...,racism,1.0,,,[]
2,"@scamp_faridxx @AbuAlbaraaSham Yeah, it's call...",racism,1.0,,,[]
3,@Asadumarfans You are a Muslim. You are brain ...,racism,1.0,,,[]
4,@harmlesstree2 @MaxBlumenthal If you want to u...,racism,1.0,,,[]
5,"@watan71969 You are a total liar, just like yo...",racism,1.0,,,[]
6,@Rudd1971 Both Daesh and Shia Militia are driv...,racism,1.0,,,[]
7,@DianH4 @ExposeFalsehood If the Muslim world e...,racism,1.0,,,[]
8,@DianH4 Islam doesn't answer anything. It pre...,racism,1.0,,,[]
9,@greenlinerzjm You should be attacking everyon...,racism,1.0,,,[]


In [134]:
# Create stopword list:
mystopwords = set(stopwords.words('english'))

#add words that aren't in the NLTK stopwords list
#mystopwords_new = ["f", "u", "r","t"]
#mystopwords = mystopwords.union(mystopwords_new)

#remove words that are in NLTK stopwords list
#not_stopwords = {"not", "didn't", "no"} 
#mystopwords = set([word for word in mystopwords if word not in not_stopwords])

# create preprocess_text function
def preprocess_text(text):

    # Tokenize the text & remove twitter accounts
    tknzr = TweetTokenizer(strip_handles=True)
    tokens = tknzr.tokenize(text.lower())

    # Remove stop words
    filtered_tokens = [token for token in tokens if token not in mystopwords]

    # Lemmatize the tokens

    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    # Join the tokens back into a string

    processed_text = ' '.join(lemmatized_tokens)

    # remove urls
    #processed_text = processed_text.replace('^(http|https)://', '')
    processed_text = re.sub(r'(http|https)://[\S]+', '',processed_text)

    # remove numbers
    processed_text = re.sub(r'(?<!\S)(?=.)(0|([1-9](\d*|\d{0,2}(,\d{3})*)))?(\.\d*[1-9])?(?!\S)', '',processed_text)

    # remove special characters
    processed_text = re.sub(r'[#@&][\S]+', '',processed_text)
    processed_text = re.sub(r'[#@&$“”".,’]', '',processed_text)
    processed_text = re.sub(r'\b\d[\S]+', '',processed_text)
    processed_text = re.sub('[^a-zA-Z]', ' ', processed_text)
    processed_text = re.sub(r'\s+', ' ', processed_text) # spaces
    processed_text = re.sub(r'\\b[A-Za-z] \\b|\\b [A-Za-z]\\b', ' ', processed_text) # single letters
    
    processed_text =" ".join(processed_text.split())
    processed_text =" ".join([w for w in processed_text.split() if len(w)>1])
    
    return processed_text

# apply the function df

twitter_df['tokenized'] = twitter_df['text'].apply(preprocess_text)
twitter_df.head(n=20)

Unnamed: 0,text,annotation,oh_label,ed_label_0,ed_label_1,hashtags,tokenized
0,"@g0ssipsquirrelx Wrong, ISIS follows the examp...",racism,1.0,,,[],wrong isi follows example mohammed quran exactly
1,@SirajZarook @OdiniaInvictus @BilalIGhumman @I...,racism,1.0,,,[],good muslim good despite bad religion
2,"@scamp_faridxx @AbuAlbaraaSham Yeah, it's call...",racism,1.0,,,[],yeah called caring human life idiot something ...
3,@Asadumarfans You are a Muslim. You are brain ...,racism,1.0,,,[],muslim brain dead repeat others said million time
4,@harmlesstree2 @MaxBlumenthal If you want to u...,racism,1.0,,,[],want understand lie muslim living peace jew re...
5,"@watan71969 You are a total liar, just like yo...",racism,1.0,,,[],total liar like pedophile prophet un soldier b...
6,@Rudd1971 Both Daesh and Shia Militia are driv...,racism,1.0,,,[],daesh shia militia driven religion hatred bigo...
7,@DianH4 @ExposeFalsehood If the Muslim world e...,racism,1.0,,,[],muslim world ever produced anything tyrant dic...
8,@DianH4 Islam doesn't answer anything. It pre...,racism,1.0,,,[],islam answer anything pretend answer illogical...
9,@greenlinerzjm You should be attacking everyon...,racism,1.0,,,[],attacking everyone follows religious cult hate...


In [177]:
# Save processed file
twitter_df.to_csv("data/twitter_all_data.csv")
