# Merge 6 datasets (3 Twitter + toxicity, aggression, attack) and preprocess

In [None]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import os
import csv
import seaborn as sns

import nltk
import nltk.corpus # sample text for performing tokenization
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator

# Download every NLTK data package up front (a large download).
# NOTE(review): the cells visible here only appear to need the stopwords corpus and
# the WordNet data — confirm before narrowing this to specific packages.
nltk.download('all')

In [2]:
# Load the six parsed source datasets from the local data/ folder.
# index_col=False stops pandas from using the first CSV column as the index.

def _read_dataset(csv_path):
    """Read one parsed dataset CSV without promoting any column to the index."""
    return pd.read_csv(csv_path, index_col=False)

racism_df = _read_dataset('data/twitter_racism_parsed_dataset.csv')
general_df = _read_dataset('data/twitter_parsed_dataset.csv')
sexism_df = _read_dataset('data/twitter_sexism_parsed_dataset.csv')
toxity_df = _read_dataset('data/toxicity_parsed_dataset.csv')
aggression_df = _read_dataset('data/aggression_parsed_dataset.csv')
attack_df = _read_dataset('data/attack_parsed_dataset.csv')

In [3]:
# Row count of the sexism dataset as loaded, before any filtering
len(sexism_df)

14881

In [4]:
# Keep only the sexism rows that carry an oh_label, then re-check the row count
sexism_df = sexism_df.dropna(subset=['oh_label'])

len(sexism_df)


14878

In [5]:
# Give the toxicity / aggression / attack frames an 'Annotation' column,
# derived from oh_label: any value other than 0 becomes the dataset's own
# label, 0 becomes 'none'.

for frame, label in (
    (toxity_df, 'toxity'),
    (aggression_df, 'aggression'),
    (attack_df, 'attack'),
):
    frame['Annotation'] = np.where(frame['oh_label'] != 0, label, 'none')

attack_df.head()

Unnamed: 0,index,Text,ed_label_0,ed_label_1,oh_label,Annotation
0,0,`- This is not ``creative``. Those are the di...,1.0,0.0,0,none
1,1,` :: the term ``standard model`` is itself le...,1.0,0.0,0,none
2,2,"True or false, the situation as of March 200...",1.0,0.0,0,none
3,3,"Next, maybe you could work on being less cond...",0.555556,0.444444,0,none
4,4,This page will need disambiguation.,1.0,0.0,0,none


In [6]:
def annotation_encoder(df, enc):
    """Return a copy of ``df`` with a string category code in ``Cat_enc``.

    Rows whose ``Annotation`` equals the string ``'none'`` get ``'0'``; every
    other row (including rows with a missing annotation, since NaN != 'none')
    gets ``str(enc)``.

    The input frame is left untouched: the result is built with ``assign``
    rather than by writing a column into ``df``. Writing into ``df`` mutated
    the caller's object and, when ``df`` is itself a filtered slice of
    another frame, is the chained-assignment pattern pandas warns about.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an ``Annotation`` column.
    enc : int or str
        Code to use for annotated rows; stored as ``str(enc)``.

    Returns
    -------
    pandas.DataFrame
        New frame with the added (or replaced) ``Cat_enc`` column.
    """
    is_annotated = df['Annotation'] != 'none'
    return df.assign(Cat_enc=np.where(is_annotated, str(enc), str(0)))

# Category codes per source: 1 racism, 2 sexism, 3 toxity, 4 aggression, 5 attack
# ('0' is used for rows annotated 'none').
racism_df = annotation_encoder(df=racism_df, enc=1)
sexism_df = annotation_encoder(df=sexism_df, enc=2)
toxity_df = annotation_encoder(df=toxity_df, enc=3)
aggression_df = annotation_encoder(df=aggression_df, enc=4)
attack_df = annotation_encoder(df=attack_df, enc=5)

In [7]:
# The general frame mixes racism and sexism rows, so code each row from its
# own annotation: 'racism' -> '1', 'sexism' -> '2', anything else -> '0'.
general_df['Cat_enc'] = np.select(
    [general_df['Annotation'] == 'racism', general_df['Annotation'] == 'sexism'],
    [str(1), str(2)],
    default=str(0),
)
general_df.head()

Unnamed: 0,index,id,Text,Annotation,oh_label,Cat_enc
0,5.74948705591165e+17,5.74948705591165e+17,@halalflaws @biebervalue @greenlinerzjm I read...,none,0.0,0
1,5.71917888690393e+17,5.71917888690393e+17,@ShreyaBafna3 Now you idiots claim that people...,none,0.0,0
2,3.90255841338601e+17,3.90255841338601e+17,"RT @Mooseoftorment Call me sexist, but when I ...",sexism,1.0,2
3,5.68208850655916e+17,5.68208850655916e+17,"@g0ssipsquirrelx Wrong, ISIS follows the examp...",racism,1.0,1
4,5.75596338802373e+17,5.75596338802373e+17,#mkr No No No No No No,none,0.0,0


In [9]:
# Combine all datasets
# The source frames are listed explicitly instead of scanning the namespace
# for names ending in "_df": that scan also picks up twitter_df itself when
# this cell is re-run (and any other *_df variable), silently duplicating rows.
# The order is the order in which the frames were first created. It matters
# because the duplicate removal further down keeps the LAST occurrence of a row.
twitterList = [
    racism_df,
    general_df,
    sexism_df,
    toxity_df,
    aggression_df,
    attack_df,
]

twitter_df = pd.concat(twitterList, ignore_index=True)

# Drop rows that have no oh_label
twitter_df = twitter_df[twitter_df['oh_label'].notna()]


In [10]:
# Narrow the merged frame to the columns used from here on
KEPT_COLUMNS = ['Text', 'Annotation', 'oh_label', 'Cat_enc', 'ed_label_0', 'ed_label_1']
twitter_df = twitter_df.loc[:, KEPT_COLUMNS]

In [12]:
# Identify duplicates. Printed in order: distinct texts, distinct
# (Text, oh_label) groups, distinct (Text, oh_label, Annotation) groups,
# and the total number of rows.

print(twitter_df['Text'].nunique(dropna=False))
for key_columns in (['Text', 'oh_label'], ['Text', 'oh_label', 'Annotation']):
    print(len(twitter_df.groupby(key_columns).size()))
print(len(twitter_df))

# Collapse rows that share the same Text and oh_label, keeping the last one
# (the later rows are the ones that have label confidence).
# NOTE(review): Annotation is not part of the key, so the same text with the
# same label but different annotations is reduced to a single row — confirm
# this is intended.
twitter_df = twitter_df.drop_duplicates(
    subset=['Text', 'oh_label'],
    keep='last',
    ignore_index=True,
)


213140
219590
244118
244118


In [13]:
# Lower-case the column names, drop rows in which every column is missing,
# and turn a missing text into an empty string so later string operations
# can run on every row.
twitter_df = (
    twitter_df
    .rename(columns=str.lower)
    .dropna(how='all')
    .assign(text=lambda frame: frame['text'].fillna(""))
)


In [14]:
# Turn the HTML-escaped ampersand back into a plain '&' wherever it occurs
# in the frame's string values
twitter_df = twitter_df.replace(to_replace='&amp;', value='&', regex=True)


In [None]:
# Collect the hashtags of each text into their own column: every
# space-separated word that starts with '#'.
twitter_df['hashtags'] = twitter_df['text'].apply(
    lambda tweet: [word for word in tweet.split(" ") if word.startswith("#")]
)

# Store the list as its string form with all '#' and '@' characters removed
# (the column therefore holds text such as "['tag']", not a Python list).
twitter_df['hashtags'] = twitter_df['hashtags'].apply(
    lambda tags: re.sub(r'[#@]+', '', str(tags))
)

twitter_df.head(n=20)


In [None]:
# English stop words from NLTK, held in a set for fast membership tests
mystopwords = {word for word in stopwords.words('english')}

# create preprocess_text function
def preprocess_text(text):
    """Normalise one raw post into a cleaned, space-separated string of words.

    Steps, in order: lower-case and tokenize with ``TweetTokenizer`` (which
    strips @handles), drop the tokens found in ``mystopwords``, lemmatize
    each remaining token, re-join, then strip URLs, stand-alone numbers,
    tokens starting with ``#``/``@``/``&``, selected punctuation, tokens
    starting with a digit, and finally every remaining non-letter character.
    Words of a single letter are discarded at the end.

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (for example ``None`` or the float NaN
        of a missing cell) is treated as empty text.

    Returns
    -------
    str
        Words made only of ASCII letters, each at least two characters long,
        separated by single spaces; ``""`` when nothing survives the cleaning.
    """
    # Missing values would otherwise fail on .lower(); treat them as empty text
    if not isinstance(text, str):
        return ""

    # Tokenize the text & remove twitter accounts
    tknzr = TweetTokenizer(strip_handles=True)
    tokens = tknzr.tokenize(text.lower())

    # Remove stop words
    filtered_tokens = [token for token in tokens if token not in mystopwords]

    # Lemmatize the tokens
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in filtered_tokens]

    # Join the tokens back into a string
    processed_text = ' '.join(lemmatized_tokens)

    # remove urls
    processed_text = re.sub(r'(http|https)://[\S]+', '', processed_text)

    # remove numbers
    processed_text = re.sub(r'(?<!\S)(?=.)(0|([1-9](\d*|\d{0,2}(,\d{3})*)))?(\.\d*[1-9])?(?!\S)', '', processed_text)

    # remove special characters
    processed_text = re.sub(r'[#@&][\S]+', '', processed_text)
    processed_text = re.sub(r'[#@&$“”".,’]', '', processed_text)
    processed_text = re.sub(r'\b\d[\S]+', '', processed_text)
    processed_text = re.sub('[^a-zA-Z]', ' ', processed_text)

    # Only letters and spaces are left at this point. split() collapses any
    # run of whitespace, and the length filter removes single letters.
    # (A former "single letters" regex used '\\b' inside a raw string, which
    # matches a literal backslash followed by 'b' instead of a word boundary,
    # so it could never match; it has been removed without changing the output.)
    return " ".join(word for word in processed_text.split() if len(word) > 1)

# Run the cleaner over every row's text and store the cleaned string in 'tokenized'

twitter_df['tokenized'] = twitter_df['text'].map(preprocess_text)
twitter_df.head(n=20)


In [17]:
# Write the processed frame to disk. The row index is written as well,
# as the first (unnamed) column of the CSV.
twitter_df.to_csv(path_or_buf="data/twitter_all_data.csv")


In [18]:
# Sanity check: list any rows that still have no oh_label
twitter_df.loc[twitter_df['oh_label'].isna()]

Unnamed: 0,text,annotation,oh_label,cat_enc,ed_label_0,ed_label_1,hashtags,tokenized
