In [19]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import os
import csv
import time
import itertools
import seaborn as sns

import nltk
import nltk.corpus # sample text for performing tokenization
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from gensim.parsing.porter import PorterStemmer
from gensim.utils import simple_preprocess
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer



In [15]:
# Importing twitter dataset

twitter_df = pd.read_csv('data/twitter_all_data.csv',index_col=False)
twitter_df=twitter_df.fillna("")

twitter_df.head()

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,text,annotation,oh_label,cat_enc,ed_label_0,ed_label_1,hashtags,tokenized,tokenized_text,stemmed_tokens
0,0,0,0,"@g0ssipsquirrelx Wrong, ISIS follows the examp...",racism,1.0,1,,,[],wrong isi follows example mohammed quran exactly,"['wrong', 'isi', 'follows', 'example', 'mohamm...","['wrong', 'isi', 'follow', 'exampl', 'moham', ..."
1,1,1,1,@SirajZarook @OdiniaInvictus @BilalIGhumman @I...,racism,1.0,1,,,[],good muslim good despite bad religion,"['good', 'muslim', 'good', 'despite', 'bad', '...","['good', 'muslim', 'good', 'despit', 'bad', 'r..."
2,2,2,2,"@scamp_faridxx @AbuAlbaraaSham Yeah, it's call...",racism,1.0,1,,,[],yeah called caring human life idiot something ...,"['yeah', 'called', 'caring', 'human', 'life', ...","['yeah', 'call', 'care', 'human', 'life', 'idi..."
3,3,3,3,@Asadumarfans You are a Muslim. You are brain ...,racism,1.0,1,,,[],muslim brain dead repeat others said million time,"['muslim', 'brain', 'dead', 'repeat', 'others'...","['muslim', 'brain', 'dead', 'repeat', 'other',..."
4,4,4,4,@harmlesstree2 @MaxBlumenthal If you want to u...,racism,1.0,1,,,[],want understand lie muslim living peace jew re...,"['want', 'understand', 'lie', 'muslim', 'livin...","['want', 'understand', 'lie', 'muslim', 'live'..."


In [16]:
twitter_df[twitter_df['oh_label']=='']

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,text,annotation,oh_label,cat_enc,ed_label_0,ed_label_1,hashtags,tokenized,tokenized_text,stemmed_tokens


In [17]:
# Preparing the dataset

tweets = " ".join(tw for tw in twitter_df.tokenized)
all_sentences = nltk.sent_tokenize(tweets)

all_words = [nltk.word_tokenize(sent) for sent in all_sentences]

all_words[:10]
#stemmed_tokens = pd.Series(df['stemmed_tokens']).values


[['wrong',
  'isi',
  'follows',
  'example',
  'mohammed',
  'quran',
  'exactly',
  'good',
  'muslim',
  'good',
  'despite',
  'bad',
  'religion',
  'yeah',
  'called',
  'caring',
  'human',
  'life',
  'idiot',
  'something',
  'genocidal',
  'daesh',
  'would',
  'nt',
  'understand',
  'muslim',
  'brain',
  'dead',
  'repeat',
  'others',
  'said',
  'million',
  'time',
  'want',
  'understand',
  'lie',
  'muslim',
  'living',
  'peace',
  'jew',
  'read',
  'ibn',
  'warraq',
  'total',
  'liar',
  'like',
  'pedophile',
  'prophet',
  'un',
  'soldier',
  'burn',
  'people',
  'alive',
  'daesh',
  'shia',
  'militia',
  'driven',
  'religion',
  'hatred',
  'bigotry',
  'freedom',
  'tikrit',
  'regardless',
  'muslim',
  'world',
  'ever',
  'produced',
  'anything',
  'tyrant',
  'dictator',
  'fascist',
  'fanatic',
  'would',
  'support',
  'islam',
  'answer',
  'anything',
  'pretend',
  'answer',
  'illogical',
  'delusional',
  'superstition',
  'attacking',
  'e

In [20]:
# Tokenize the text column to get the new column 'tokenized_text'
twitter_df['tokenized_text'] = [simple_preprocess(line, deacc=True) for line in twitter_df['tokenized']] 
print(twitter_df['tokenized_text'].head(10))

porter_stemmer = PorterStemmer()
# Get the stemmed_tokens
twitter_df['stemmed_tokens'] = [[porter_stemmer.stem(word) for word in tokens] for tokens in twitter_df['tokenized_text'] ]
twitter_df['stemmed_tokens'].head(10)

0    [wrong, isi, follows, example, mohammed, quran...
1         [good, muslim, good, despite, bad, religion]
2    [yeah, called, caring, human, life, idiot, som...
3    [muslim, brain, dead, repeat, others, said, mi...
4    [want, understand, lie, muslim, living, peace,...
5    [total, liar, like, pedophile, prophet, un, so...
6    [daesh, shia, militia, driven, religion, hatre...
7    [muslim, world, ever, produced, anything, tyra...
8    [islam, answer, anything, pretend, answer, ill...
9    [attacking, everyone, follows, religious, cult...
Name: tokenized_text, dtype: object


0    [wrong, isi, follow, exampl, moham, quran, exa...
1          [good, muslim, good, despit, bad, religion]
2    [yeah, call, care, human, life, idiot, someth,...
3    [muslim, brain, dead, repeat, other, said, mil...
4    [want, understand, lie, muslim, live, peac, je...
5    [total, liar, like, pedophil, prophet, un, sol...
6    [daesh, shia, militia, driven, religion, hatr,...
7    [muslim, world, ever, produc, anyth, tyrant, d...
8    [islam, answer, anyth, pretend, answer, illog,...
9    [attack, everyon, follow, religi, cult, hate, ...
Name: stemmed_tokens, dtype: object

In [23]:
# Train and test set
X_train, X_test, y_train, y_test = train_test_split(twitter_df[['stemmed_tokens']], twitter_df['oh_label'], stratify=twitter_df['oh_label'], random_state=42)
X_train = X_train.reset_index()
X_test = X_test.reset_index()
y_train = y_train.to_frame()
y_train = y_train.reset_index()
y_test = y_test.to_frame()
y_test = y_test.reset_index()

In [33]:
# Save for later
X_train.to_csv("data/X_train.csv")
X_test.to_csv("data/X_test.csv")
y_train.to_csv("data/y_train.csv")
y_test.to_csv("data/y_test.csv")

In [25]:
#word2vec_model_file = "model/twitter_all_data.csv" + 'word2vec_' + '.model'
word2vec_model_file = "model/twitter_train_data.csv" + 'word2vec_' + '.model'

stemmed_tokens = pd.Series(X_train['stemmed_tokens']).values
stemmed_tokens

array([list(['hi', 'wifion', 'thank', 'much', 'patient', 'address', 'comment', 'sorri', 'get', 'back', 'earlier', 'understand', 'relev', 'guidelin', 'seem', 'wp', 'undu', 'content', 'mai', 'satisfi', 'requir', 'major', 'view', 'also', 'understand', 'point', 'involv', 'parti', 'court', 'case', 'becom', 'primari', 'sourc', 'topic', 'relat', 'case', 'per', 'notw', 'exampl', 'still', 'convinc', 'bar', 'bench', 'reliabl', 'sourc', 'll', 'try', 'look', 'bit', 'see', 'rational', 'us', 'particularli', 'case', 'blp', 'somewhat', 'specul', 'question', 'suppos', 'court', 'case', 'settl', 'suffici', 'medium', 'coverag', 'would', 'lawsuit', 'notabl', 'enough', 'articl', 'would', 'subject', 'blp', 'restrict', 'anywai', 'thank', 'help']),
       list(['serious', 'believ', 'episod', 'air', 'cancel', 'show', 'pleas', 'explain', 'known', 'provid', 'refer', 'verifi']),
       list(['good', 'point', 'expect', 'see', 'articl', 'delet', 'bother', 'clear', 'wikipedia', 'truli', 'repres', 'forum', 'taken', 'h

In [26]:
# Train the Word2Vec Model (Skip-gram model (sg = 1))
start_time = time.time()

w2v_model = Word2Vec(stemmed_tokens, vector_size=100,min_count = 1, workers = 4, window = 3, sg = 1)
print("Time taken to train word2vec model: " + str(time.time() - start_time))
w2v_model.save(word2vec_model_file)

Time taken to train word2vec model: 34.105093240737915


In [None]:
print(len(w2v_model.wv.key_to_index))
w2v_model.wv.key_to_index

In [28]:
stepped_list = list(itertools.chain.from_iterable(stemmed_tokens.tolist()))
len(set(stepped_list))

126579

In [29]:
print(w2v_model.wv.most_similar('religion'))
print(w2v_model.wv.most_similar('femin'))
print(w2v_model.wv.most_similar('peopl'))

[('religi', 0.8370547294616699), ('humanist', 0.8314415216445923), ('islam', 0.8253030776977539), ('creed', 0.8213331699371338), ('hinduism', 0.819883406162262), ('atheism', 0.8170770406723022), ('judaism', 0.8143739104270935), ('irreligion', 0.8069251775741577), ('monotheist', 0.8062925338745117), ('sikhism', 0.8054744601249695)]
[('feminist', 0.8098196983337402), ('egalitarian', 0.7891940474510193), ('paglia', 0.7797747850418091), ('nambla', 0.7643048167228699), ('overtli', 0.7521183490753174), ('conservat', 0.7409563064575195), ('movement', 0.7394248247146606), ('persuas', 0.7275168299674988), ('paranorm', 0.7261168360710144), ('subcultur', 0.723408043384552)]
[('other', 0.7898426651954651), ('fervent', 0.7837676405906677), ('instinct', 0.783477783203125), ('terroris', 0.7691183090209961), ('profoundli', 0.7682755589485168), ('fierc', 0.7667680978775024), ('pitchfork', 0.763868510723114), ('shun', 0.7602774500846863), ('deepak', 0.7602328658103943), ('appeas', 0.7536629438400269)]


In [30]:
twitter_df.head()

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,text,annotation,oh_label,cat_enc,ed_label_0,ed_label_1,hashtags,tokenized,tokenized_text,stemmed_tokens
0,0,0,0,"@g0ssipsquirrelx Wrong, ISIS follows the examp...",racism,1.0,1,,,[],wrong isi follows example mohammed quran exactly,"[wrong, isi, follows, example, mohammed, quran...","[wrong, isi, follow, exampl, moham, quran, exa..."
1,1,1,1,@SirajZarook @OdiniaInvictus @BilalIGhumman @I...,racism,1.0,1,,,[],good muslim good despite bad religion,"[good, muslim, good, despite, bad, religion]","[good, muslim, good, despit, bad, religion]"
2,2,2,2,"@scamp_faridxx @AbuAlbaraaSham Yeah, it's call...",racism,1.0,1,,,[],yeah called caring human life idiot something ...,"[yeah, called, caring, human, life, idiot, som...","[yeah, call, care, human, life, idiot, someth,..."
3,3,3,3,@Asadumarfans You are a Muslim. You are brain ...,racism,1.0,1,,,[],muslim brain dead repeat others said million time,"[muslim, brain, dead, repeat, others, said, mi...","[muslim, brain, dead, repeat, other, said, mil..."
4,4,4,4,@harmlesstree2 @MaxBlumenthal If you want to u...,racism,1.0,1,,,[],want understand lie muslim living peace jew re...,"[want, understand, lie, muslim, living, peace,...","[want, understand, lie, muslim, live, peac, je..."


In [31]:
twitter_df.to_csv("data/twitter_all_data.csv")

In [32]:
twitter_df[twitter_df['oh_label'].isna()]

Unnamed: 0.3,Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,text,annotation,oh_label,cat_enc,ed_label_0,ed_label_1,hashtags,tokenized,tokenized_text,stemmed_tokens
