In [26]:
# Importing necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import re
import os
import csv
import time
import itertools
import seaborn as sns

import nltk
import nltk.corpus # sample text for performing tokenization
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.tokenize import TweetTokenizer
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from gensim.parsing.porter import PorterStemmer
from gensim.utils import simple_preprocess
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer



In [27]:
# Importing twitter dataset

twitter_df = pd.read_csv('data/twitter_all_data.csv',index_col=False)
twitter_df=twitter_df.fillna("")

twitter_df.tail()

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,text,annotation,oh_label,cat_enc,ed_label_0,ed_label_1,hashtags,tokenized,tokenized_text,stemmed_tokens
219585,219585,219585,` These sources don't exactly exude a sense ...,none,0.0,0,0.888889,0.111111,[],source exactly exude sense impartiality newswe...,"['source', 'exactly', 'exude', 'sense', 'impar...","['sourc', 'exactli', 'exud', 'sens', 'imparti'..."
219586,219586,219586,The Institute for Historical Review is a pee...,none,0.0,0,0.9,0.1,[],institute historical review peer reviewed jour...,"['institute', 'historical', 'review', 'peer', ...","['institut', 'histor', 'review', 'peer', 'revi..."
219587,219587,219587,:The way you're trying to describe it in this...,none,0.0,0,1.0,0.0,[],way trying describe article serious step where...,"['way', 'trying', 'describe', 'article', 'seri...","['wai', 'try', 'describ', 'articl', 'seriou', ..."
219588,219588,219588,== Warning == There is clearly a protection...,none,0.0,0,0.8,0.2,[],warning clearly protectionist regime going wit...,"['warning', 'clearly', 'protectionist', 'regim...","['warn', 'clearli', 'protectionist', 'regim', ..."
219589,219589,219589,Alternate option=== Is there perhaps enough ne...,none,0.0,0,1.0,0.0,[],alternate option perhaps enough newsworthy inf...,"['alternate', 'option', 'perhaps', 'enough', '...","['altern', 'option', 'perhap', 'enough', 'news..."


In [28]:
# Check if all data is labelled
twitter_df[twitter_df['oh_label']=='']

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,text,annotation,oh_label,cat_enc,ed_label_0,ed_label_1,hashtags,tokenized,tokenized_text,stemmed_tokens


In [29]:
# Preparing the dataset

tweets = " ".join(tw for tw in twitter_df.tokenized)
all_sentences = nltk.sent_tokenize(tweets)

all_words = [nltk.word_tokenize(sent) for sent in all_sentences]

all_words[:1]


[['wrong',
  'isi',
  'follows',
  'example',
  'mohammed',
  'quran',
  'exactly',
  'good',
  'muslim',
  'good',
  'despite',
  'bad',
  'religion',
  'yeah',
  'called',
  'caring',
  'human',
  'life',
  'idiot',
  'something',
  'genocidal',
  'daesh',
  'would',
  'nt',
  'understand',
  'muslim',
  'brain',
  'dead',
  'repeat',
  'others',
  'said',
  'million',
  'time',
  'want',
  'understand',
  'lie',
  'muslim',
  'living',
  'peace',
  'jew',
  'read',
  'ibn',
  'warraq',
  'total',
  'liar',
  'like',
  'pedophile',
  'prophet',
  'un',
  'soldier',
  'burn',
  'people',
  'alive',
  'daesh',
  'shia',
  'militia',
  'driven',
  'religion',
  'hatred',
  'bigotry',
  'freedom',
  'tikrit',
  'regardless',
  'muslim',
  'world',
  'ever',
  'produced',
  'anything',
  'tyrant',
  'dictator',
  'fascist',
  'fanatic',
  'would',
  'support',
  'islam',
  'answer',
  'anything',
  'pretend',
  'answer',
  'illogical',
  'delusional',
  'superstition',
  'attacking',
  'e

In [30]:
# Tokenize the text column to get the new column 'tokenized_text'
twitter_df['tokenized_text'] = [simple_preprocess(line, deacc=True) for line in twitter_df['tokenized']] 
print(twitter_df['tokenized_text'].tail(10))

porter_stemmer = PorterStemmer()
# Get the stemmed_tokens
twitter_df['stemmed_tokens'] = [[porter_stemmer.stem(word) for word in tokens] for tokens in twitter_df['tokenized_text'] ]
twitter_df['stemmed_tokens'].tail(10)

219580    [lead, original, research, proper, citation, g...
219581                                 [well, done, thanks]
219582    [talking, making, unjustified, major, change, ...
219583    [yes, word, guci, puci, meaning, flash, flashl...
219584    [comment, gentleman, article, provides, insigh...
219585    [source, exactly, exude, sense, impartiality, ...
219586    [institute, historical, review, peer, reviewed...
219587    [way, trying, describe, article, serious, step...
219589    [alternate, option, perhaps, enough, newsworth...
Name: tokenized_text, dtype: object


219580    [lead, origin, research, proper, citat, gamerg...
219581                                  [well, done, thank]
219582    [talk, make, unjustifi, major, chang, edit, re...
219583    [ye, word, guci, puci, mean, flash, flashlight...
219584    [comment, gentleman, articl, provid, insight, ...
219585    [sourc, exactli, exud, sens, imparti, newsweek...
219586    [institut, histor, review, peer, review, journ...
219587    [wai, try, describ, articl, seriou, step, wher...
219588    [warn, clearli, protectionist, regim, go, witc...
219589    [altern, option, perhap, enough, newsworthi, i...
Name: stemmed_tokens, dtype: object

In [31]:
# Train and test set
X_train, X_test, y_train, y_test = train_test_split(twitter_df[['stemmed_tokens']], twitter_df['oh_label'], stratify=twitter_df['oh_label'], random_state=42)
X_train = X_train.reset_index()
X_test = X_test.reset_index()
y_train = y_train.to_frame()
y_train = y_train.reset_index()
y_test = y_test.to_frame()
y_test = y_test.reset_index()

In [32]:
# Save for later
X_train.to_csv("data/X_train.csv")
X_test.to_csv("data/X_test.csv")
y_train.to_csv("data/y_train.csv")
y_test.to_csv("data/y_test.csv")

In [33]:
# Load the model from the model file
word2vec_model_file = "model/twitter_train_data.csv" + 'word2vec_' + '.model'

stemmed_tokens = pd.Series(X_train['stemmed_tokens']).values
stemmed_tokens

array([list(['quit', 'alright', 'thank', 'pleas', 'excus', 'blunt', 'style', 'argument', 're', 'ction', 'other', 'attitud', 'noth', 'person', 'practic', 'grammat', 'would', 'cours', 'better', 'avoid', 'point', 'conflict', 'disagr', 'campi', 'pre', 'mptive', 'resolv', 'avoid', 'plural', 'opt', 'compromis', 'spell', 'pre', 'emptiv', 'instead', 'preemptiv', 'pre', 'mptive', 'dichotomi', 'respect', 'thank', 'civil']),
       list(['wikipedia', 'censorship', 'receiv', 'zero', 'fund']),
       list(['homophobia', 'section', 'butt', 'homophob', 'joke', 'dress', 'sens', 'high', 'pitch', 'voic']),
       ...,
       list(['bad', 'faith', 'troll', 'oppos', 'good', 'faith', 'troll', 'lol']),
       list(['notabl', 'refrenc', 'need', 'link', 'site', 'charact', 'list', 'playabl', 'case', 'unlimit', 'code', 'tiger', 'colosseum', 'tiger', 'colosseum', 'upper', 'refer', 'articl', 'alreadi', 'wikipedia', 'lancer', 'els', 'would', 'want', 'see']),
       list(['nice', 'nice', 'look', 'like', 'us', 'time

In [34]:
# Train the Word2Vec Model on X train data (Skip-gram model (sg = 1))
start_time = time.time()

w2v_model = Word2Vec(stemmed_tokens, vector_size=100,min_count = 1, workers = 4, window = 3, sg = 1)
print("Time taken to train word2vec model: " + str(time.time() - start_time))
w2v_model.save(word2vec_model_file)

Time taken to train word2vec model: 29.9097900390625


In [35]:
print(len(w2v_model.wv.key_to_index))
w2v_model.wv.key_to_index

125748


{'articl': 0,
 'page': 1,
 'wikipedia': 2,
 'edit': 3,
 'us': 4,
 'like': 5,
 'on': 6,
 'would': 7,
 'pleas': 8,
 'delet': 9,
 'talk': 10,
 'thank': 11,
 'sourc': 12,
 'see': 13,
 'think': 14,
 'make': 15,
 'know': 16,
 'go': 17,
 'also': 18,
 'time': 19,
 'peopl': 20,
 'get': 21,
 'sai': 22,
 'fuck': 23,
 'block': 24,
 'need': 25,
 'imag': 26,
 'mai': 27,
 'name': 28,
 'remov': 29,
 'want': 30,
 'link': 31,
 'person': 32,
 'user': 33,
 'look': 34,
 'help': 35,
 'work': 36,
 'inform': 37,
 'even': 38,
 'good': 39,
 'new': 40,
 'refer': 41,
 'list': 42,
 'well': 43,
 'comment': 44,
 'wai': 45,
 'chang': 46,
 'could': 47,
 'discuss': 48,
 'question': 49,
 'ad': 50,
 'section': 51,
 'point': 52,
 'editor': 53,
 'thing': 54,
 'take': 55,
 'read': 56,
 'first': 57,
 'wp': 58,
 'vandal': 59,
 'fact': 60,
 'seem': 61,
 'right': 62,
 'find': 63,
 'state': 64,
 'style': 65,
 'reason': 66,
 'revert': 67,
 've': 68,
 'place': 69,
 'ask': 70,
 'mani': 71,
 'try': 72,
 'made': 73,
 'much': 74,
 're

In [36]:
stepped_list = list(itertools.chain.from_iterable(stemmed_tokens.tolist()))
len(set(stepped_list))

125748

In [37]:
print(w2v_model.wv.most_similar('religion'))
print(w2v_model.wv.most_similar('femin'))
print(w2v_model.wv.most_similar('peopl'))

[('religi', 0.8389027714729309), ('hinduism', 0.8235896825790405), ('judaism', 0.8191343545913696), ('creed', 0.8161622881889343), ('humanist', 0.8136708736419678), ('islam', 0.8129653930664062), ('monotheist', 0.811299204826355), ('irreligion', 0.8093090057373047), ('sikhism', 0.807894229888916), ('dharmic', 0.7960187792778015)]
[('feminist', 0.8195350170135498), ('conservat', 0.781833291053772), ('paglia', 0.7771574854850769), ('gymnast', 0.7735275626182556), ('egalitarian', 0.7586634755134583), ('heterosexu', 0.7565826773643494), ('nuditi', 0.7550978064537048), ('nambla', 0.7531779408454895), ('intercours', 0.7511581778526306), ('vibrant', 0.7492883801460266)]
[('other', 0.803199827671051), ('instinct', 0.7902354001998901), ('ingrain', 0.7806429862976074), ('werent', 0.7800484299659729), ('profoundli', 0.7774443626403809), ('wouldnt', 0.774388313293457), ('resent', 0.7698257565498352), ('becuas', 0.7697687745094299), ('mingl', 0.7695815563201904), ('sudhan', 0.7665708661079407)]


In [38]:
twitter_df.to_csv("data/twitter_all_data.csv")