In [18]:
import pandas as pd
import numpy as np
# Preprocessing
import preprocessor as p
import string
import contractions
from collections import defaultdict
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk import pos_tag
from nltk.corpus import stopwords
# Model
from sklearn import model_selection, svm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
# Pipeline
from sklearn.pipeline import make_pipeline
from joblib import dump

In [19]:
# Download the required NLTK corpora/models used by the preprocessing cells
# below (tokenizer, WordNet lemmatizer, POS tagger and stop-word list).
required_nltk_packages = (
    'punkt',
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger',
    'stopwords',
)
download_results = [nltk.download(package) for package in required_nltk_packages]
# Echo the status of the last download as the cell's displayed value.
download_results[-1]

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\cgab\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\cgab\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\cgab\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\cgab\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cgab\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [20]:
# Load the preprocessed dataset and expose the tweet body under the shorter
# column name "text", which is the name the later cells read from.
df = pd.read_csv("../res/preprocessed_dataset4.csv").rename(columns={"tweet_text": "text"})
df.head()

Unnamed: 0,text,Homophobe,NotHate,OtherHate,Racist,Religion,Sexist
0,@FriskDontMiss Nigga https://t.co/cAsaLWEpue,1,0,0,1,1,0
1,My horses are retarded https://t.co/HYhqc6d5WN,0,0,1,0,0,0
2,“NIGGA ON MA MOMMA YOUNGBOY BE SPITTING REAL S...,0,1,0,0,0,0
3,RT xxSuGVNGxx: I ran into this HOLY NIGGA TODA...,0,1,0,1,0,0
4,“EVERYbody calling you Nigger now!” https://t....,0,1,0,1,0,0


In [21]:
# FOR TRIAL AND ERROR PURPOSES
# TODO: Remove when training for final model
# Only the first 5000 rows are kept to reduce training time while experimenting.
# .copy() detaches the subset from the full frame: the following cells assign
# new values to columns of `df`, and doing that on a bare slice is the pattern
# pandas flags with SettingWithCopyWarning.
df = df.iloc[:5000, :].copy()
len(df)

5000

In [22]:
# Run every tweet through the twitter-preprocessor cleaner (`p.clean`).
clean = [p.clean(tweet) for tweet in df['text']]

# Lower-case the cleaned tweets and store them back in the "text" column.
df["text"] = [cleaned_tweet.lower() for cleaned_tweet in clean]

# Expand Contractions
def expand_contractions(s):
    """Expand contractions in ``s`` one whitespace-separated word at a time.

    Each word produced by ``s.split()`` is passed through
    ``contractions.fix`` and the results are re-joined with single spaces,
    so any run of whitespace in the input collapses to one space.

    Parameters
    ----------
    s : str
        Text to process.

    Returns
    -------
    str
        The text with every word replaced by its ``contractions.fix`` result.
    """
    return ' '.join(contractions.fix(token) for token in s.split())

# Expand contractions in every tweet using the helper defined above
df["text"] = [expand_contractions(entry) for entry in df['text']]

# Remove punctuation marks.
# The translation table only depends on string.punctuation, so it is built
# once here instead of being rebuilt for every row.
punctuation_table = str.maketrans('', '', string.punctuation)
df["text"] = [entry.translate(punctuation_table) for entry in df["text"]]

# Tokenization: each tweet becomes a list of word tokens
df['text'] = [word_tokenize(entry) for entry in df['text']]

In [23]:
# TODO: Experiment if NOT removing stop words will improve model accuracy

# Remove stop words and non-alphabetic tokens, then lemmatize what is left.
# WordNetLemmatizer needs a POS tag to know whether a word is a noun, verb,
# adjective, etc. Any tag not listed below falls back to noun.
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV

# Loop-invariant helpers, created once instead of once per token / per row:
# - the stop-word list is materialised as a set for O(1) membership tests
# - a single lemmatizer instance is shared by all rows
stop_words = frozenset(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()


def lemmatize_tokens(tokens):
    """Return the lemmas of the alphabetic, non-stop-word tokens in ``tokens``.

    Parameters
    ----------
    tokens : list of str
        Tokens of a single tweet (one entry of ``df['text']``).

    Returns
    -------
    list of str
        One lemma per kept token, in the original order. The first letter of
        the tag returned by ``pos_tag`` selects the WordNet POS via ``tag_map``.
    """
    return [
        lemmatizer.lemmatize(word, tag_map[tag[0]])
        for word, tag in pos_tag(tokens)
        if word not in stop_words and word.isalpha()
    ]


# Build the whole column in one go and assign it positionally. This avoids
# writing cell by cell through df.loc with an enumerate() counter, which is
# only correct when the index labels happen to be 0..n-1.
# Each row is stored as the string form of its lemma list, as before.
df['text_final'] = [str(lemmatize_tokens(entry)) for entry in df['text']]

In [25]:
# Preview the first rows to compare the tokenized "text" column with the
# lemmatized "text_final" column produced by the previous cell.
df.head()

Unnamed: 0,text,Homophobe,NotHate,OtherHate,Racist,Religion,Sexist,text_final
0,[nigga],1,0,0,1,1,0,['nigga']
1,"[my, horses, are, retarded]",0,0,1,0,0,0,"['horse', 'retard']"
2,"[nigga, on, ma, momma, youngboy, be, spitting,...",0,1,0,0,0,0,"['nigga', 'momma', 'youngboy', 'spit', 'real',..."
3,"[xxsugvngxx, i, ran, into, this, holy, nigga, ...",0,1,0,1,0,0,"['xxsugvngxx', 'run', 'holy', 'nigga', 'today']"
4,"[everybody, calling, you, nigger, now]",0,1,0,1,0,0,"['everybody', 'call', 'nigger']"


Reference for multilabel preprocessing: https://www.section.io/engineering-education/multi-label-classification-with-scikit-multilearn/

In [32]:
# Get labels: one binary indicator column per class (multilabel target)
y = df[["Homophobe", "NotHate", "OtherHate", "Racist", "Religion", "Sexist"]]
# Split train - test; 70/30
# random_state pins the shuffle so the split, and therefore every score
# reported below, is reproducible across kernel restarts.
train_x, test_x, train_y, test_y = model_selection.train_test_split(
    df['text_final'], y, test_size=0.3, random_state=42
)

<H1>Creation of Model </H1>

In [47]:
# pip install scikit-multilearn
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.multiclass import OneVsRestClassifier
from skmultilearn.problem_transform import ClassifierChain

<h4>BinaryRelevance</h4>

In [35]:
# TF-IDF features feeding a BinaryRelevance wrapper around a linear-kernel SVC;
# the pipeline is built and fitted on the training split in one expression.
clf = make_pipeline(
    TfidfVectorizer(max_features=200000),
    BinaryRelevance(svm.SVC(kernel='linear', probability=True)),
).fit(X=train_x, y=train_y)

In [36]:
# Predict the label indicators for the held-out test tweets with the
# BinaryRelevance pipeline fitted above.
predictions = clf.predict(test_x)

In [46]:
# accuracy_score's signature is (y_true, y_pred): ground truth goes first.
# With multilabel targets this is subset accuracy, i.e. a sample only counts
# as correct when all of its labels are predicted correctly.
print("SVM Accuracy Score -> ",accuracy_score(test_y, predictions))

SVM Accuracy Score ->  0.416


<h4>One vs Rest</h4>

In [50]:
# TF-IDF features feeding a OneVsRestClassifier around a linear-kernel SVC,
# built and fitted on the training split in one expression.
ovr_classifier = make_pipeline(
    TfidfVectorizer(max_features=200000),
    OneVsRestClassifier(svm.SVC(kernel='linear', probability=True)),
).fit(X=train_x, y=train_y)
# `over_classifier` is a second name bound to the same fitted pipeline.
over_classifier = ovr_classifier
over_classifier

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(max_features=200000)),
                ('onevsrestclassifier',
                 OneVsRestClassifier(estimator=SVC(kernel='linear',
                                                   probability=True)))])

In [51]:
# Predict on the held-out tweets with the One-vs-Rest pipeline.
predictions_ovr = ovr_classifier.predict(test_x)
# accuracy_score's signature is (y_true, y_pred): ground truth goes first.
# With multilabel targets this is subset accuracy (all labels must match).
print("SVM Accuracy Score -> ",accuracy_score(test_y, predictions_ovr))

SVM Accuracy Score ->  0.416


<h4>Classifier Chain</h4>

In [55]:
# TF-IDF features feeding a ClassifierChain around a linear-kernel SVC,
# built and fitted on the training split in one expression.
cf_classifier = make_pipeline(
    TfidfVectorizer(max_features=200000),
    ClassifierChain(svm.SVC(kernel='linear', probability=True)),
).fit(X=train_x, y=train_y)
cf_classifier

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer(max_features=200000)),
                ('classifierchain',
                 ClassifierChain(classifier=SVC(kernel='linear',
                                                probability=True),
                                 require_dense=[True, True]))])

In [56]:
# Predict on the held-out tweets with the Classifier Chain pipeline.
predictions_cf = cf_classifier.predict(test_x)
# accuracy_score's signature is (y_true, y_pred): ground truth goes first.
# With multilabel targets this is subset accuracy (all labels must match).
print("SVM Accuracy Score -> ",accuracy_score(test_y, predictions_cf))

SVM Accuracy Score ->  0.41733333333333333
