In [1]:
import pandas as pd
import numpy as np
# Preprocessing
import preprocessor as p
import string
import contractions
from collections import defaultdict
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk import pos_tag
from nltk.corpus import stopwords
# Model
from sklearn import model_selection, svm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
# Pipeline
from sklearn.pipeline import make_pipeline
from joblib import dump

In [2]:
# Download requires corpus
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\LANCE\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\LANCE\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\LANCE\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\LANCE\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LANCE\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [3]:
# Read data
df = pd.read_csv("../res/preprocessed_dataset4.csv")
df.rename(columns={"tweet_text":"text"}, inplace=True)
df.head()

Unnamed: 0,text,Homophobe,NotHate,OtherHate,Racist,Religion,Sexist
0,@FriskDontMiss Nigga https://t.co/cAsaLWEpue,1,0,0,1,1,0
1,My horses are retarded https://t.co/HYhqc6d5WN,0,0,1,0,0,0
2,“NIGGA ON MA MOMMA YOUNGBOY BE SPITTING REAL S...,0,1,0,0,0,0
3,RT xxSuGVNGxx: I ran into this HOLY NIGGA TODA...,0,1,0,1,0,0
4,“EVERYbody calling you Nigger now!” https://t....,0,1,0,1,0,0


In [4]:
# FOR TRIAL AND ERROR PURPOSES
# TODO: Remove when training for final model
# DF is split to reduce training time for trial and error purposes
df = df.iloc[:5000,:]
len(df)

5000

In [5]:
# Clean text with twitter-preprocessor
clean = []
for i,v in enumerate(df['text']):
    clean.append(p.clean(v))
df["text"] = clean

# Convert to lower case
df["text"] = [entry.lower() for entry in df["text"]]

# Expand Contractions
def expand_contractions(s):
    expanded = []
    for word in s.split():
        expanded.append(contractions.fix(word))
    return(' '.join(expanded))

df["text"] = [expand_contractions(entry) for entry in df['text']]

# Remove punctuation marks
df["text"] = [entry.translate(str.maketrans('', '', string.punctuation)) for entry in df["text"]]

# Tokenization
df['text']= [word_tokenize(entry) for entry in df['text']]

In [6]:
# TODO: Experiment if NOT removing stop words will improve model accuracy

# Remove Stop words, Non-Numeric and perfom Word Stemming/Lemmenting.
# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
for index,entry in enumerate(df['text']):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'text_final'
    df.loc[index,'text_final'] = str(Final_words)

In [7]:
df.head()

Unnamed: 0,text,Homophobe,NotHate,OtherHate,Racist,Religion,Sexist,text_final
0,[nigga],1,0,0,1,1,0,['nigga']
1,"[my, horses, are, retarded]",0,0,1,0,0,0,"['horse', 'retard']"
2,"[nigga, on, ma, momma, youngboy, be, spitting,...",0,1,0,0,0,0,"['nigga', 'momma', 'youngboy', 'spit', 'real',..."
3,"[xxsugvngxx, i, ran, into, this, holy, nigga, ...",0,1,0,1,0,0,"['xxsugvngxx', 'run', 'holy', 'nigga', 'today']"
4,"[everybody, calling, you, nigger, now]",0,1,0,1,0,0,"['everybody', 'call', 'nigger']"


Reference for multilabel preprocessing: https://www.section.io/engineering-education/multi-label-classification-with-scikit-multilearn/

In [8]:
# Get labels
y = df[["Homophobe", "NotHate", "OtherHate", "Racist", "Religion", "Sexist"]]
# Split train - test; 70/30
train_x, test_x, train_y, test_y = model_selection.train_test_split(df['text_final'],y,test_size=0.3)

<H1>Creation of Model </H1>

In [9]:
# pip install scikit-multilearn
from skmultilearn.problem_transform import BinaryRelevance
from sklearn.multiclass import OneVsRestClassifier
from skmultilearn.problem_transform import ClassifierChain
from skmultilearn.problem_transform import LabelPowerset

<h4>BinaryRevelance</h4>

In [10]:
br_classifier = make_pipeline(
    TfidfVectorizer(),
    BinaryRelevance(svm.SVC(kernel='linear', probability=True))
)
br_classifier = br_classifier.fit(X=train_x, y=train_y)
br_classifier

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('binaryrelevance',
                 BinaryRelevance(classifier=SVC(kernel='linear',
                                                probability=True),
                                 require_dense=[True, True]))])

In [11]:
predictions_br = br_classifier.predict(test_x)
print("SVM Accuracy Score -> ",accuracy_score(predictions_br, test_y))

SVM Accuracy Score ->  0.40066666666666667


<h4>One vs Rest</h4>

In [12]:
ovr_classifier = make_pipeline(
    TfidfVectorizer(),
    OneVsRestClassifier(svm.SVC(kernel='linear', probability=True))
)
ovr_classifier = ovr_classifier.fit(X=train_x, y=train_y)
ovr_classifier

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('onevsrestclassifier',
                 OneVsRestClassifier(estimator=SVC(kernel='linear',
                                                   probability=True)))])

In [13]:
predictions_ovr = ovr_classifier.predict(test_x)
print("SVM Accuracy Score -> ",accuracy_score(predictions_ovr, test_y))

SVM Accuracy Score ->  0.40066666666666667


<h4>Classifer Chain</h4>

In [14]:
cf_classifier = make_pipeline(
    TfidfVectorizer(),
    ClassifierChain(svm.SVC(kernel='linear', probability=True))
)
cf_classifier = cf_classifier.fit(X=train_x, y=train_y)
cf_classifier

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('classifierchain',
                 ClassifierChain(classifier=SVC(kernel='linear',
                                                probability=True),
                                 require_dense=[True, True]))])

In [15]:
predictions_cf = cf_classifier.predict(test_x)
print("SVM Accuracy Score -> ",accuracy_score(predictions_cf, test_y))

SVM Accuracy Score ->  0.4013333333333333


<h4>Label Powerset</h4> 

In [16]:
lp_classifier = make_pipeline(
    TfidfVectorizer(),
    LabelPowerset(svm.SVC(kernel='linear', probability=True))
)
lp_classifier = lp_classifier.fit(X=train_x, y=train_y)
lp_classifier

Pipeline(steps=[('tfidfvectorizer', TfidfVectorizer()),
                ('labelpowerset',
                 LabelPowerset(classifier=SVC(kernel='linear',
                                              probability=True),
                               require_dense=[True, True]))])

In [17]:
predictions_lp = lp_classifier.predict(test_x)
print("SVM Accuracy Score -> ",accuracy_score(predictions_lp, test_y))

SVM Accuracy Score ->  0.38466666666666666


In [18]:
classifer_names = ["BinaryRelevance", "OneVSRest", "ClassifierChain", "LabelPowerset"]
classifiers = [br_classifier, ovr_classifier, cf_classifier, lp_classifier]

<h2>Make a Single prediction</h2>

In [19]:
# Create preprocessing function
def preprocess(text):
    # twitter-presprocessor
    text = p.clean(text)
    # lower case
    text = text.lower()
    # Expand Contractions
    expanded = []
    for word in text.split():
        expanded.append(contractions.fix(word))
    text = ' '.join(expanded)
    # Remove punctuation marks
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenization
    text = word_tokenize(text)
    
    lemmanized = preprocess_part2(text)

    # return str(text)
    return lemmanized

def preprocess_part2(text):
    tag_map = defaultdict(lambda : wn.NOUN)
    tag_map['J'] = wn.ADJ
    tag_map['V'] = wn.VERB
    tag_map['R'] = wn.ADV
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(text):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    return str(Final_words)


In [20]:
text1 = "Girl bitches should just kill themselves if they don't know how to cook"
text2 = "gay people are mentally ill"
text3 = "Islam people are all terrorists"
text4 = "Asians should just go back to their country, all they do is take our jobs"
text5 = "If you want to make the world a better place, look at yourself and make a change"
sample_texts = [text1, text2, text3, text4, text5]

for text in sample_texts:
    arr = [preprocess(text)]
    classes = ovr_classifier.classes_
    proba = ovr_classifier.predict_proba(arr)[0]
    pred = ovr_classifier.predict(arr)[0]

    headers = y.columns

    print(f'INPUT: {text}')
    for i,e in enumerate(classes):
        print(f'  {headers[i]}: {proba[i]*100}')
    print(f'PREDICTION: {pred}\n')

INPUT: Girl bitches should just kill themselves if they don't know how to cook
  Homophobe: 3.417265185378283
  NotHate: 95.08715609313548
  OtherHate: 15.957655844436799
  Racist: 27.252884323461775
  Religion: 1.5984497986154251
  Sexist: 14.28566715101337
PREDICTION: [0 1 0 0 0 0]

INPUT: gay people are mentally ill
  Homophobe: 45.41112484013117
  NotHate: 90.52057736482844
  OtherHate: 50.74865019187449
  Racist: 25.283556688546444
  Religion: 1.4666829793156884
  Sexist: 10.843994189985157
PREDICTION: [1 1 1 0 0 0]

INPUT: Islam people are all terrorists
  Homophobe: 6.697683877919598
  NotHate: 94.36657668527072
  OtherHate: 11.51885068948646
  Racist: 30.595237924415574
  Religion: 45.29051010145928
  Sexist: 7.929021617020744
PREDICTION: [0 1 0 0 0 0]

INPUT: Asians should just go back to their country, all they do is take our jobs
  Homophobe: 2.409895364250828
  NotHate: 93.27200330271448
  OtherHate: 12.963688800543943
  Racist: 45.17053790875567
  Religion: 1.4732540693644

In [23]:
for i,classifier in enumerate(classifiers):
    print(f"------------ {classifer_names[i]} ------------\n")
    for text in sample_texts:
        arr = [preprocess(text)]
        classes = classifier.classes_
        proba = classifier.predict_proba(arr)[0]
        pred = classifier.predict(arr)[0]

        headers = y.columns

        print(f'INPUT: {text}')
        for i,e in enumerate(classes):
            print(f'  {headers[i]}: {proba[i]*100}')
        print(f'PREDICTION: {pred}\n')

------------ BinaryRelevance ------------



AttributeError: 'BinaryRelevance' object has no attribute 'classes_'

<h2>Export Classifier Models</h2>

In [25]:
# Export model as joblib file
for i,classifier in enumerate(classifiers):
    filename = classifer_names[i] + ".joblib.z"
    path = "../out/subtask3/" + filename
    dump(classifier, path)

<H2>Load Models</H2>

In [35]:
from joblib import load 

In [61]:
text1 = "Girl bitches should just kill themselves if they don't know how to cook"
text2 = "gay people are mentally ill"
text3 = "Islam people are all terrorists"
text4 = "Asians should just go back to their country, all they do is take our jobs"
text5 = "If you want to make the world a better place, look at yourself and make a change"
text6 = "The mentally ill should seek help no cap"
sample_texts = [text1, text2, text3, text4, text5, text6]

In [53]:
# Initialize
path = "../out/subtask3/"
extension = ".joblib.z"
br_classifier = load(path + "BinaryRelevance" + extension)
ovr_classifier = load(path + "OneVSRest" + extension)
cf_classifier = load(path + "ClassifierChain" + extension)
lp_classifier = load(path + "LabelPowerset" + extension)

In [62]:
classifer_names = ["BinaryRelevance", "ClassifierChain", "LabelPowerset"]
classifiers = [br_classifier, cf_classifier, lp_classifier]
headers = y.columns

In [63]:
for i,classifier in enumerate(classifiers):
    print(f"------------ {classifer_names[i]} ------------\n")
    for text in sample_texts:
        arr = [preprocess(text)]
        proba = classifier.predict_proba(arr).toarray()[0]
        pred = classifier.predict(arr).toarray()[0]

        print(f'INPUT: {text}')
        for i,e in enumerate(proba):
            print(f'  {headers[i]}: {e*100}')
        print(f'PREDICTION: {pred}\n')

------------ BinaryRelevance ------------

INPUT: Girl bitches should just kill themselves if they don't know how to cook
  Homophobe: 3.2266965875375706
  NotHate: 95.10907093835792
  OtherHate: 15.938275183210504
  Racist: 26.94560942783787
  Religion: 1.5721904256209234
  Sexist: 14.230099565607837
PREDICTION: [0 1 0 0 0 0]

INPUT: gay people are mentally ill
  Homophobe: 47.723661884801714
  NotHate: 90.20893668375912
  OtherHate: 53.00006168680037
  Racist: 25.009825716168443
  Religion: 1.4332809260186665
  Sexist: 10.687020317583498
PREDICTION: [1 1 1 0 0 0]

INPUT: Islam people are all terrorists
  Homophobe: 6.562874384697421
  NotHate: 94.34908851227136
  OtherHate: 11.297616861267496
  Racist: 30.231812673323017
  Religion: 50.0
  Sexist: 7.717969503386206
PREDICTION: [0 1 0 0 0 0]

INPUT: Asians should just go back to their country, all they do is take our jobs
  Homophobe: 2.236149474555906
  NotHate: 93.1774648952431
  OtherHate: 12.799872117093495
  Racist: 44.5940430756