Reference:

Tweet preprocessing: https://towardsdatascience.com/basic-tweet-preprocessing-in-python-efd8360d529e

Preprocessing: https://medium.com/@bedigunjit/simple-guide-to-text-classification-nlp-using-svm-and-naive-bayes-with-python-421db3a72d34

In [231]:
import pandas as pd
import numpy as np
# Preprocessing
import preprocessor as p
import string
import contractions
from collections import defaultdict
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet as wn
from nltk import pos_tag
from nltk.corpus import stopwords
# Model
from sklearn import model_selection, svm
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score
# Pipeline
from sklearn.pipeline import make_pipeline
from joblib import dump

In [232]:
# Download requires corpus
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\cgab\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\cgab\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\cgab\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\cgab\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\cgab\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

<H2>PREPROCESSING</H2>

In [233]:
# Read data
df = pd.read_csv("../res/HASOC_compiled.csv")
# Drop irrelevant features
df.drop(["tweet_id", "task2", "ID"], axis=1, inplace=True)
df.head()

Unnamed: 0,text,task1
0,"hate wen females hit ah nigga with tht bro 😂😂,...",HOF
1,RT @airjunebug: When you're from the Bay but y...,HOF
2,RT @DonaldJTrumpJr: Dear Democrats: The Americ...,NOT
3,RT @SheLoveTimothy: He ain’t on drugs he just ...,HOF
4,RT @TavianJordan: Summer ‘19 I’m coming for yo...,NOT


In [234]:
# Clean text with twitter-preprocessor
clean = []
for i,v in enumerate(df['text']):
    clean.append(p.clean(v))
df["text"] = clean

# Convert to lower case
df["text"] = [entry.lower() for entry in df["text"]]

# Expand Contractions
def expand_contractions(s):
    expanded = []
    for word in s.split():
        expanded.append(contractions.fix(word))
    return(' '.join(expanded))

df["text"] = [expand_contractions(entry) for entry in df['text']]

# Remove punctuation marks
df["text"] = [entry.translate(str.maketrans('', '', string.punctuation)) for entry in df["text"]]

# Tokenization
df['text']= [word_tokenize(entry) for entry in df['text']]

In [235]:
# TODO: Experiment if NOT removing stop words will improve model accuracy

# Remove Stop words, Non-Numeric and perfom Word Stemming/Lemmenting.
# WordNetLemmatizer requires Pos tags to understand if the word is noun or verb or adjective etc. By default it is set to Noun
tag_map = defaultdict(lambda : wn.NOUN)
tag_map['J'] = wn.ADJ
tag_map['V'] = wn.VERB
tag_map['R'] = wn.ADV
for index,entry in enumerate(df['text']):
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(entry):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    # The final processed set of words for each iteration will be stored in 'text_final'
    df.loc[index,'text_final'] = str(Final_words)

In [236]:
df.head()

Unnamed: 0,text,task1,text_final
0,"[hate, wen, females, hit, ah, nigga, with, tht...",HOF,"['hate', 'wen', 'female', 'hit', 'ah', 'nigga'..."
1,"[when, you, are, from, the, bay, but, you, are...",HOF,"['bay', 'really', 'ny', 'nigga', 'heart', 'w']"
2,"[dear, democrats, the, american, people, are, ...",NOT,"['dear', 'democrats', 'american', 'people', 's..."
3,"[he, are, not, on, drugs, he, just, bored, i, ...",HOF,"['drug', 'bore', 'shit', 'bore']"
4,"[summer, i, am, coming, for, you, no, boring, ...",NOT,"['summer', 'come', 'bore', 'shit', 'beach', 'd..."


In [237]:
# Split data 70-30
train_x, test_x, train_y, test_y = model_selection.train_test_split(df['text_final'],df['task1'],test_size=0.3)

In [238]:
# Categorical to Numerical Data
Encoder = LabelEncoder()
train_Y = Encoder.fit_transform(train_y)
test_Y = Encoder.fit_transform(test_y)

<h2>SVM Model<h2>

In [239]:
tfidf_vect = TfidfVectorizer(max_features=200000)
tfidf_vect.fit(df['text_final'])
train_x_tfidf = tfidf_vect.transform(train_x)
test_x_tfidf = tfidf_vect.transform(test_x)

In [240]:
# Classifier - Algorithm - SVM
# fit the training dataset on the classifier
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto')
SVM.fit(train_x_tfidf,train_y)
# predict the labels on validation dataset
predictions_SVM = SVM.predict(test_x_tfidf)
# Use accuracy_score function to get the accuracy
print("SVM Accuracy Score -> ",accuracy_score(predictions_SVM, test_y)*100)

SVM Accuracy Score ->  88.35666912306559


<H2>Create Pipeline</H2>
Pipelines will only contain vectorization and training

In [241]:
clf = make_pipeline(
    TfidfVectorizer(max_features=200000),
    svm.SVC(kernel='linear', probability=True)
)
clf = clf.fit(X=train_x, y=train_y)

In [242]:
pipeline_predict = clf.predict(test_x)
print("SVM Accuracy Score -> ",accuracy_score(pipeline_predict, test_y)*100)

SVM Accuracy Score ->  88.28297715549004


<h2>Exporting Model + Creation of preprocessing functions for API</h2>

In [249]:
# Create preprocessing function
def preprocess(text):
    # twitter-presprocessor
    text = p.clean(text)
    # lower case
    text = text.lower()
    # Expand Contractions
    expanded = []
    for word in text.split():
        expanded.append(contractions.fix(word))
    text = ' '.join(expanded)
    # Remove punctuation marks
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Tokenization
    text = word_tokenize(text)
    
    lemmanized = preprocess_part2(text)

    # return str(text)
    return lemmanized

def preprocess_part2(text):
    tag_map = defaultdict(lambda : wn.NOUN)
    tag_map['J'] = wn.ADJ
    tag_map['V'] = wn.VERB
    tag_map['R'] = wn.ADV
    # Declaring Empty List to store the words that follow the rules for this step
    Final_words = []
    # Initializing WordNetLemmatizer()
    word_Lemmatized = WordNetLemmatizer()
    # pos_tag function below will provide the 'tag' i.e if the word is Noun(N) or Verb(V) or something else.
    for word, tag in pos_tag(text):
        # Below condition is to check for Stop words and consider only alphabets
        if word not in stopwords.words('english') and word.isalpha():
            word_Final = word_Lemmatized.lemmatize(word,tag_map[tag[0]])
            Final_words.append(word_Final)
    return str(Final_words)


In [252]:
preprocess("You are the worst person in the world. You should kill yourself. I hope you die in pain")

"['bad', 'person', 'world', 'kill', 'hope', 'die', 'pain']"

In [259]:
arr = [preprocess("You are the worst person in the wasdorld. You should kill yourself. I hope you die in pain")]
# arr = pd.Series(arr)
print(clf.predict_proba(arr))
print(clf.predict(arr))

[[0.06220578 0.93779422]]
['NOT']


In [246]:
# Export model as joblib file
filename = "subtask1.joblib.z"
path = "../out/" + filename
dump(clf, path)

['../out/subtask1.joblib.z']