# NLP and Text Classification

For this project you will need to classify some angry comments into their respective category of angry. The process that you'll need to follow is (roughly):
<ol>
<li> Use NLP techniques to process the training data. 
<li> Train model(s) to predict which class(es) each comment is in.
    <ul>
    <li> A comment can belong to any number of classes, including none. 
    </ul>
<li> Generate predictions for each of the comments in the test data. 
<li> Write the test data predicitions to a CSV file. 
</ol>

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import SnowballStemmer

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC

from sklearn.ensemble import BaggingClassifier
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix


from imblearn.under_sampling import RandomUnderSampler
from imblearn.over_sampling import SMOTE
from imblearn.over_sampling import SVMSMOTE
from imblearn.over_sampling import ADASYN


### Load Training Data

In [4]:
train_df = pd.read_csv("../Data/train.csv")
train_df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [9]:
target_columns = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

### Data Exploration

In [10]:
# Function dataExplore() helps view the data distribution for the targets.
def dataExplore(data, target_columns):
    import ipywidgets as widgets
    
    #initiate the widget objects into a list
    outs = [widgets.Output() for i in range(len(target_columns))]
    tab = widgets.Tab(children = outs)
    
    #Set the title of each widget tab
    for i, target in enumerate(target_columns):    
        tab.set_title(i, target)
    display(tab)
    
    for i, target in enumerate(target_columns):
        with outs[i]:
            value_count = data[target].value_counts()
            print(value_count)
            plt.pie(value_count, labels=value_count.index, autopct='%1.1f%%')
            plt.title(f"'{target}' comment distribution")
            plt.show()

In [11]:
dataExplore(train_df, target_columns)

Tab(children=(Output(), Output(), Output(), Output(), Output(), Output()), selected_index=0, titles=('toxic', …

### Setting up Preprocessing and Tokenizers

In [12]:
# Preprocess text to remove punctuations
def preprocess_text(text):
    # Lowercase the text
    text = text.lower()
    
    # Tokenization
    tokens = word_tokenize(text)
    
    # Remove punctuation
    tokens = [token for token in tokens if token not in string.punctuation]
    
    # Join tokens back into text
    preprocessed_text = ' '.join(tokens)
    
    return preprocessed_text

In [13]:
stop_words = set(stopwords.words('english')) 

# Lemmatization
class lemmaTokenizer(object):
    def __init__(self, stop_words):
        self.stop_words = stop_words
        from nltk.stem import WordNetLemmatizer
        self.lemmatizer = WordNetLemmatizer()
    def __call__(self, doc):
        tokens = word_tokenize(doc)
        filtered_tok = []
        for tok in tokens:
            if tok not in stop_words:
                tok = re.sub('\W+','', tok) #Punctuation strip
                tmp = self.lemmatizer.lemmatize(tok)
                if len(tmp) >= 2:
                    filtered_tok.append(tmp)
        preprocessed_text = ' '.join(filtered_tok)
        return preprocessed_text

In [14]:
class swTokenizer(object):
    def __init__(self, stop_words):
        self.stop_words = stop_words
    def __call__(self, doc):
        tokens = word_tokenize(doc)
        filtered_tok = []
        for tok in tokens:
            if tok not in stop_words:
                filtered_tok.append(tok)
        return filtered_tok

In [15]:
class stemTokenizer(object):
    def __init__(self, stop_words):
        self.stop_words = stop_words
        self.stemmer = SnowballStemmer(language='english')
    def __call__(self, doc):
        tokens = word_tokenize(doc)
        filtered_tok = []
        for tok in tokens:
            if tok not in stop_words:
                filtered_tok.append(self.stemmer.stem(tok))
        return filtered_tok

In [16]:
# Function nlpModeling() below incoporates a widget function to help with visualization of the modeling performance 
#for the various targets. This function is specifically used for initial model evaluation
def nlpModeling(X, df, targets, model):
    import ipywidgets as widgets

    #initiate the widget objects into a list
    outs = [widgets.Output() for i in range(len(targets))]
    tab = widgets.Tab(children = outs)
    
    #Set the title of each widget tab
    for i, target in enumerate(targets):    
        title = target
        tab.set_title(i, title)
    display(tab)
    
    # Model fitting and prediction
    predictions = {}
    fit_models = {}
    for i, target in enumerate(targets):
        y = df[target]
        X_train, X_test, y_train, y_test = train_test_split(X, y)
        fit_model = model

        fit_models[target] = fit_model.fit(X_train, y_train)
        preds = fit_model.predict(X_test)
        predictions[target] = preds
        with outs[i]:
            print(f'Performance for {target}: Model - {fit_model}')
            print(classification_report(y_test, preds))
            sns.heatmap(confusion_matrix(y_test, preds), annot=True)
            plt.show()
    return fit_models, predictions

In [17]:
# Set-up vectorizer, features and target
vectorizer  = TfidfVectorizer(max_features=1000, stop_words=stop_words, ngram_range=[1,3])
train_df['cleaned_text'] = train_df['comment_text'].apply(lemmaTokenizer(stop_words))
X = vectorizer.fit_transform(train_df['cleaned_text'])

### 1. NLP with LogisticRegression

In [18]:
#Classificationn model
model= LogisticRegression(max_iter = 2000, )
reg_preds = nlpModeling(X, train_df, target_columns, model=model)

Tab(children=(Output(), Output(), Output(), Output(), Output(), Output()), selected_index=0, titles=('toxic', …

### 2. NLP with DecisionTreeClassifier

In [19]:
model1= DecisionTreeClassifier()
tree_preds = nlpModeling(X, train_df, target_columns, model=model1)

Tab(children=(Output(), Output(), Output(), Output(), Output(), Output()), selected_index=0, titles=('toxic', …

### 3. NLP with Random Forest

In [20]:
model2= RandomForestClassifier()
forest_preds = nlpModeling(X, train_df, target_columns, model=model2)

Tab(children=(Output(), Output(), Output(), Output(), Output(), Output()), selected_index=0, titles=('toxic', …

### 4. NLP with Linear SVC

In [21]:
model3 = LinearSVC()
bag_preds = nlpModeling(X, train_df, target_columns, model=model3)

Tab(children=(Output(), Output(), Output(), Output(), Output(), Output()), selected_index=0, titles=('toxic', …

### 5. Bagging with Logistic regression

In [22]:
model4 = BaggingClassifier(base_estimator=LogisticRegression(max_iter=10000),n_estimators=10, random_state=0)
bag_preds = nlpModeling(X, train_df, target_columns, model=model4)

Tab(children=(Output(), Output(), Output(), Output(), Output(), Output()), selected_index=0, titles=('toxic', …

### 6. Bagging with Linear SVC

In [23]:
model5 = BaggingClassifier(base_estimator=LinearSVC(),n_estimators=10, random_state=0)
bag_preds = nlpModeling(X, train_df, target_columns, model=model5)

Tab(children=(Output(), Output(), Output(), Output(), Output(), Output()), selected_index=0, titles=('toxic', …

Comparing the base performances from the above set of models, we can see that the LogisticRegression seems to perform a better job on the average than all the other models used. We therefore choose the LogisticRegression as our base model.

We also note that the classification scores for the non-rude comments are better, compared to the case of the rude comments. This is particularly true for the severe, threat, and identity_hate classified comments with regards to the Recall and f1 scores. This is likely due to the fact that cases of these comments are rare meaning less data of these types in the training dataset, as pictured in the distribution pie chart under data exploration. To improve performance therefore, we will look to use some sampling techniques.
After this, The next step will be to tune the performance of the Logistic regression with a grid search.

### Sampling

Now we will try to sample the rude comments 

In [24]:
# Function nlpModeling() below incoporates a widget function to help with visualization of the modeling performance 
#for the various targets. This function is specifically used for initial model evaluation
def nlpSampling(X, df, targets, model, sampler):
    import ipywidgets as widgets

    #initiate the widget objects into a list
    outs = [widgets.Output() for i in range(len(targets))]
    tab = widgets.Tab(children = outs)
    
    #Set the title of each widget tab
    for i, target in enumerate(targets):    
        title = target
        tab.set_title(i, title)
    display(tab)
    
    # Model fitting and prediction
    predictions = {}
    fit_models = {}
    for i, target in enumerate(targets):
        y = df[target]
        X_train, X_test, y_train, y_test = train_test_split(X, y)
        X_train, y_train = sampler.fit_resample(X_train, y_train)
        fit_model = model

        fit_models[target] = fit_model.fit(X_train, y_train)
        preds = fit_model.predict(X_test)
        predictions[target] = preds
        with outs[i]:
            print(f'Performance for {target}: Model - {fit_model}')
            print(classification_report(y_test, preds))
            sns.heatmap(confusion_matrix(y_test, preds), annot=True)
            plt.show()
    return fit_models, predictions

### Logistic regression with RandomUnderSampler

In [25]:
model= LogisticRegression(max_iter = 2000)
reg_preds1 = nlpSampling(X, train_df, target_columns, model=model, sampler=RandomUnderSampler())

Tab(children=(Output(), Output(), Output(), Output(), Output(), Output()), selected_index=0, titles=('toxic', …

### Logistic regression with SMOTE

In [26]:
model= LogisticRegression(max_iter = 2000)
reg_preds2 = nlpSampling(X, train_df, target_columns, model=model, sampler=SMOTE())

Tab(children=(Output(), Output(), Output(), Output(), Output(), Output()), selected_index=0, titles=('toxic', …

### Logistic regression with ADASYN

In [27]:
model= LogisticRegression(max_iter = 2000)
reg_preds3 = nlpSampling(X, train_df, target_columns, model=model, sampler=ADASYN())

Tab(children=(Output(), Output(), Output(), Output(), Output(), Output()), selected_index=0, titles=('toxic', …

### Decision tree with RandomUnderSampler

In [28]:
#Decision tree with sampling
model1= DecisionTreeClassifier()
tree_preds1 = nlpSampling(X, train_df, target_columns, model=model1, sampler=RandomUnderSampler())

Tab(children=(Output(), Output(), Output(), Output(), Output(), Output()), selected_index=0, titles=('toxic', …

### Decision tree with SMOTE

In [29]:
model1= DecisionTreeClassifier()
tree_preds2 = nlpSampling(X, train_df, target_columns, model=model1, sampler=SMOTE())

Tab(children=(Output(), Output(), Output(), Output(), Output(), Output()), selected_index=0, titles=('toxic', …

### Decision tree with ADASYN

In [30]:
model1= DecisionTreeClassifier()
tree_preds3 = nlpSampling(X, train_df, target_columns, model=model1, sampler=ADASYN())

Tab(children=(Output(), Output(), Output(), Output(), Output(), Output()), selected_index=0, titles=('toxic', …

### RandomForest with RandomUnderSampler

In [31]:
model2= RandomForestClassifier()
forest_preds1 = nlpSampling(X, train_df, target_columns, model=model2, sampler=RandomUnderSampler())

Tab(children=(Output(), Output(), Output(), Output(), Output(), Output()), selected_index=0, titles=('toxic', …

### RandomForest with SMOTE

In [32]:
model2= RandomForestClassifier()
forest_preds2 = nlpSampling(X, train_df, target_columns, model=model2, sampler=SMOTE())

Tab(children=(Output(), Output(), Output(), Output(), Output(), Output()), selected_index=0, titles=('toxic', …

### LinearSVC with SVMSMOTE

In [33]:
#LinearSVC with sampling
model4= LinearSVC()
svc_preds1 = nlpSampling(X, train_df, target_columns, model=model4, sampler=SVMSMOTE())

Tab(children=(Output(), Output(), Output(), Output(), Output(), Output()), selected_index=0, titles=('toxic', …

### LinearSVC with SMOTE

In [34]:
model4= LinearSVC()
svc_preds2 = nlpSampling(X, train_df, target_columns, model=model4, sampler=SMOTE())


Tab(children=(Output(), Output(), Output(), Output(), Output(), Output()), selected_index=0, titles=('toxic', …

After several trial attempts to improve the precision and f1 scores of predictions using different methods including sampling and bagging, we see that the model with the best performance is still the base logistic regression alongside LinearSVC. we will therefore do our classification of the test data using the base Logistic regression.

### Final Model and predictions

In [35]:
# Function nlpFitting() below is used for building the model and making actual predictions
def nlpFitting(train_data, feature, targets, test_data, score=False,  model=LogisticRegression(max_iter = 2000)):
    import ipywidgets as widgets

    #initiate the widget objects into a list
    outs = [widgets.Output() for i in range(len(targets))]
    tab = widgets.Tab(children = outs)
    
    #Set the title of each widget tab
    for i, target in enumerate(targets):    
        title = target
        tab.set_title(i, title)
    display(tab)
    
    # Model fitting and prediction
    vectorizer  = TfidfVectorizer(max_features=3000)
    train_data['cleaned_text'] = train_data[feature].apply(preprocess_text)
    X_train = vectorizer.fit_transform(train_data['cleaned_text'])
    test_data['cleaned_text'] = test_data[feature].apply(preprocess_text)
    X_test = vectorizer.fit_transform(test_data['cleaned_text'])
    
    for i, target in enumerate(targets):
        y_train = train_data[target]
        model.fit(X_train, y_train)
        y_preds = model.predict(X_test)
        test_data[target] = y_preds
        with outs[i]:
            if score:
                y_preds1 = model.predict(X_train)
                y_test=train_data[target]
                print(classification_report(y_test, y_preds1))
                sns.heatmap(confusion_matrix(y_test, y_preds1), annot=True)
                plt.show()
            
            print(test_data[target].value_counts())
    test_data.drop(columns=['comment_text', 'cleaned_text'], axis=1, inplace=True)        
    return test_data

## Test Data

In [36]:
test_df = pd.read_csv("test.csv")
test_df.head()

Unnamed: 0,id,comment_text
0,1,Yo bitch Ja Rule is more succesful then you'll...
1,2,== From RfC == \n\n The title is fine as it is...
2,3,""" \n\n == Sources == \n\n * Zawe Ashton on Lap..."
3,4,":If you have a look back at the source, the in..."
4,5,I don't anonymously edit articles at all.


In [37]:
test_df.shape

(153164, 2)

## Output

In [38]:
# Generate result calling the nlp(Fitting() method
result = nlpFitting(train_data=train_df, feature='comment_text', targets=target_columns, test_data=test_df, score=True)

# Write result to csv with the name "out.csv"
result.to_csv('out.csv', index=False)  

Tab(children=(Output(), Output(), Output(), Output(), Output(), Output()), selected_index=0, titles=('toxic', …

In [39]:
result.head()

Unnamed: 0,id,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,1,0,0,0,0,0,0
1,2,0,0,0,0,0,0
2,3,0,0,0,0,0,0
3,4,0,0,0,0,0,0
4,5,0,0,0,0,0,0
