# Data Exploration and Model 1

In [39]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from bertopic import BERTopic
from sentence_transformers import SentenceTransformer

%matplotlib inline

In [40]:
# Load the training CSV into the working DataFrame and display it.
df = pd.read_csv('input_data/train_clean_4.csv')
df

Unnamed: 0,comment_text,cleaned_text,label
0,Explanation\nWhy the edits made under my usern...,explanation why the edits made under my userna...,0
1,D'aww! He matches this background colour I'm s...,dawn he matches this background colour im seem...,0
2,"Hey man, I'm really not trying to edit war. It...",hey man im really not trying to edit war its j...,0
3,"""\nMore\nI can't make any real suggestions on ...",more i cant make any real suggestions on impro...,0
4,"You, sir, are my hero. Any chance you remember...",you sir are my hero any chance you remember wh...,0
...,...,...,...
184349,you's a muthaf***in lie &#8220;LifeAsKing: 20_...,yous a muthafin lie lifesaving pearls coreyema...,1
184350,"you've gone and broke the wrong heart baby, an...",youve gone and broke the wrong heart baby and ...,0
184351,young buck wanna eat.. dat nigguh like I aint ...,young buck wan na eat dat niggah like i aint f...,1
184352,youu got wild bitches tellin you lies,youu got wild bitches tellin you lies,1


In [3]:
# Count missing values per column (isna is the same check as isnull).
df.isna().sum()

comment_text    0
cleaned_text    7
label           0
dtype: int64

In [None]:
#df.mean()

In [None]:
#fig = px.bar(df.mean())
#fig.update_layout(
#    title="Percentage of each class",
#    xaxis_title="Class",
#    yaxis_title="Percentage",
#    showlegend=False)

In [36]:
# Bar chart of the share of each label value (class counts divided by the
# total number of rows).
fig = px.bar(df.label.value_counts() / len(df))
fig.update_layout(title='Percent of each class',
                  xaxis_title='Toxic',
                  yaxis_title='Percentage',
                  # The plotted values are fractions in [0, 1]; format the
                  # ticks as percentages so they agree with the axis title.
                  yaxis_tickformat='.0%',
                  showlegend=False)
# write_image does not create missing directories, so make sure the target
# folder exists before exporting (keeps a fresh checkout runnable).
os.makedirs("plots", exist_ok=True)
fig.write_image("plots/class_dist.png")

Based on this class distribution, should we apply some form of resampling?

In [59]:
#df['toxic_any'] =  (df[['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']].sum(axis=1) > 0).astype(int)

In [5]:
# Mean of the label column (the share of 1s when labels are 0/1).
df['label'].mean()

0.19986005185675385

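Even with all toxic categories grouped into a single label, only about 20% of the data is toxic (`df.label.mean()` above returns 0.1999; this note previously said 9.85%, which does not match that output — TODO confirm). We should upsample the minority class. As he commented, the high accuracy reported in the papers is due to class imbalance.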

## Let's try some topic modelling

In [6]:
# Both libraries document a list of strings as input, so materialise the
# column once instead of passing the pandas Series: integer lookups on a
# Series go by index label, which only equals position for a default index.
docs = df['comment_text'].tolist()

# Create embeddings
sentence_model = SentenceTransformer('sentence-transformers/paraphrase-xlm-r-multilingual-v1')
embeddings = sentence_model.encode(docs, show_progress_bar=True)

# Create topic model, reusing the precomputed embeddings
topic_model = BERTopic()
topics, _ = topic_model.fit_transform(docs, embeddings)

# Show list of clusters
topic_model.get_topic_info()

Batches:   0%|          | 0/5762 [00:00<?, ?it/s]

KeyboardInterrupt: 

In [53]:
# Interactive visualization
# Interactive visualization of the topics. Uses the topic_model created in the
# topic-modelling cell above, so that cell must have run to completion first.
topic_model.visualize_topics() 

## Preprocessing data

Text preprocessing is an approach for cleaning and preparing text data for use in a specific context. The ultimate goal of cleaning and preparing text data is to reduce the text to only the words that you need for your NLP goals.

Steps:

1) Remove punctuation and non-alphabetical characters

2) Remove words that appear too rarely (below the 1% frequency threshold) or too often (above the 99% threshold)

3) Apply lowercase to the text

4) Tokenize

5) Remove stop words

6) Stemming - bluntly removing prefixes and suffixes from a word

7) Lemmatization – replacing a single-word token with its root

In [30]:
import unicodedata
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk import word_tokenize

In [44]:
import unicodedata
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk import word_tokenize


def remove_numbers(text):
    '''Return ``text`` with every digit character dropped.'''
    kept_chars = (ch for ch in text if not ch.isdigit())
    return ''.join(kept_chars)

def remove_punctuation(text):
    '''Replace each ASCII punctuation character in ``text`` with one space.'''
    # Character class built from string.punctuation, escaped for use in a regex.
    punctuation_class = '[{}]'.format(re.escape(string.punctuation))
    return re.sub(punctuation_class, ' ', text)

def text_lowercase(text):
    '''Return a lower-cased copy of ``text``.'''
    lowered = text.lower()
    return lowered

def strip_whitespace(text):
    '''Collapse runs of whitespace to single spaces and trim both ends.'''
    words = text.split()
    return " ".join(words)

def remove_stopwords(text, lan='english', add_sw=None):
    '''
    Remove stop words from ``text``.

    Input:
    - text (str) text to filter
    - lan (str) language of the stop-word list; only 'english' is implemented
    - add_sw (iterable of str, optional) extra words to drop in addition to
      the NLTK list; matched case-insensitively like the built-in ones

    Output:
    - the remaining tokens joined by single spaces, or None (after printing a
      message) when ``lan`` is not 'english'
    '''
    if lan == 'english':
        sp_en = set(stopwords.words('english'))
        # Honour the caller-supplied extra stop words (previously ignored).
        sp_en.update(word.lower() for word in (add_sw or ()))
        words_tokens = word_tokenize(text)
        # The NLTK list is lower-case, so compare lower-cased tokens; otherwise
        # capitalised stop words ("The", "And") survive when the text has not
        # been lower-cased yet.
        text = [i for i in words_tokens if i.lower() not in sp_en]
        text = ' '.join(text)

    else:
        print('''Could not remove stopwords. Please specify a language. 
        Only available in English or Spanish''')
        return

    return text

def apply_stemming(text, lan):
    '''
    Stem every token of ``text`` with the English Snowball stemmer.

    Input:
    - text (str) text to stem
    - lan (str) language; only 'english' is handled

    Output:
    - the stemmed tokens joined by single spaces, or None (after printing a
      message) for any other language
    '''
    if lan != 'english':
        print('Stemmer only available in english or spanish')
        return

    stemmer_en = SnowballStemmer('english')
    stemmed_tokens = (stemmer_en.stem(token) for token in word_tokenize(text))
    return ' '.join(stemmed_tokens)

def clean_text(text, punctuation=True, numbers=True, lowercase=True, 
          whitespace=True, lan = 'english', stopwords = False, stemming = False, add_sw=[]):
    '''
    Apply the enabled cleaning steps to ``text`` in a fixed order.

    The order is: stop-word removal, punctuation removal, digit removal,
    lower-casing, whitespace normalisation, stemming. Each boolean flag
    switches the step of the same name on or off; ``lan`` and ``add_sw`` are
    forwarded to the stop-word and stemming helpers.

    Returns the text after the last enabled step.
    '''
    # (flag, transformation) pairs, listed in execution order.
    steps = (
        (stopwords, lambda t: remove_stopwords(t, lan = lan, add_sw=add_sw)),
        (punctuation, remove_punctuation),
        (numbers, remove_numbers),
        (lowercase, text_lowercase),
        (whitespace, strip_whitespace),
        (stemming, lambda t: apply_stemming(t, lan = lan)),
    )

    for enabled, step in steps:
        if enabled:
            text = step(text)

    return text

In [49]:
# Run every raw comment through clean_text (default options) and keep the
# results both as a list and as a new DataFrame column.
clean_comments = [clean_text(raw_comment) for raw_comment in df['comment_text']]
df['clean_comments'] = clean_comments

In [50]:
df

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate,toxic_any,clean_comments
0,73aee1d0f4ac3434,I you mistagged this one as it does not meet t...,0,0,0,0,0,0,False,i you mistagged this one as it does not meet t...
1,78bffc73bde6f503,How do you create a references section on the ...,0,0,0,0,0,0,False,how do you create a references section on the ...
2,aea668767345dcd1,Semi-protected edit request on 15 October 2015,0,0,0,0,0,0,False,semi protected edit request on october
3,6161150e0239aaff,"""\nOh, look how sweet he is. And what happened...",0,0,0,0,0,0,False,oh look how sweet he is and what happened with...
4,c94ad7853c443b4d,"""\n\n Taiwan \n\nShouldn't the """"Taiwan""""'s be...",0,0,0,0,0,0,False,taiwan shouldn t the taiwan s be changed to fo...
...,...,...,...,...,...,...,...,...,...,...
9995,4c9a5a2d483edb6f,St Mikes \n\nWe really should put up a St. Mik...,0,0,0,0,0,0,False,st mikes we really should put up a st mikes wi...
9996,597b324d55677463,if they dont score a milliuon goals for vinnie...,1,0,1,0,1,0,True,if they dont score a milliuon goals for vinnie...
9997,fe21ab6576f42b25,"""\n\n Defacing compliant sigs \n\nThis edit ha...",0,0,0,0,0,0,False,defacing compliant sigs this edit has been par...
9998,fb457ae943eb757f,image \ncan`t get his image to work. =[,0,0,0,0,0,0,False,image can t get his image to work


## Model 1

We will build a baseline model that always predicts the most common label in the data and see how it performs. This will be our lowest benchmark.

In [26]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

def model_1(df_test, label_col, most_common_label=0):
    '''
    Majority-class baseline: predict the same label for every row and print
    the resulting classification report.

    Input:
    - df_test (DataFrame) data to evaluate on
    - label_col (str) name of the column holding the true labels
    - most_common_label constant label predicted for every row (default 0)

    Output:
    - None; the classification report is printed
    '''
    true_labels = df_test[label_col]
    constant_preds = [most_common_label for _ in range(len(true_labels))]

    report = classification_report(true_labels, constant_preds)
    print(report)
    
    

In [27]:
# Evaluate the constant-prediction baseline against the 'label' column.
model_1(df, label_col='label')

              precision    recall  f1-score   support

           0       0.80      1.00      0.89     29445
           1       0.00      0.00      0.00      7327

    accuracy                           0.80     36772
   macro avg       0.40      0.50      0.44     36772
weighted avg       0.64      0.80      0.71     36772



In [15]:
# model_1 takes (df_test, label_col, most_common_label). The earlier positional
# call bound 'comment_text' to label_col and 'label' to most_common_label, so
# pass the label column explicitly.
model_1(df, label_col='label')

              precision    recall  f1-score   support

           0       0.81      1.00      0.89      7401
           1       0.00      0.00      0.00      1792

    accuracy                           0.81      9193
   macro avg       0.40      0.50      0.45      9193
weighted avg       0.65      0.81      0.72      9193

