# Baseline Model for Detecting Toxic Comments 

In [2]:
import pandas as pd 
from tqdm.auto import tqdm
import re


In [10]:
training_raw = pd.read_csv('./data/processed_binary_dataset.csv')

## 1. Data Exploration 

In [7]:
def transformDataset(training_raw, test=None):

    toxic = 0
    non_toxic = 0
    toxicity = []
    data = {'comment' : [],
            'toxic' : [],
            'toxicity' : []
           }

    for i in tqdm(range(len(training_raw))):

        comment = training_raw.loc[i].comment_text
        values = training_raw.loc[i][2:].values
        data['comment'].append(comment)
        
        if max(values) == 1:
            toxic +=1
            data['toxic'].append(1)

        else:
            non_toxic += 1
            data['toxic'].append(0)

        data['toxicity'].append(sum(values))

    return data

In [8]:
transformed_raw = transformDataset(training_raw)

  0%|          | 0/159571 [00:00<?, ?it/s]

In [1]:
transformed_raw

NameError: name 'transformed_raw' is not defined

## 2. Balance Dataset

In [9]:
transformed_df = pd.DataFrame(transformed_raw)

In [10]:
from utils.data_processing import balance_dataset

In [11]:
training_x, training_y = balance_dataset(transformed_df, MAX_DF=20000, MAX_TOXIC=16000)

  0%|          | 0/159571 [00:00<?, ?it/s]

Num Toxic: 16000
Num Non-toxic: 20000


## 3. Preprocessing

In [12]:
bad_strings = re.compile(r'[\n"\.,]')
encodings = re.compile(u'\\?[a-z]+[0-9]·')

def updateVec(row):
    
    vec = []
    
    vec.append(row.toxic)
    vec.append(row.severe_toxic)
    vec.append(row.obscene)
    vec.append(row.threat)
    vec.append(row.insult)
    vec.append(row.identity_hate)
    
    return vec

def preprocessString(input_string):
    
    string = re.sub(bad_strings, '' , input_string)
    string = string.lower()
    string = re.sub(encodings, '', string)
    
    return string

def processRawDataFromCSV(input_csv, test=None):
    
    x = [] 
    y = []
    
    for i in tqdm(range(len(input_csv))):
        row = input_csv.loc[i]
        x.append(preprocessString(row.comment_text))
        
        if not test:
            y.append(updateVec(row))
        
    return x, y

def processRawDataFromList(x):
    
    return [preprocessString(i) for i in tqdm(x)]

In [13]:
training_x_processed = processRawDataFromList(training_x)

  0%|          | 0/36000 [00:00<?, ?it/s]

In [14]:
from utils.preprocessing import buildVocab

In [15]:
vocab, word_counts = buildVocab(training_x_processed)

## 4. Encode Data

In [16]:
from utils.dataEncoder import DataEncoder

In [17]:
data_encoder = DataEncoder(training_x_processed, modelFormat ='svm', vocab=vocab,threshold=None,
                       max_num=None, min_num= None, word_counts=word_counts, 
                 pretrained=None, pretrained_dim=None )

In [18]:
encoded_train = data_encoder.encode(test=None)

## Support Vector Machine 

In [22]:
from sklearn.feature_extraction.text import CountVectorizer

In [23]:
vectorizer = CountVectorizer(analyzer='word', 
                             ngram_range=(1, 2),
                             stop_words='english',
                             min_df=5)

In [24]:
X = vectorizer.fit_transform(training_x_processed)

In [25]:
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

In [26]:
clf = make_pipeline(SVC(gamma='auto',kernel='linear', verbose=True))

In [27]:
clf.fit(X, training_y)

[LibSVM]

Pipeline(steps=[('svc', SVC(gamma='auto', kernel='linear', verbose=True))])

In [28]:
test = ['I hope you have a nice day you are a really great person']

test_vec = vectorizer.transform(test)

In [29]:
def test(input_string):
    test_vec = vectorizer.transform([input_string])
    pred(clf, test_vec)

def pred(clf, test_vec):
    if clf.predict(test_vec).item() == 1:
        print('Toxic')
    else:
        print('Not Toxic')

## Testing Model

In [37]:
import pandas as pd
from tqdm.auto import tqdm
import random

In [38]:
eval_data = pd.read_csv('./data/processed_test.csv')
train_x, train_y = balance_dataset(eval_data, MAX_TOXIC=6243, MAX_DF=15000)

  0%|          | 0/153164 [00:00<?, ?it/s]

Num Toxic: 6243
Num Non-toxic: 15000


In [40]:
from torch.utils.data import DataLoader
import torch
from utils.preprocessing import rnnDataset

In [42]:
X = vectorizer.transform(train_x)

In [44]:
preds = clf.predict(X)

In [46]:
from sklearn.metrics import confusion_matrix, classification_report

array([1, 1, 1, ..., 0, 1, 1])

In [56]:
print(classification_report(train_y, preds))

              precision    recall  f1-score   support

           0       0.96      0.72      0.82     15000
           1       0.58      0.92      0.71      6243

    accuracy                           0.78     21243
   macro avg       0.77      0.82      0.77     21243
weighted avg       0.85      0.78      0.79     21243



## Save Model 

In [34]:
import pickle

In [35]:
model_name = 'svm_baseline.pkl'
folder = './models'

def save_model(model_name, folder, model):
    with open(f'{folder}/{model_name}', 'wb') as file:

        pickle.dump(model, file)
        
    print(f'Model outputted to {folder}/{model_name}')

In [36]:
save_model(model_name, folder, clf)

Model outputted to ./models/svm_baseline.pkl
