### Import the library

In [1]:
import numpy as np
import pandas as pd 

from sklearn.feature_extraction.text import CountVectorizer
from sklearn import metrics

import string
import spacy
np.random.seed(42)

import warnings
warnings.filterwarnings('ignore')

### Loading the dataset

In [2]:
df = pd.read_csv('data.csv', encoding = 'latin-1')

In [3]:
# first 5 rows
df.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


In [4]:
# last 5 rows
df.tail()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
995,fc27bf5d8ed568e0,Take a look at this\n\nWikipedia:Disruptive ed...,0,0,0,0,0,0
996,fc2828355aed9e5e,Your Edits to Albert Einstein,0,0,0,0,0,0
997,fc284cc939fea168,RFC error \nFor some reason the replaced your...,0,0,0,0,0,0
998,fc29b8a68f192b65,"""\n Darwin Rebellion \n\nMy apologies for that...",0,0,0,0,0,0
999,fc2a808948207a7b,Ambrosi\nThank you for experimenting with the ...,0,0,0,0,0,0


In [5]:
# shape of the dataset
df.shape

(1000, 8)

In [6]:
# target column distribution
df['toxic'].value_counts()

0    500
1    500
Name: toxic, dtype: int64

In [7]:
nlp = spacy.load("en_core_web_sm")
stop_words = nlp.Defaults.stop_words
print(stop_words)

{'nine', 'formerly', 'nothing', 'other', 'either', 'ourselves', 'no', 'into', 'front', 'through', 'seem', 'hereby', 'under', 'become', '‘d', 'became', 'off', 'becomes', 'less', 'please', 'whereby', 'out', 'them', 'therefore', 'former', 'two', 'every', 'moreover', 'around', 'various', 'nobody', "'ve", 'full', 'against', "n't", 'your', 'we', 'a', 'whole', 'thereupon', 'whereas', 'move', "'d", 'made', 'four', 'amongst', "'m", 'amount', 'else', 'themselves', 'sometime', 'what', 'beyond', 'show', 'yourselves', 'which', 'mostly', 'still', 'thru', 'becoming', 'forty', 'behind', 'from', 'whenever', 'done', 'everything', 'now', 'you', 'without', 'otherwise', 'anyone', 'an', 'himself', 'up', 'more', 'something', 'anyway', 'not', 'rather', 'many', 'least', 'whereafter', 'six', 'others', 'five', 'could', 'then', 'get', 'three', 'will', 'thereafter', 'and', 'both', 'empty', 'wherever', 'he', 'noone', 'whither', 'yours', 'since', 'just', 'yet', 'sometimes', 'why', 'same', 'along', 'latterly', 'as', 

In [8]:
punctuations = string.punctuation
print(punctuations)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [9]:
# Creating our tokenizer function

def spacy_tokenizer(sentence):
    
    # Creating our token object, which is used to create documents with linguistic annotations.
    doc = nlp(sentence)

    # Lemmatizing each token and converting each token into lowercase
    mytokens = [ word.lemma_.lower().strip() for word in doc ]

    # print(mytokens)

    # Removing stop words
    mytokens = [ word for word in mytokens if word not in stop_words and word not in punctuations ]

    # return preprocessed list of tokens
    return mytokens

In [10]:
sentence = "I am eating apple ?"
spacy_tokenizer(sentence)

['eat', 'apple']

In [11]:
## Count Vectorizer

count_vectorizer = CountVectorizer(tokenizer = spacy_tokenizer)

In [12]:
count_vectorizer.fit_transform(["I am eating apple, I like apple","I am playing cricket"]).toarray()

array([[2, 0, 1, 1, 0],
       [0, 1, 0, 0, 1]], dtype=int64)

In [13]:
count_vectorizer.get_feature_names_out()

array(['apple', 'cricket', 'eat', 'like', 'play'], dtype=object)

In [14]:
count_vectorizer.vocabulary_

{'eat': 2, 'apple': 0, 'like': 3, 'play': 4, 'cricket': 1}

In [15]:
# Splitting the data into independent and dependent

x = df['comment_text']
y = df['toxic']

In [16]:
# Splitting the data into train and test

from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2, stratify = y, random_state = 1)

In [17]:
X_train= count_vectorizer.fit_transform(x_train)
X_test = count_vectorizer.transform(x_test)

In [18]:
X_train

<800x6068 sparse matrix of type '<class 'numpy.int64'>'
	with 16476 stored elements in Compressed Sparse Row format>

In [19]:
X_train.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 3, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], dtype=int64)

In [20]:
X_train.shape

(800, 6068)

In [21]:
X_test.shape

(200, 6068)

### Logistic Regression

In [22]:
from sklearn.linear_model import LogisticRegression
model = LogisticRegression()

In [23]:
model.fit(X_train, y_train)

In [24]:
y_pred = model.predict(X_test)

### Evaluation Metrics

In [25]:
from sklearn.metrics import accuracy_score, precision_score, recall_score

In [26]:
Accuracy = accuracy_score(y_test, y_pred)
print("Acuracy ", Accuracy)

Acuracy  0.83


In [27]:
Precision = precision_score(y_test, y_pred)
print("Precision ", Precision)

Precision  0.8055555555555556


In [28]:
Recall = recall_score(y_test, y_pred)
print("Recall ", Recall)

Recall  0.87
