# **Toxic Comment Classification**

## Importing the libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score

## Importing the dataset

In [None]:
# One-time Kaggle credential setup and dataset download. The shell commands are
# disabled by wrapping them in a string literal, so running this cell only
# echoes the string back as its output.
# NOTE(review): `!unzip train.csv` looks like it should target the downloaded
# archive rather than the CSV itself — confirm before re-enabling.
'''
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json
!ls ~/.kaggle

!kaggle competitions download -c jigsaw-toxic-comment-classification-challenge
!unzip train.csv
'''

'\n!mkdir -p ~/.kaggle\n!cp kaggle.json ~/.kaggle/\n!chmod 600 ~/.kaggle/kaggle.json\n!ls ~/.kaggle\n\n!kaggle competitions download -c jigsaw-toxic-comment-classification-challenge\n!unzip train.csv\n'

Dataset source: [Jigsaw Toxic Comment Classification Challenge data](https://www.kaggle.com/c/jigsaw-toxic-comment-classification-challenge/data)

In [None]:
# Load the training data; expects train.csv in the current working directory.
dataSet = pd.read_csv('train.csv')

# Bare last expression so the notebook renders the frame as a table.
dataSet

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
159566,ffe987279560d7ff,""":::::And for the second time of asking, when ...",0,0,0,0,0,0
159567,ffea4adeee384e90,You should be ashamed of yourself \n\nThat is ...,0,0,0,0,0,0
159568,ffee36eab5c267c9,"Spitzer \n\nUmm, theres no actual article for ...",0,0,0,0,0,0
159569,fff125370e4aaaf3,And it looks like it was actually you who put ...,0,0,0,0,0,0


In [None]:
# Print column dtypes and non-null counts to check the schema and spot missing values.
dataSet.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 159571 entries, 0 to 159570
Data columns (total 8 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   id             159571 non-null  object
 1   comment_text   159571 non-null  object
 2   toxic          159571 non-null  int64 
 3   severe_toxic   159571 non-null  int64 
 4   obscene        159571 non-null  int64 
 5   threat         159571 non-null  int64 
 6   insult         159571 non-null  int64 
 7   identity_hate  159571 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 9.7+ MB


In [None]:
# Model input: the raw comment strings, extracted as a NumPy array.
X = dataSet['comment_text'].values

In [None]:
# Select the label columns by name rather than by position (`iloc[:, 2:]`), so
# an extra or reordered column in the CSV cannot silently leak into the target;
# a missing label column now raises a KeyError instead.
LABEL_COLUMNS = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

# Binary target: 1 if the comment carries at least one of the labels, else 0.
y = dataSet[LABEL_COLUMNS].any(axis=1).astype('int8')

## Text Pre-processing

In [None]:
import re
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
import os

def clean_data(dataSet, max_word_len=200):
    """Normalise raw comments into space-joined, stemmed tokens.

    Each text is lower-cased, every character that is not an ASCII letter is
    replaced by a space, and the result is split on whitespace. English
    stopwords and over-long tokens are dropped; the remaining words are
    Porter-stemmed and re-joined with single spaces.

    Args:
        dataSet: Iterable of comment strings.
        max_word_len: Tokens with this many characters or more are discarded
            (default 200, the previously hard-coded limit).

    Returns:
        list[str]: One cleaned string per input text, in input order. A text
        with no surviving tokens yields an empty string.
    """
    ps = PorterStemmer()
    # Build the stopword set once. It was previously rebuilt for every single
    # word, which dominated the runtime on a corpus of this size.
    stop_words = set(stopwords.words('english'))

    corpus = []
    for text in dataSet:
        # Keep letters only, then split the text into words.
        words = re.sub(r"[^a-zA-Z]", " ", text.lower()).split()
        # Remove stopwords and over-long tokens, then stem what is left.
        stems = [
            ps.stem(word)
            for word in words
            if len(word) < max_word_len and word not in stop_words
        ]
        corpus.append(' '.join(stems))
    return corpus

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [None]:
# Always clean from the raw column rather than from `X` itself, so re-running
# this cell does not stem already-cleaned text a second time.
X = clean_data(dataSet['comment_text'].values)

In [None]:
# Sanity check: cleaning appends one entry per input comment, so the length
# should equal the number of rows loaded.
len(X)

159571

In [None]:
# Pair each cleaned comment with its binary label in one constructor call.
# Unlike filling an empty frame column by column, this raises immediately if
# the texts and labels differ in length instead of silently aligning or padding.
df = pd.DataFrame({'X': X, 'y': y})

df

Unnamed: 0,X,y
0,explan edit made usernam hardcor metallica fan...,0
1,aww match background colour seemingli stuck th...,0
2,hey man realli tri edit war guy constantli rem...,0
3,make real suggest improv wonder section statis...,0
4,sir hero chanc rememb page,0
...,...,...
159566,second time ask view complet contradict covera...,0
159567,asham horribl thing put talk page,0
159568,spitzer umm there actual articl prostitut ring...,0
159569,look like actual put speedi first version dele...,0


In [None]:
# Keep only rows whose cleaned text is longer than one character, i.e. drop
# comments that were reduced to nothing or a single letter by the cleaning step.
has_text = df['X'].str.len() > 1
df = df.loc[has_text]

# Re-derive features and labels from the filtered frame so they stay aligned.
X, y = df['X'], df['y']

In [None]:
# Display the filtered frame; the original row index is kept, not renumbered.
df

Unnamed: 0,X,y
0,explan edit made usernam hardcor metallica fan...,0
1,aww match background colour seemingli stuck th...,0
2,hey man realli tri edit war guy constantli rem...,0
3,make real suggest improv wonder section statis...,0
4,sir hero chanc rememb page,0
...,...,...
159566,second time ask view complet contradict covera...,0
159567,asham horribl thing put talk page,0
159568,spitzer umm there actual articl prostitut ring...,0
159569,look like actual put speedi first version dele...,0


In [None]:
# Persist the cleaned text and labels; index=False keeps the row index out of the file.
df.to_csv('data.csv', index=False)

In [None]:
# Copy the cleaned CSV into the Drive folder as a backup.
# NOTE(review): the absolute /content/drive path presumably assumes a Colab
# runtime with Google Drive already mounted — confirm, or make it configurable.
import shutil
shutil.copy('data.csv', '/content/drive/MyDrive')

'/content/drive/MyDrive/data.csv'

In [None]:
# Read the saved file back to confirm the CSV round-trips; the index is
# renumbered from 0 because it was not written to the file.
pd.read_csv('data.csv')

Unnamed: 0,X,y
0,explan edit made usernam hardcor metallica fan...,0
1,aww match background colour seemingli stuck th...,0
2,hey man realli tri edit war guy constantli rem...,0
3,make real suggest improv wonder section statis...,0
4,sir hero chanc rememb page,0
...,...,...
159509,second time ask view complet contradict covera...,0
159510,asham horribl thing put talk page,0
159511,spitzer umm there actual articl prostitut ring...,0
159512,look like actual put speedi first version dele...,0
