In [2]:
import pandas as pd
import numpy as np
import plotly.express as px

from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import re
from nltk.stem.snowball import SnowballStemmer
import tqdm

In [3]:
data=pd.read_csv("bad_comment.csv")

In [4]:
comments = data['comment'].values
labels = data['label'].values

In [5]:
text_cleaning = "@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"
stemmer = SnowballStemmer('english', ignore_stopwords=False)

In [6]:
def preprocess_data(text):
    text = re.sub(text_cleaning, ' ', str(text).lower()).strip()
    text = stemmer.stem(str(text))
    return text

X = []
for i in tqdm.tqdm(range(len(comments))):
    X.append(preprocess_data(comments[i]))

100%|██████████| 1010826/1010826 [00:31<00:00, 31969.33it/s]


In [7]:
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts(X)

In [8]:
sequences = tokenizer.texts_to_sequences(X)
padded = pad_sequences(sequences, padding='post', maxlen=20)

In [15]:
xtrain, xtest, ytrain, ytest = train_test_split(X, np.array(labels))

In [16]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [17]:
print(f"Actual Sentence: {comments[860]}\nStemmed Sentence: {X[860]}\nTokenized: {sequences[860]}\nPadded: {padded[860]}")

Actual Sentence: This comment makes me so hungry
Stemmed Sentence: this comment makes me so hungri
Tokenized: [20, 387, 181, 71, 25, 9377]
Padded: [  20  387  181   71   25 9377    0    0    0    0    0    0    0    0
    0    0    0    0    0    0]


In [22]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
clf = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('nb', MultinomialNB())
])

In [20]:
xtrain

['c mon he was just taking a picture of his friend not the band',
 'no',
 'agre',
 'she s probably on a ketogenic diet',
 'driving like dick bags on a public road awesom',
 'specs idk really bad computer from 2006 64tick 40 50fps 128 30 40fps this is the best computer you can get',
 'i mean you did post it',
 'lol haven t heard that one in a whil',
 'maybe sully lost so much weight he can reclassify as a small forward',
 'so you want bernie supporters to vote for a third party candidate instead of hillary to ensure a republican gets to nominate supreme court justices for the next 4 8 year',
 'i don t know man mignolet has shown time and time again that he can be counted on',
 'you mean non religious bel',
 'it s striking to see the water line along the mountains in the desert palm springs california area',
 'i mean it was in a drink called a blow job',
 'but it violates muh human rights to live in a world where this movie isn t being shown',
 'thode din mein ww2 hone vala hai modi hitl

In [24]:
clf.fit(xtrain,ytrain)

Pipeline(steps=[('vectorizer', CountVectorizer()), ('nb', MultinomialNB())])

In [25]:
clf.score(xtest,ytest)

0.6774921153747225

In [None]:
import sklearn.