# Summary

1. [Librairies](#Librairies)
2. [Data](#Data)
3. [Text Preprocessing](#Text-Preprocessing)
4. [Model](#Model)

# Librairies

In [48]:
# Import classical libraries
import numpy as np
import pandas as pd

# Import AI related libraires
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Dense, Embedding, LSTM
from tensorflow.keras.callbacks import LambdaCallback
import tensorflow as tf

# Import nltk for text processing
from nltk.tokenize import word_tokenize
from nltk.text import Text
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem import WordNetLemmatizer
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Dyrudis\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Dyrudis\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Data

In [32]:
# Load the data
train = pd.read_csv('./data/train.csv')
test = pd.read_csv('./data/test.csv')

# Only keep the first 5000 rows
train = train[:5000]

# Print train
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0


# Text Preprocessing

In [33]:
# Lowercase the text
train['comment_text'] = train['comment_text'].str.lower()

# Print train
train.head(100)

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,explanation\nwhy the edits made under my usern...,0,0,0,0,0,0
1,000103f0d9cfb60f,d'aww! he matches this background colour i'm s...,0,0,0,0,0,0
2,000113f07ec002fd,"hey man, i'm really not trying to edit war. it...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"""\nmore\ni can't make any real suggestions on ...",0,0,0,0,0,0
4,0001d958c54c6e35,"you, sir, are my hero. any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...,...
95,003b9f448ee4a29d,"""\n\nthanks. i can see that violating clearly ...",0,0,0,0,0,0
96,003bd094feef5263,"""\nhi\nthanks for our kind words. see you arou...",0,0,0,0,0,0
97,003caacc6ce6c9e9,collusion in poker \n\nthis is regarded as mos...,0,0,0,0,0,0
98,003d77a20601cec1,"thanks much - however, if it's been resolved, ...",0,0,0,0,0,0


In [34]:
# Tokenize the text
tokenizer = nltk.RegexpTokenizer(r'\w+')
train['comment_text'] = train['comment_text'].apply(tokenizer.tokenize)

# Print train
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,"[explanation, why, the, edits, made, under, my...",0,0,0,0,0,0
1,000103f0d9cfb60f,"[d, aww, he, matches, this, background, colour...",0,0,0,0,0,0
2,000113f07ec002fd,"[hey, man, i, m, really, not, trying, to, edit...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"[more, i, can, t, make, any, real, suggestions...",0,0,0,0,0,0
4,0001d958c54c6e35,"[you, sir, are, my, hero, any, chance, you, re...",0,0,0,0,0,0


In [35]:
# Stopwords filtering
stopwords = nltk.corpus.stopwords.words()
train['comment_text'] = train['comment_text'].apply(lambda x: [word for word in x if word not in stopwords])

# Print train
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,"[explanation, edits, made, username, hardcore,...",0,0,0,0,0,0
1,000103f0d9cfb60f,"[aww, matches, background, colour, seemingly, ...",0,0,0,0,0,0
2,000113f07ec002fd,"[hey, edit, guy, constantly, removing, relevan...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"[make, real, suggestions, improvement, wondere...",0,0,0,0,0,0
4,0001d958c54c6e35,"[sir, hero, chance, remember, page]",0,0,0,0,0,0


In [36]:
# Lemmatization
Word_Lemmatizer = WordNetLemmatizer()
train['comment_text'] = train['comment_text'].apply(lambda x: [Word_Lemmatizer.lemmatize(word) for word in x])

# Print train
train.head()

Unnamed: 0,id,comment_text,toxic,severe_toxic,obscene,threat,insult,identity_hate
0,0000997932d777bf,"[explanation, edits, made, username, hardcore,...",0,0,0,0,0,0
1,000103f0d9cfb60f,"[aww, match, background, colour, seemingly, st...",0,0,0,0,0,0
2,000113f07ec002fd,"[hey, edit, guy, constantly, removing, relevan...",0,0,0,0,0,0
3,0001b41b1c6bb37e,"[make, real, suggestion, improvement, wondered...",0,0,0,0,0,0
4,0001d958c54c6e35,"[sir, hero, chance, remember, page]",0,0,0,0,0,0


In [37]:
# Vocabulary
vocabulary = set()
for sentence in train['comment_text']:
    for word in sentence:
        vocabulary.add(word)

print(len(vocabulary))

21614


In [38]:
# Bag of words
bag_of_words = {}
for word in vocabulary:
    bag_of_words[word] = 0

# Count the number of words
for sentence in train['comment_text']:
    for word in sentence:
        bag_of_words[word] += 1

# Print the 10 most common words
print(sorted(bag_of_words.items(), key=lambda x: x[1], reverse=True)[:10])

[('article', 2267), ('page', 1831), ('wikipedia', 1418), ('talk', 1114), ('as', 726), ('source', 718), ('user', 649), ('time', 594), ('edit', 578), ('make', 520)]


# Model

In [44]:
# Create a first model to predict the toxicity of a comment
embedding_dim = 100
max_length = 100
embeddings_matrix = np.zeros((len(vocabulary)+1, embedding_dim))

model = Sequential([
    Embedding(len(vocabulary)+1, embedding_dim, input_length=max_length, weights=[embeddings_matrix], trainable=False),
    LSTM(units=64),
    Dense(1, activation='sigmoid')
])

# Print the model summary
model.summary()

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 100, 100)          2161500   
                                                                 
 lstm (LSTM)                 (None, 64)                42240     
                                                                 
 dense_11 (Dense)            (None, 1)                 65        
                                                                 
Total params: 2,203,805
Trainable params: 42,305
Non-trainable params: 2,161,500
_________________________________________________________________


In [54]:
num_epochs = 1

training_comments = tf.convert_to_tensor(train['comment_text'], dtype=tf.float32)
training_labels = tf.convert_to_tensor(train['toxic'], dtype=tf.float32)

# Fit the model
model.fit(training_comments, training_labels, epochs=num_epochs, batch_size=32, verbose=1)

ValueError: setting an array element with a sequence.