In [None]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
import seaborn as sns

In [None]:
# Read the training and test CSVs from the working directory.
df_train, df_test = (pd.read_csv(csv_name) for csv_name in ("train.csv", "test.csv"))

In [None]:
from keras.preprocessing.text import Tokenizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

# Cache for the English stop-word set; filled on first use so the NLTK corpus
# is read once instead of once per processed text.
_ENGLISH_STOP_WORDS = None


def _english_stop_words():
    """Return the NLTK English stop-word set, loading it only on first use."""
    global _ENGLISH_STOP_WORDS
    if _ENGLISH_STOP_WORDS is None:
        _ENGLISH_STOP_WORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOP_WORDS


def remove_stop_words(text):
    """Lower-case `text`, tokenize it, and drop English stop words.

    Args:
        text: The string to clean. It must support `.lower()`.

    Returns:
        The remaining tokens joined by single spaces.
    """
    stop_words = _english_stop_words()
    tokens = word_tokenize(text.lower())
    return ' '.join(token for token in tokens if token not in stop_words)

In [None]:
# Strip stop words from every training text, one element at a time.
cleaned = df_train["text"].map(remove_stop_words)

In [None]:
from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences

# Build the vocabulary from the cleaned training texts only, then encode those
# same texts as lists of integer word ids. The fitted `tokenizer` is reused
# later to encode the test texts.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(cleaned)
sequences = tokenizer.texts_to_sequences(cleaned)

In [None]:
# One more than the number of words in the tokenizer's vocabulary; used as the
# Embedding layer's input dimension.
vocab_size = len(tokenizer.word_index) + 1
# Measure the longest *encoded* sequence rather than counting whitespace words
# in `cleaned`: the Tokenizer applies its own filtering/splitting, so the two
# counts can differ, and padding operates on the encoded sequences.
max_length = max(len(seq) for seq in sequences)

In [None]:
# Bring every training sequence to exactly `max_length` ids so they can be fed
# to the model as one array. No `padding`/`truncating` arguments are passed, so
# the pad_sequences defaults apply (documented by Keras as 'pre' for both;
# confirm for the installed version).
# Note: this rebinds `sequences` from a list of id lists to the padded result.
sequences = pad_sequences(sequences, maxlen=max_length)

In [None]:
from keras.models import Sequential
from keras.layers import Dense, Flatten, Embedding
from keras.layers.convolutional import Conv1D, MaxPooling1D

# Binary text classifier: embedding -> 1-D convolution -> max pooling ->
# flatten -> small dense layer -> single sigmoid output.
model = Sequential([
    Embedding(vocab_size, 100, input_length=max_length),
    Conv1D(filters=32, kernel_size=8, activation='relu'),
    MaxPooling1D(pool_size=2),
    Flatten(),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid'),
])
model.compile(
    optimizer='adam',
    loss='binary_crossentropy',
    metrics=['accuracy'],
)
model.summary()

In [None]:
# Train for 5 epochs in batches of 20, holding out 20% of the training data
# for validation; the returned history feeds the accuracy plot.
hist = model.fit(
    x=sequences,
    y=df_train["label"],
    batch_size=20,
    epochs=5,
    validation_split=0.2,
)

In [None]:
# Per-epoch accuracy values recorded in the training history.
acc = hist.history['accuracy']
val = hist.history['val_accuracy']
epochs = range(1, len(acc) + 1)  # 1-based epoch numbers for the x-axis

plt.plot(epochs, acc, '-', label='Training accuracy')
plt.plot(epochs, val, ':', label='Validation accuracy')
plt.title('Training and Validation Accuracy')
plt.xlabel('Epoch')
plt.ylabel('Accuracy')
plt.legend(loc='lower right')
# Render the finished figure. An argument-less `plt.plot()` adds nothing to
# the axes and only leaves an empty-list repr as the cell output.
plt.show()

In [None]:
# Apply the same preprocessing as for training: strip stop words, encode with
# the tokenizer fitted on the training texts, and pad to `max_length`.
cleaned_test = df_test["text"].map(remove_stop_words)
sequence_test = pad_sequences(
    tokenizer.texts_to_sequences(cleaned_test),
    maxlen=max_length,
)
# Model outputs for every test row.
pred = model.predict(sequence_test)

In [None]:
# Spot-check: the model output for the second test row.
pred[1, 0]

In [None]:
# Turn the model outputs into hard labels: 0 when the output is below 0.5,
# otherwise 1. `pred` rows are in the same order as `df_test` rows, so the
# labels are assigned positionally; this does not depend on `df_test` having a
# default 0..n-1 index, and it avoids a slow row-by-row `apply`.
df_test["Category"] = np.where(pred[:, 0] < 0.5, 0, 1)

In [None]:
# Write only the Id and Category columns, without the index, to CNN.csv.
df_test.to_csv("CNN.csv", columns=["Id", "Category"], index=False)