In [12]:
import pandas as pd
from tensorflow import keras
import numpy as np
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
import re
import string
from keras import layers
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.preprocessing import sequence
from tensorflow.keras.optimizers import RMSprop
from collections import Counter
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\danii\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
# Load the labelled training set and the unlabelled test set from the working directory
train, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [6]:
# Preprocessing
def remove_URL(text):
    """Return `text` with URL-like substrings deleted.

    A match starts at ``http://``, ``https://`` or ``www.`` and runs up to
    (not including) the next whitespace character.
    """
    return re.sub(r"https?://\S+|www\.\S+", "", text)
def remove_punct(text):
    """Return `text` with every character of ``string.punctuation`` removed."""
    # A translate table that maps a code point to None deletes that character.
    drop_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_table)
# English stopwords from the NLTK corpus, kept as a set for fast membership tests
stop = {*stopwords.words("english")}
def remove_stopwords(text):
    """Lowercase each whitespace-separated word of `text` and drop stopwords.

    Words whose lowercase form is in the module-level `stop` collection are
    discarded; the survivors are returned joined by single spaces.
    """
    kept = []
    for token in map(str.lower, text.split()):
        if token in stop:
            continue
        kept.append(token)
    return " ".join(kept)

In [9]:
# Processing data
# Each frame's text goes through the same pipeline, in this order:
# strip URLs -> strip punctuation -> lowercase and drop stopwords.
train['text'] = (
    train['text']
    .map(remove_URL)
    .map(remove_punct)
    .map(remove_stopwords)
)
test['text'] = (
    test['text']
    .map(remove_URL)
    .map(remove_punct)
    .map(remove_stopwords)
)

In [10]:
def counter_word(text_col):
    """Tally how often each whitespace-separated word occurs in `text_col`.

    `text_col` must expose `.values` yielding strings (this notebook passes a
    DataFrame column). Returns a `collections.Counter` keyed by word.
    """
    return Counter(word for entry in text_col.values for word in entry.split())

# Vocabulary size = number of distinct words across the train and test text combined.
# NOTE(review): the tokenizer further down is fit on X_train only, so this is an
# upper bound on the ids it can produce rather than the exact vocabulary size.
df = pd.concat(objs=(train, test), ignore_index=True)
counter = counter_word(df['text'])
num_unique_words = len(counter)

In [16]:
# Hold out 33% of the labelled rows for validation (fixed seed for repeatability)
# and convert each of the four pieces to a NumPy array in one pass.
X_train, X_val, y_train, y_val = (
    part.to_numpy()
    for part in train_test_split(
        train['text'], train['target'], test_size=0.33, random_state=42
    )
)
X_test = test['text'].to_numpy()

In [19]:
max_len = 50  # width passed to pad_sequences for every split
# Build the word index from the training texts only
token = Tokenizer(num_words=num_unique_words)
token.fit_on_texts(X_train)
# Encode each split as sequences of integer word ids
X_train_seq, X_val_seq, X_test_seq = (
    token.texts_to_sequences(texts) for texts in (X_train, X_val, X_test)
)
# Bring every encoded split to the common width max_len
X_train_pad, X_val_pad, X_test_pad = (
    sequence.pad_sequences(encoded, maxlen=max_len)
    for encoded in (X_train_seq, X_val_seq, X_test_seq)
)

In [25]:
# Binary text classifier: embedding -> LSTM -> two dense layers with dropout
# -> single sigmoid unit.
model = Sequential([
    # One 32-dimensional vector per word id; inputs are max_len ids long.
    layers.Embedding(num_unique_words, 32, input_length=max_len),
    layers.LSTM(64),
    # NOTE(review): the summary shows (None, 64) both before and after this
    # layer, so Flatten is a no-op here; kept so layer names stay unchanged.
    layers.Flatten(),
    layers.Dense(250, activation='relu'),
    layers.Dropout(0.4),
    layers.Dense(120, activation='relu'),
    layers.Dropout(0.4),
    layers.Dense(1, activation='sigmoid')
])

# Keras documents `metrics` as a list; a bare string is not portable across
# Keras versions, so pass ['accuracy'] explicitly.
model.compile(loss='binary_crossentropy', optimizer='RMSprop', metrics=['accuracy'])
model.summary()

Model: "sequential_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 50, 32)            725536    
                                                                 
 lstm_2 (LSTM)               (None, 64)                24832     
                                                                 
 flatten_2 (Flatten)         (None, 64)                0         
                                                                 
 dense_6 (Dense)             (None, 250)               16250     
                                                                 
 dropout_4 (Dropout)         (None, 250)               0         
                                                                 
 dense_7 (Dense)             (None, 120)               30120     
                                                                 
 dropout_5 (Dropout)         (None, 120)              

In [26]:
# Train for 5 epochs in mini-batches of 128, with the held-out split passed
# as validation data.
model.fit(
    x=X_train_pad,
    y=y_train,
    batch_size=128,
    epochs=5,
    validation_data=(X_val_pad, y_val),
)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x21e4c6fe620>