In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.layers import Conv1D, Bidirectional, LSTM, Dense, Input, Dropout
import numpy as np
from tensorflow.keras.layers import SpatialDropout1D
from tensorflow.keras.callbacks import ModelCheckpoint
from collections import Counter

from tensorflow.keras import regularizers
from tensorflow.keras import layers
from tensorflow.keras import losses

In [2]:
df = pd.read_csv("cleaned_text_data.csv")
df.head()

Unnamed: 0,sentiment,message
0,Negative,upset update Facebook texting might cry result...
1,Negative,Kenichan I dived many time ball Managed save 5...
2,Negative,whole body feel itchy like fire
3,Negative,nationwideclass behaving mad I see
4,Negative,Kwesidei whole crew


In [3]:
df.shape

(1599999, 2)

In [4]:
smaller_used_data, rest_data = train_test_split(df, train_size = 0.4, stratify=df["sentiment"], random_state=42)

In [5]:
smaller_used_data.shape

(639999, 2)

In [6]:
df = smaller_used_data

In [7]:
df.shape

(639999, 2)

In [8]:
train_data,test_data = train_test_split(df,test_size=0.2,random_state=42)

In [9]:
train_data.shape

(511999, 2)

In [10]:
train_data = pd.DataFrame(train_data)
test_data = pd.DataFrame(test_data)
train_data['message'] = train_data['message'].astype(str)
train_data['sentiment'] = train_data['sentiment'].astype(str)
test_data['message'] = test_data['message'].astype(str)
test_data['sentiment'] = test_data['sentiment'].astype(str)

In [11]:
max_words = 20000
oov_token = "unknown"

tokenizer = Tokenizer(num_words=max_words, oov_token=oov_token)
tokenizer.fit_on_texts(train_data['message'])

tokenizer_json = tokenizer.to_json()
with open('tokenizer.json', 'w') as json_file:
    json_file.write(tokenizer_json)

In [12]:
print(str(tokenizer.texts_to_sequences(['There seems to be something wrong with the game today'])))

[[364, 410, 305, 776, 133, 349, 746, 47, 176, 11]]


In [13]:
X_train, X_valid, y_train, y_valid= train_test_split(train_data['message'].tolist(),
                                                      train_data['sentiment'].tolist(),
                                                      test_size=0.1,
                                                      stratify = train_data['sentiment'].tolist(),
                                                      random_state=8)

#further divided traindata into train data and validation data.

In [14]:
print('Train data len:'+str(len(X_train)))
print('Class distribution'+str(Counter(y_train)))
print('Valid data len:'+str(len(X_valid)))
print('Class distribution'+ str(Counter(y_valid)))

Train data len:460799
Class distributionCounter({'Positive': 230574, 'Negative': 230225})
Valid data len:51200
Class distributionCounter({'Positive': 25619, 'Negative': 25581})


In [15]:
train_sequences = tokenizer.texts_to_sequences(X_train)
valid_sequences = tokenizer.texts_to_sequences(X_valid)
test_sequences = tokenizer.texts_to_sequences(test_data['message'].tolist())

X_train = pad_sequences(train_sequences, padding='post', maxlen=50)
x_valid = pad_sequences(valid_sequences, padding='post', maxlen=50)
x_test = pad_sequences(test_sequences, padding='post', maxlen=50)



X_train = np.array(X_train)
x_valid = np.array(x_valid)
x_test = np.array(x_test)
print(X_train.shape)
print(x_valid.shape)
print(x_test.shape)

(460799, 50)
(51200, 50)
(128000, 50)


In [16]:
le = LabelEncoder()

train_labels = le.fit_transform(y_train)
train_labels = np.asarray( tf.keras.utils.to_categorical(train_labels))

valid_labels = le.transform(y_valid)
valid_labels = np.asarray( tf.keras.utils.to_categorical(valid_labels))

test_labels = le.transform(test_data['sentiment'].tolist())
test_labels = np.asarray(tf.keras.utils.to_categorical(test_labels))



train_ds = tf.data.Dataset.from_tensor_slices((X_train,train_labels))
valid_ds = tf.data.Dataset.from_tensor_slices((x_valid,valid_labels))
test_ds = tf.data.Dataset.from_tensor_slices((x_test,test_labels))

In [17]:
X_train[0]

array([5676,  439,    2,  180, 3448,  452, 2514, 1166,  280, 3751,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0])

In [18]:
number_of_classes = 2  

max_features = 20000
embedding_dim = 64
sequence_length = 50

model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(max_features + 1, embedding_dim, input_length=sequence_length,
                                    embeddings_regularizer=regularizers.l2(0.0005)))

model.add(tf.keras.layers.Conv1D(128, 3, activation='relu',
                                 kernel_regularizer=regularizers.l2(0.0005),
                                 bias_regularizer=regularizers.l2(0.0005)))

model.add(tf.keras.layers.GlobalMaxPooling1D())

model.add(tf.keras.layers.Dropout(0.5))


model.add(tf.keras.layers.Dense(number_of_classes, activation='sigmoid',
                                kernel_regularizer=regularizers.l2(0.001),
                                bias_regularizer=regularizers.l2(0.001)))

model.summary()


model.compile(loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False), 
              optimizer='Nadam', 
              metrics=["CategoricalAccuracy"])

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 64)            1280064   
                                                                 
 conv1d (Conv1D)             (None, 48, 128)           24704     
                                                                 
 global_max_pooling1d (Globa  (None, 128)              0         
 lMaxPooling1D)                                                  
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense (Dense)               (None, 2)                 258       
                                                                 
Total params: 1,305,026
Trainable params: 1,305,026
Non-trainable params: 0
______________________________________________

In [19]:
epochs = 20
history = model.fit(train_ds.shuffle(2000).batch(128),
                    epochs= epochs ,
                    validation_data=valid_ds.batch(128),
                    verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [20]:
model.save('sentimentmodel.keras')  

In [21]:
model.save('sentimentmodel.h5') 

In [22]:
import json 
max_features=20000
tokenizer = Tokenizer(num_words=max_features)  
json_string = tokenizer.to_json()
with open('tokenizer.json', 'w') as outfile:
    outfile.write(json_string)

In [23]:
tokenizer

<keras.preprocessing.text.Tokenizer at 0x23e0dd30988>