In [None]:
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
from google.colab import drive
import math
import requests


import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPool1D, Dense, Dropout, Flatten
from tensorflow.keras.callbacks import EarlyStopping, TensorBoard
import tensorflow_datasets as tfds 

from sklearn.model_selection import train_test_split

In [None]:
# Attach Google Drive to the Colab filesystem so the project folder under
# /content/drive becomes reachable from this runtime.
drive.mount(mountpoint='/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Project root on the mounted Drive.
path = '/content/drive/My Drive/Google_Colab/NYCFC_Merch_Sentiment_Analysis'

# Switch the working directory to the data subfolder so the CSV can be read
# by its bare file name in the next cell.
review_data_dir = os.path.join(path, 'Review_Sentiment_Data')
os.chdir(review_data_dir)

In [None]:
# Read the Reddit comments CSV from the current working directory, then
# preview the first five rows (columns: clean_comment, category).
data = pd.read_csv(filepath_or_buffer='Reddit_Data.csv')
data.head(n=5)

Unnamed: 0,clean_comment,category
0,family mormon have never tried explain them t...,1
1,buddhism has very much lot compatible with chr...,1
2,seriously don say thing first all they won get...,-1
3,what you have learned yours and only yours wha...,0
4,for your own benefit you may want read living ...,1


In [None]:
# Step back up to the parent of the current working directory now that the
# CSV has been loaded.
os.chdir(os.pardir)

In [None]:
# Summary statistics for every column; include='all' also reports the
# non-numeric column (count / unique / top / freq) alongside the numeric one.
data.describe(include='all')

Unnamed: 0,clean_comment,category
count,37149.0,37249.0
unique,36799.0,
top,,
freq,115.0,
mean,,0.202771
std,,0.778515
min,,-1.0
25%,,0.0
50%,,0.0
75%,,1.0


In [None]:
# Number of missing values in each column.
data.isna().sum()

clean_comment    100
category           0
dtype: int64

In [None]:
# Show every row that has a missing value in at least one column.
has_missing_value = data.isna().any(axis='columns')
data.loc[has_missing_value]

Unnamed: 0,clean_comment,category
413,,0
605,,0
2422,,0
2877,,0
3307,,0
...,...,...
35975,,0
36036,,0
37043,,0
37111,,0


In [None]:
# Remove, in place, every row that contains a missing value in any column.
data.dropna(axis='index', how='any', inplace=True)

In [None]:
# Discard the trailing rows of the cleaned frame.
# NOTE(review): presumably these rows are set aside as unseen data — confirm.
# CAUTION: this cell is not idempotent; re-running it removes a further
# N_TRAILING_ROWS_DROPPED rows each time.
N_TRAILING_ROWS_DROPPED = 500

# .copy() makes `data` an independent frame rather than a slice of the
# original, so later assignments into it do not act on a slice
# (SettingWithCopyWarning) and behave predictably.
data = data.iloc[:-N_TRAILING_ROWS_DROPPED, :].copy()

In [None]:
from tensorflow.keras.preprocessing import text

# Fit a word-level tokenizer on the comments. num_words=1000 restricts later
# vectorisation to the most frequent words; word_index itself still records
# every distinct word that was seen.
tokenizer = text.Tokenizer(num_words=1000)
tokenizer.fit_on_texts(data['clean_comment'])

# Report the vocabulary size and a short preview instead of printing the full
# word -> index dict, which floods the cell output.
print(f'{len(tokenizer.word_index)} distinct words in the fitted vocabulary')
print(list(tokenizer.word_index.items())[:20])



In [None]:
# Turn each comment into one fixed-width row with a column per vocabulary slot
# (num_words wide); 'binary' mode marks whether a word occurs in the comment.
# NOTE(review): this is a bag-of-words matrix, so word order is lost. The
# Embedding/Conv1D model built later treats each row as a sequence of word
# indices — texts_to_sequences may have been intended here; confirm.
tokenized_data = tokenizer.texts_to_matrix(data['clean_comment'], mode='binary')

In [None]:
# Sanity check: (number of comments, vocabulary width).
tokenized_data.shape

(36649, 1000)

In [None]:
# Recode the negative class from -1 to 2 so the labels are 0, 1, 2, as required
# by sparse categorical cross-entropy with a 3-unit softmax output.
# Only the `category` column is rewritten: boolean-mask assignment on the whole
# frame (data[mask] = 2) would overwrite every column of the matching rows,
# replacing their comment text with 2 as well.
# Building a new frame via assign/replace also keeps this cell idempotent.
data = data.assign(category=data['category'].replace(-1, 2))

In [None]:
# Class balance after recoding the labels.
data['category'].value_counts()

1    15621
0    12831
2     8197
Name: category, dtype: int64

In [None]:
# Number of labels; should match the number of rows in tokenized_data.
data['category'].shape

(36649,)

In [None]:
# Length of the longest row in tokenized_data; used as the padded width and,
# later, as the model's input length.
MAX_LEN = max(map(len, tokenized_data))

# Pad every row at the end ('post') up to MAX_LEN.
# NOTE(review): the rows of tokenized_data already share one width (its shape
# is (n_rows, 1000)), so no padding is actually added here; the call mainly
# converts the matrix to pad_sequences' default integer dtype.
input_data = tf.keras.preprocessing.sequence.pad_sequences(
    sequences=tokenized_data,
    maxlen=MAX_LEN,
    padding='post',
)

In [None]:
# Features and labels must have the same number of rows before splitting.
print(input_data.shape, data['category'].shape, sep='\n')

(36649, 1000)
(36649,)


In [None]:
# Hold out 30% of the rows for testing; the fixed random_state makes the
# split reproducible across runs.
labels = data['category']
X_train, X_test, y_train, y_test = train_test_split(
    input_data,
    labels,
    test_size=0.3,
    random_state=101,
)

In [None]:
# 1-D CNN text classifier: embedding -> three conv/pool stages with growing
# kernel sizes (2, 3, 4) -> dense head -> 3-way softmax.
model = Sequential([
    # 1000 matches the tokenizer's num_words; each index maps to a 32-d vector.
    Embedding(1000, 32, input_length=MAX_LEN),
    Dropout(0.2),

    # padding='same' keeps the sequence length; each default MaxPool1D halves it.
    Conv1D(64, 2, padding='same', activation='relu'),
    MaxPool1D(),

    Conv1D(64, 3, padding='same', activation='relu'),
    MaxPool1D(),

    Conv1D(64, 4, padding='same', activation='relu'),
    MaxPool1D(),

    Flatten(),

    Dense(256, activation='relu'),

    # One output unit per sentiment class (labels 0, 1, 2).
    Dense(3, activation='softmax'),
])

model.summary()

Model: "sequential_8"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_9 (Embedding)      (None, 1000, 32)          32000     
_________________________________________________________________
dropout_1 (Dropout)          (None, 1000, 32)          0         
_________________________________________________________________
conv1d_18 (Conv1D)           (None, 1000, 64)          4160      
_________________________________________________________________
max_pooling1d_6 (MaxPooling1 (None, 500, 64)           0         
_________________________________________________________________
conv1d_19 (Conv1D)           (None, 500, 64)           12352     
_________________________________________________________________
max_pooling1d_7 (MaxPooling1 (None, 250, 64)           0         
_________________________________________________________________
conv1d_20 (Conv1D)           (None, 250, 64)          

In [None]:
# Stop training once validation loss has failed to improve for 5 consecutive
# epochs. restore_best_weights=True rolls the model back to the best epoch;
# without it the model keeps the weights of the last epoch run, i.e. after
# `patience` epochs without improvement, and that is the model that gets saved.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Write training logs for TensorBoard under ./logs/ (relative to the cwd).
tensorboard = TensorBoard(log_dir='logs/')

# Labels are integer class ids (0, 1, 2), hence the sparse loss and metric.
# `learning_rate` is the supported argument name; `lr` is a deprecated alias
# that newer Keras releases reject.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='sparse_categorical_crossentropy',
    metrics=['sparse_categorical_accuracy'],
)

In [None]:
# Train for at most 10 epochs in mini-batches of 64; early stopping may end
# the run sooner.
# NOTE(review): the test split doubles as validation data here, so early
# stopping is tuned on the same rows that would be used for final evaluation.
model.fit(
    x=X_train,
    y=y_train,
    batch_size=64,
    epochs=10,
    callbacks=[early_stopping, tensorboard],
    validation_data=(X_test, y_test),
)

Epoch 1/10




Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10


<tensorflow.python.keras.callbacks.History at 0x7f7d604bdc18>

In [None]:
# Persist the trained model to the 'CNN_Text_Model' path, relative to the
# current working directory.
model.save(filepath='CNN_Text_Model')

Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


Instructions for updating:
This property should not be used in TensorFlow 2.0, as updates are applied automatically.


INFO:tensorflow:Assets written to: CNN_Text_Model/assets


INFO:tensorflow:Assets written to: CNN_Text_Model/assets
