# Import packages

In [1]:
import numpy as np
import math
import re
import pandas as pd
from bs4 import BeautifulSoup
#from google.colab import drive
import pickle
import lxml

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [2]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer




# Load files

In [3]:
cols =['sentiment','id','date','query','user','text']

data = pd.read_csv(
    "NLP-Project-Sentiment/training.1600000.processed.noemoticon.csv",
    header=None,
    names=cols,
    engine='python',
    encoding='latin1'
)

In [4]:
data.head(3)

Unnamed: 0,sentiment,id,date,query,user,text
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...


# Pre process data

In [5]:
data.drop(['id','date','query','user'],axis=1,inplace=True)

In [6]:
data.head()

Unnamed: 0,sentiment,text
0,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,is upset that he can't update his Facebook by ...
2,0,@Kenichan I dived many times for the ball. Man...
3,0,my whole body feels itchy and like its on fire
4,0,"@nationwideclass no, it's not behaving at all...."


In [7]:
def cleanText(text):
  text = BeautifulSoup(text,'html.parser').get_text()
  text = re.sub(r'@[A-za-z0-9]+'," ",text)
  text = re.sub(r'https?://[A-za-z0-9./]+',' ',text)
  text = re.sub(r'[^a-zA-Z.!?]',' ',text)
  text = re.sub(r' +'," ",text)
  return text

## Clean the text

In [8]:
data_clean = [cleanText(text) for text in data.text]

  text = BeautifulSoup(text,'html.parser').get_text()


In [9]:
data_clean[:5]

[' Awww that s a bummer. You shoulda got David Carr of Third Day to do it. D',
 'is upset that he can t update his Facebook by texting it... and might cry as a result School today also. Blah!',
 ' I dived many times for the ball. Managed to save The rest go out of bounds',
 'my whole body feels itchy and like its on fire ',
 ' no it s not behaving at all. i m mad. why am i here? because I can t see you all over there. ']

## Clean Labels
0 is Positive

1 is Negative

In [10]:
data_labels = data.sentiment.values
data_labels[data_labels==4] = 1
set(data_labels)

{0, 1}

## Tokenization

In [11]:
# Load the tokenizer from a file
with open('NLP-Project-Sentiment/tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [None]:
tokenizer = Tokenizer(num_words=2**16, oov_token='<OOV>')
tokenizer.fit_on_texts(data_clean)
# Save the tokenizer to a file
with open('NLP-Project-Sentiment/tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [12]:
sequences = tokenizer.texts_to_sequences(data_clean)
data_inputs =sequences

## Padding
Add 0 for all sentences to match maximum length of each sentence after it was tokenized

In [13]:
MAX_LENGTH = max([len(sentence) for sentence in data_inputs])
data_inputs= tf.keras.preprocessing.sequence.pad_sequences(
    data_inputs,
    value=0,
    padding="post",
    maxlen=MAX_LENGTH
)

In [14]:
print(MAX_LENGTH)
data_inputs[:5]

52


array([[  445,    17,    13,     5,  1174,     9,  3429,    50,   836,
         9439,    15,  1864,    34,     3,    43,     7,   140,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0],
       [   10,   781,    17,    85,    31,    14,   544,   184,   532,
          124,  1992,     7,     8,   290,   528,    84,     5,  2301,
          144,    42,   266,  1117,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0],
       [    2, 51831,   314,   351,    12,     4,  1228,  1659,     3,
          896,     4,   468,    41,    36,    15, 22589,     0,     0,
            0,     0,     0,    

## Split Data into Train/Test

In [15]:
test_idx = np.random.randint(0,800000,8000)
test_idx = np.concatenate((test_idx,test_idx+800000))

In [16]:
test_inputs = data_inputs[test_idx]
test_labels = data_labels[test_idx]

In [17]:
train_inputs = np.delete(data_inputs,test_idx,axis=0)
train_labels = np.delete(data_labels,test_idx)

# Building the model

In [18]:
class DeepCNN(tf.keras.Model):
  def __init__(self,
               vocab_size,
               emb_dim=128,
               nb_filters=50,
               FFN_units=512,
               nb_classes=2,
               dropout_rate=0.1,
               training=False,
               name="DeepCNN",
               ):
    super(DeepCNN,self).__init__(name=name)
    self.embedding= layers.Embedding(vocab_size,
                                      emb_dim)
    self.bigram = layers.Conv1D(filters=nb_filters,
                                kernel_size=2,
                                padding="valid",
                                activation="relu")
    self.pool_1= layers.GlobalMaxPool1D()
    self.trigram = layers.Conv1D(filters=nb_filters,
                                kernel_size=3,
                                padding="valid",
                                activation="relu")
    self.pool_2= layers.GlobalMaxPool1D()
    self.quadgram = layers.Conv1D(filters=nb_filters,
                                kernel_size=4,
                                padding="valid",
                                activation="relu")
    self.pool_3= layers.GlobalMaxPool1D()
    self.dense_1 = layers.Dense(units=FFN_units, activation="relu")
    self.dropout = layers.Dropout(rate = dropout_rate)
    if nb_classes == 2:
      self.last_dense = layers.Dense(units=1,activation="sigmoid")
    else:
      # You will need to get the maximum probablity later
      self.last_dense = layers.Dense(units=nb_classes,activation="softmax")

  def call(self,inputs,training):
    x = self.embedding(inputs)
    x_1 = self.bigram(x)
    x_1 = self.pool_1(x_1)
    x_2 = self.trigram(x)
    x_2 = self.pool_2(x_2)
    x_3 = self.quadgram(x)
    x_3 = self.pool_3(x_3)

    merged = tf.concat([x_1,x_2,x_3],axis=-1) # something like this (batchsize, 3*nb_filters)
    merged = self.dense_1(merged)
    merged = self.dropout(merged,training)
    output = self.last_dense(merged)
    return output



# Where the magic happens

## Config

In [19]:
VOCAB_SIZE = len(tokenizer.word_index)

EMB_DIM = 200
NB_FILTERS = 100
FFN_UNITS = 256
NB_CLASSES= len(set(train_labels))

DROUPOUT_RATE = 0.2

BATCH_SIZE = 32
NB_EPOCHS = 1

# Training

In [20]:
deepCNN = DeepCNN(vocab_size=VOCAB_SIZE,
                  emb_dim=EMB_DIM,
                  nb_filters=NB_FILTERS,
                  FFN_units=FFN_UNITS,
                  nb_classes=NB_CLASSES,
                  dropout_rate=DROUPOUT_RATE,
                  )



In [21]:
if NB_CLASSES ==2:
  deepCNN.compile(loss="binary_crossentropy",
                  optimizer="adam",
                  metrics=["accuracy"]
                  )
else:
  deepCNN.compiler(loss="sparse_categorical_crossentropy",
                   optimizer="adam",
                   metrics=["sparse_categorical_accuracy"]
                   )



In [22]:
checkpoint_path = "NLP-Project-Sentiment"
ckpt = tf.train.Checkpoint(deepCNN=deepCNN)
ckpt_manager=tf.train.CheckpointManager(ckpt,checkpoint_path,max_to_keep=1)

if(ckpt_manager.latest_checkpoint):
  ckpt.restore(ckpt_manager.latest_checkpoint)
  print("Latest Checkpoint restored")

Latest Checkpoint restored


In [23]:
print(tf.__version__)
print("Num GPUs Available: ", tf.config.experimental.list_physical_devices())

2.15.0
Num GPUs Available:  [PhysicalDevice(name='/physical_device:CPU:0', device_type='CPU')]


In [24]:
deepCNN.fit(train_inputs,train_labels,batch_size=BATCH_SIZE,epochs=NB_EPOCHS)
ckpt_manager.save()

  113/49503 [..............................] - ETA: 1:44:34 - loss: 0.1284 - accuracy: 0.9483

KeyboardInterrupt: 

## Evaluation

In [25]:
results = deepCNN.evaluate(test_inputs,test_labels,batch_size=BATCH_SIZE)
print(results)

[0.11270835995674133, 0.9540625214576721]


# User Input

In [31]:
def processInput(text):
    # Convert the input text to sequence
    sequence = tokenizer.texts_to_sequences([text])
    padded_sequence = tf.keras.preprocessing.sequence.pad_sequences(sequence, maxlen=MAX_LENGTH, padding='post', truncating='post')
    print(padded_sequence)
    # Predict using the sequence
    prediction = deepCNN(np.array(padded_sequence ), training=False)
    # Convert model's output to a user-friendly result
    if prediction >= 0.5:
        return f"The result is Positive, {prediction} confidence"
    else:
        return f"The result is Negative, {1-prediction} confidence"

In [34]:
# This cell will be run to take user input and display the result

# Take user input|
user_text = input("Please enter your text: ")

# Process the input and display the result
result = processInput(user_text)
print(result)

[[146   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0]]
The result is Negative, [[0.8493748]] confidence


In [28]:
import tensorflow as tf

# Assume deepCNN is your model and it's correctly defined and restored from checkpoint
model_dir = 'sentiment_analysis_model'  # Directory to save the exported model

# Save the model in SavedModel format
tf.saved_model.save(ckpt.deepCNN, model_dir)

INFO:tensorflow:Assets written to: sentiment_analysis_model\assets


INFO:tensorflow:Assets written to: sentiment_analysis_model\assets
