<a href="https://colab.research.google.com/github/EthicalFlipper/MachineLearning/blob/main/Sentiment_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np

import string
from string import punctuation
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

import tensorflow as tf

from keras.layers import Input, Dense, Dropout
from keras.models import Sequential 

import sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer

import joblib

#Load the reviews, label each one by sentiment, and build a balanced sample with the same number of rows per class
SAMPLE_SEED = 42 #Fixed seed so every Restart & Run All draws the same reviews
SAMPLES_PER_CLASS = 8000 #Rows kept per sentiment class; 8000 is good because its not too large

data = pd.read_csv('Reviews.csv') #Read the data into the program
data = data.drop(['UserId', 'Id', 'Time'], axis = 1) #Drop unnecessary columns
data = data.dropna() #Drop rows (not columns) that contain a missing value
#Create a new column to keep track of if the review is positive (score above 3), neutral (score of exactly 3), or negative (score below 3)
data['Polarity_Rating'] = data['Score'].apply(lambda x: 'Positive' if x > 3 else('Neutral' if x == 3 else 'Negative'))
data_positive = data[data['Polarity_Rating'] == 'Positive'] #All positive reviews
data_neutral = data[data['Polarity_Rating'] == 'Neutral'] #All neutral reviews
data_negative = data[data['Polarity_Rating'] == 'Negative'] #All negative reviews

#Class sizes before sampling
print("Positive:", data_positive.shape)
print("Neutral:", data_neutral.shape)
print("Negative:", data_negative.shape)

#Take the same number of rows from every class so the training data is balanced; random_state makes the draw repeatable
data_positive = data_positive.sample(SAMPLES_PER_CLASS, random_state = SAMPLE_SEED)
data_neutral = data_neutral.sample(SAMPLES_PER_CLASS, random_state = SAMPLE_SEED)
data_negative = data_negative.sample(SAMPLES_PER_CLASS, random_state = SAMPLE_SEED)

#Class sizes after sampling
print("Positive:", data_positive.shape)
print("Neutral:", data_neutral.shape)
print("Negative:", data_negative.shape)

data = pd.concat([data_positive, data_negative, data_neutral]) #Combine the three samples into one data set
print(data.shape)

#data.head() #Print out the first few lines

def text_cleanup(text, stop_words = None): #Function that cleans up words like 'is', 'are', 'the' and also cleans up punctuation
  """Return `text` with punctuation characters and stop words removed.

  Parameters:
    text: the raw review string.
    stop_words: optional iterable of lowercase words to drop. When omitted,
      the NLTK English stop word list is used.

  Returns:
    The remaining words, in their original case and order, joined by single spaces.
  """
  if stop_words is None:
    stop_words = stopwords.words('english') #The stop words in english (is, the, are, etc)
  stop_words = set(stop_words) #Set membership is a constant-time lookup per word
  no_punc = text.translate(str.maketrans('', '', string.punctuation)) #Delete every punctuation character
  #Compare the lowercased word: `word.lower` without parentheses is a method object that is never in the list, so nothing would be filtered
  return ' '.join(word for word in no_punc.split() if word.lower() not in stop_words)
#Clean the review text, one-hot encode the sentiment label, and split into training and test sets
data['reviews'] = data['Text'].apply(text_cleanup) #Applies text_cleanup function
data = data[["reviews", "Polarity_Rating"]] #Reduce data to simply two columns: cleaned up review, and polarity rating.
one_hot = pd.get_dummies(data["Polarity_Rating"], dtype = int) #One column per rating; dtype = int keeps the values as 0/1 numbers on pandas versions whose default is boolean
data = pd.concat([data, one_hot], axis = 1) #Reviews column side by side with the one-hot columns
data = data.drop(["Polarity_Rating"], axis = 1) #The text label is no longer needed once it is one-hot encoded
x_rev = data["reviews"].values #This will be all of the reviews
y_pol = data.drop("reviews", axis = 1) #Everything except the reviews, i.e. the one-hot label columns
#Hold out 30% of the rows for testing; random_state makes the shuffled split repeatable between runs
x_rev_train, x_rev_test, y_pol_train, y_pol_test = train_test_split(x_rev, y_pol, test_size = 0.30, shuffle = True, random_state = 42)

data.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Positive: (443766, 8)
Neutral: (42638, 8)
Negative: (82007, 8)
Positive: (8000, 8)
Neutral: (8000, 8)
Negative: (8000, 8)
(24000, 8)


Unnamed: 0,reviews,Negative,Neutral,Positive
109995,This is truly rich and satisfying cup of coffe...,0,0,1
18150,Package with a reasonable size great condition...,0,0,1
295573,Disagree with previous review These rainbow co...,0,0,1
431807,I have a hard time finding gifts for relatives...,0,0,1
232103,I love that I am able to find such dessert fla...,0,0,1


In [None]:
#Vectorize - the process of converting text data into numerical data so that a neural network can perform calculations with it
#max_features caps the vocabulary at 15000 terms; float32 counts halve the memory of the dense arrays compared with the default int64
vect = CountVectorizer(max_features = 15000, dtype = np.float32)
#Learn the vocabulary from the training reviews only so nothing from the test reviews leaks into the features.
#transform() always produces one column per vocabulary term, so train and test inputs still have the same width.
vect.fit(x_rev_train)
vocab = vect.vocabulary_ #Save vocabulary as a variable
print("Vocabulary size:", len(vocab)) #Report the size rather than dumping the whole dictionary into the output
joblib.dump(vocab, "vocab.pkl") #Export vocabulary to use later

#Transform each split with the fitted vocabulary and convert the sparse result to a dense array
x_rev_train_v = vect.transform(x_rev_train).toarray()
x_rev_test_v = vect.transform(x_rev_test).toarray()
print(x_rev_train_v.shape)
print(x_rev_test_v.shape)

(16800, 15000)
(7200, 15000)


In [None]:
#Seed Python, NumPy and TensorFlow so weight initialisation, dropout and batch shuffling repeat between runs
#(some GPU kernels may still introduce small run-to-run differences)
tf.keras.utils.set_random_seed(42)

model = Sequential() #Make a sequential model
model.add(Input(shape = (x_rev_train_v.shape[1],))) #Input layer - declares one input value per vectorizer column so the model is built immediately

#Calculation Layers - do the bulk of the calculation
#Each Dense layer is followed by a Dropout layer to help prevent the network from relying too much on specific neurons
for units in (4000, 1000, 300, 100, 70):
  model.add(Dense(units = units, activation = "relu"))
  model.add(Dropout(0.5))

#Output Layer - one unit per one-hot label column, softmax turns the outputs into probabilities
model.add(Dense(units = 3, activation= "softmax"))

#Compile network
opt = tf.keras.optimizers.Adam(learning_rate = 0.001) #Variable to keep track of the optimizer
model.compile(loss = "categorical_crossentropy", optimizer = opt, metrics = ['accuracy'])
#Fit the model
#NOTE: validation_data is the same split that is evaluated below, so the per-epoch validation accuracy and the final test accuracy describe the same rows
model.fit(x = x_rev_train_v, y = y_pol_train, batch_size = 256, epochs = 10, validation_data = (x_rev_test_v, y_pol_test))

#Evaluate Network
scores = model.evaluate(x_rev_test_v, y_pol_test, verbose = 0) #scores[0] is the loss, scores[1] is the accuracy metric
print("Test accuracy:", scores[1])
#Save the network
model.save('sentiments.h5')

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test accuracy: 0.6940277814865112
