# Sentiment Analysis

In [None]:
# Mount Google Drive into the Colab runtime so files stored there are reachable.
from google.colab import drive
drive.mount('/content/drive')
import os
# Switch the working directory to the project folder on Drive so that relative
# paths (such as the CSV file read when the dataset is loaded) resolve there.
# NOTE(review): this path is tied to one particular Drive layout — adjust it
# when running from a different account or outside Colab.
os.chdir('/content/drive/My Drive/CPCS 481')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import numpy as np
import pandas as pd
import nltk

## Import Dataset

In [None]:
# Load the review data; the path is relative to the current working directory.
dataset = pd.read_csv('Amazon_Food_Review.csv')
# Join the short summary and the full review body into a single text field.
# Missing values are replaced with '' first: concatenating a string with NaN
# yields NaN, which would throw away the text that *is* present in the row.
dataset['Combined_Text'] = dataset['Summary'].fillna('') + ' ' + dataset['Text'].fillna('')

In [None]:
# Preview the combined summary + review text column.
print(dataset['Combined_Text'])

0       Good Quality Dog Food I have bought several of...
1       Not as Advertised Product arrived labeled as J...
2       "Delight" says it all This is a confection tha...
3       Cough Medicine If you are looking for the secr...
4       Great taffy Great taffy at a great price.  The...
                              ...                        
9995    constipation we switched from the advance simi...
9996    Constipation Not A Problem if... Like the bad ...
9997    Love this formula! I wanted to solely breastfe...
9998    very convenient i love the fact that i can get...
9999    The best weve tried so far We have a 7 week ol...
Name: Combined_Text, Length: 10000, dtype: object


### Preprocessing

In [None]:
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords

In [None]:
# Fetch the NLTK resources this notebook relies on: the stopword lists and the
# WordNet data that backs the imported WordNetLemmatizer.
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

### Tokenize

In [None]:
# Tokens are maximal runs of word characters (\w+), so punctuation and
# whitespace act as separators and are dropped.
tokenizer = RegexpTokenizer(r'\w+')
# Stored as a set because the cleaning step tests membership once per token.
en_stopwords = set(stopwords.words('english'))

### Clean Data

In [None]:
def preprocess_text(text):
    """Normalize one review into a space-separated string of stemmed tokens.

    Steps: lowercase the text, tokenize it with the notebook-level
    ``tokenizer``, drop tokens that are not purely alphanumeric or that appear
    in ``en_stopwords``, then Porter-stem the remaining tokens.

    Parameters
    ----------
    text : object
        Raw review text. Any value that is not a ``str`` (for example the NaN
        pandas uses for a missing cell) is treated as empty.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces, or ``""`` when ``text``
        is not a string or no token survives filtering.
    """
    # Guard clause: non-string input yields an empty document.
    if not isinstance(text, str):
        return ""

    tokens = tokenizer.tokenize(text.lower())

    # isalnum() also rejects tokens containing "_" (which the \w+ pattern
    # admits); the stopword check removes common English function words.
    kept = [token for token in tokens if token.isalnum() and token not in en_stopwords]

    # NOTE: an earlier version also ran WordNetLemmatizer here, but it iterated
    # over the *characters* of each stemmed token and discarded the result, so
    # it cost time without affecting the output. It has been removed; the
    # returned text is the stemmed tokens, exactly as before.
    stemmer = PorterStemmer()
    return ' '.join(stemmer.stem(token) for token in kept)

In [None]:
# Clean the combined summary + review text (one pass over the single
# 'Combined_Text' column) and store the result in a new column.
dataset['Processed_Text'] = dataset['Combined_Text'].apply(preprocess_text)

In [None]:
# Preview the cleaned text produced by preprocess_text.
print(dataset['Processed_Text'])

0       good qualiti dog food bought sever vital can d...
1       advertis product arriv label jumbo salt peanut...
2       delight say confect around centuri light pillo...
3       cough medicin look secret ingredi robitussin b...
4       great taffi great taffi great price wide assor...
                              ...                        
9995    constip switch advanc similac organ product th...
9996    constip problem like bad review say organ form...
9997    love formula want sole breastfe unabl keep sup...
9998    conveni love fact get deliev hous delievi char...
9999    best weve tri far 7 week old ga constip proble...
Name: Processed_Text, Length: 10000, dtype: object


## Naive Bayes

In [None]:
# Map the star rating to a three-way label: above 3 is Positive, below 3 is
# Negative, and everything else (i.e. exactly 3) falls through to Neutral.
dataset['Sentiment'] = np.select(
    [dataset['Score'] > 3, dataset['Score'] < 3],
    ['Positive', 'Negative'],
    default='Neutral',
)

### TFIDF Vectorization

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Turn each cleaned review into a TF-IDF weighted bag-of-words vector.
tfidf_vectorizer = TfidfVectorizer()
# NOTE(review): the vectorizer is fit on every row here, before the train/test
# split is made, so the held-out reviews influence the learned vocabulary and
# IDF weights. Consider fitting on the training rows only.
# .toarray() densifies the result — presumably a large, mostly-zero matrix for
# this vocabulary; TODO confirm the memory cost is acceptable.
X = tfidf_vectorizer.fit_transform(dataset['Processed_Text']).toarray()
# Target labels as a plain array, in the same row order as X.
y = dataset['Sentiment'].values

### Training the Naive Bayes model on the Training set

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 25% of the rows for evaluation; the fixed random_state keeps the
# split the same from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [None]:
# Inspect the training feature matrix.
print(X_train)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [None]:
# Inspect the held-out feature matrix.
print(X_test)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]


In [None]:
# Inspect the training labels.
print(y_train)

['Negative' 'Positive' 'Positive' ... 'Positive' 'Positive' 'Negative']


In [None]:
# Inspect the held-out labels.
print(y_test)

['Positive' 'Positive' 'Positive' ... 'Positive' 'Positive' 'Positive']


In [None]:
from sklearn.naive_bayes import MultinomialNB
# Fit a multinomial Naive Bayes classifier, with its default settings, on the
# training split.
classifier = MultinomialNB()
classifier.fit(X_train, y_train)

### Predicting the Test set results

In [None]:
# Predict a sentiment label for every held-out review.
y_pred = classifier.predict(X_test)
# Show the predictions (left column) beside the true labels (right column).
print(np.column_stack((y_pred, y_test)))

[['Positive' 'Positive']
 ['Positive' 'Positive']
 ['Positive' 'Positive']
 ...
 ['Positive' 'Positive']
 ['Positive' 'Positive']
 ['Positive' 'Positive']]


## Making the Confusion Matrix

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score
# Compare the true labels (first argument) with the predictions (second).
# NOTE(review): no explicit `labels=` order is passed — presumably the classes
# appear in sorted order (Negative, Neutral, Positive); confirm before reading
# individual rows/columns of the matrix.
cm = confusion_matrix(y_test, y_pred)
print(cm)
# Left as the cell's final expression so the notebook displays the accuracy.
accuracy_score(y_test, y_pred)

[[   0    0  386]
 [   1    0  229]
 [   0    0 1884]]


0.7536