# SENTIMENT ANALYSIS PROJECT

In [None]:
# Ask for a file upload into the Colab runtime (google.colab is Colab-only).
# NOTE(review): presumably used to upload review50k.csv, which is read from
# /content further down — confirm.
from google.colab import files
files.upload()

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns


## Load Dataset

In [4]:
# Read the reviews CSV from the Colab working directory and display the frame.
# NOTE(review): the recorded output of this cell is a FileNotFoundError, so the
# file has to exist at this path before the cell is run.
df1 = pd.read_csv('/content/review50k.csv')
df1

FileNotFoundError: ignored

In [None]:
# Count the missing values in each column.
df1.isnull().sum()

In [None]:
# Collapse the 'Score' values 1-5 into three classes:
# 1 -> 1, 2 and 3 -> 2, 4 and 5 -> 3.
label_mapping = {1: 1, 2: 2, 3: 2, 4: 3, 5: 3}

# Store the class in a new 'Scores' column; values that are not keys of the
# mapping become NaN.
df1['Scores'] = df1['Score'].map(label_mapping)

In [None]:
# Display the derived class column.
df1['Scores']

In [None]:
# Number of rows per class.
df1.Scores.value_counts()

### Balance the classes equally using undersampling

In [None]:
# value_counts() orders its result by frequency, not by label, so each class
# is looked up by its label (1, 2, 3) instead of being unpacked positionally.
# A class that is absent counts as 0.
class_counts = df1.Scores.value_counts()
negative_val = class_counts.get(1, 0)
neutral_val = class_counts.get(2, 0)
positive_val = class_counts.get(3, 0)
print(negative_val)
print(neutral_val)
print(positive_val)
# Undersampling target: the number of class-1 rows.
# NOTE(review): this assumes class 1 is the smallest class — confirm.
min_samples = df1[df1['Scores'] == 1].shape[0]
min_samples

In [None]:
# Keep every class-1 row as is.
dfNeg = df1.loc[df1['Scores'] == 1]

# Draw `min_samples` rows from class 2 and from class 3, each with the same
# fixed seed so the selection is reproducible.
dfNeu, dfPos = (
    df1.loc[df1['Scores'] == label].sample(n=min_samples, random_state=333)
    for label in (2, 3)
)

In [None]:
# Print the (rows, columns) shape of the class-2, class-1 and class-3 frames,
# in that order.
for class_frame in (dfNeu, dfNeg, dfPos):
    print(class_frame.shape)

In [None]:
# NOTE(review): `index` is not used anywhere in the visible notebook; this
# import looks accidental.
from operator import index
# Stack the three per-class frames row-wise into a single frame.
df_balance = pd.concat([dfNeg,dfNeu,dfPos],axis=0)


In [None]:
# Replace the row labels carried over from df1 with a fresh default index,
# discarding the old labels instead of keeping them as a column.
df_balance.reset_index(drop=True,inplace=True)

In [None]:
# Preview the first two rows of the combined frame.
df_balance.head(2)

In [None]:
# Remove the 'Unnamed: 0' column and carry on with the result under the
# name df1.
df1 = df_balance.drop(columns='Unnamed: 0')

In [None]:
# Preview the first rows of the frame used from here on.
df1.head()

In [None]:
# Show the raw review text stored under row label 3.
df1.Text[3]

In [None]:
# Bar chart of the number of rows per class. The frame is passed by keyword:
# a positional first argument is only interpreted as `data` by newer seaborn
# signatures.
sns.countplot(data=df1, x='Scores')

### Importing Dependencies

In [None]:
# Use matplotlib's 'ggplot' style sheet for the plots that follow.
plt.style.use('ggplot')
import nltk
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import BernoulliNB,MultinomialNB
from sklearn.metrics import accuracy_score,precision_score,recall_score,confusion_matrix

In [None]:
from nltk.stem import WordNetLemmatizer
# Download the NLTK 'wordnet' resource.
# NOTE(review): presumably required by WordNetLemmatizer — confirm.
nltk.download('wordnet')
from wordcloud import WordCloud
from sklearn.model_selection import GridSearchCV

## Text Preprocessing

### List the English stop words

In [None]:
# Download the NLTK 'stopwords' resource and print the English list.
# NOTE(review): Lemmatization below calls stopwords.words('english') itself
# and does not read `stopwords_en`.
nltk.download('stopwords')
stopwords_en = stopwords.words('english')
print('Stop Words: ',stopwords_en)

### stemming or lemmatizing

In [None]:
# Create the stemmer and the lemmatizer.
# `lm` is the one used by Lemmatization; `port_stem` only appears in a
# commented-out line there.
port_stem = PorterStemmer()
lm = WordNetLemmatizer()

In [None]:
def Lemmatization(content):
    """Normalize a review into a space-joined string of lemmatized words.

    Every character that is not an ASCII letter is replaced by a space, the
    text is lower-cased and split on whitespace, English stop words are
    removed, and each remaining word is passed through the module-level
    lemmatizer ``lm``.

    Args:
        content: Raw review text (a string).

    Returns:
        The cleaned words joined by single spaces; an empty string when no
        word is left.
    """
    # 'A-Z', not 'A-z': the wider range also lets [ \ ] ^ _ ` through.
    letters_only = re.sub(r'[^a-zA-Z]', ' ', content)
    words = letters_only.lower().split()
    # Build the stop-word set once per call instead of once per word.
    stop_words = set(stopwords.words('english'))
    lemma_words = [lm.lemmatize(word) for word in words if word not in stop_words]
    return ' '.join(lemma_words)

In [None]:
# Clean every review text and store the result under 'HelpfulnessNumerator'.
# NOTE(review): if the CSV already has a column with this name, its values are
# overwritten here; the following cell renames the column to 'Lemmatized_Text'.
df1['HelpfulnessNumerator'] = df1['Text'].apply(Lemmatization)

In [None]:
# Give the column holding the cleaned text its final name, in place.
df1.rename(columns={'HelpfulnessNumerator':'Lemmatized_Text'},inplace=True)

In [None]:
# Raw review text under row label 2, for comparison with its cleaned form.
df1['Text'][2]

In [None]:
# Cleaned text at position 2 of the column's value array.
df1['Lemmatized_Text'].values[2]

In [None]:
# Array of all cleaned review strings.
corpus = df1['Lemmatized_Text'].values

In [None]:
# Third entry of the corpus.
corpus[2]

In [None]:
# Keep only the documents that contain none of the listed strings. `in` on a
# str is a substring test, so "br" also matches inside longer words.
# NOTE(review): both names are reassigned by the next cell before they are
# read, so this cell has no lasting effect.
phrases_to_drop = ["br", "food"]

filtered_corpus = [doc for doc in corpus if not any(phrase in doc for phrase in phrases_to_drop)]


In [None]:
# Noise words to leave out of the word cloud.
phrases_to_drop = ["br","one", "amazon","product","even",'cup','bag','dog','used']
# Remove the noise words themselves, matching whole tokens. A substring test
# on the document would also hit "br" inside "bread" or "one" inside "honey",
# and would throw away the entire review rather than the unwanted word.
words_to_drop = set(phrases_to_drop)
filtered_corpus = [
    ' '.join(word for word in doc.split() if word not in words_to_drop)
    for doc in corpus
]

In [None]:
# Join the documents with a space so the last word of one review is not fused
# with the first word of the next. str.join also builds the text in one pass
# instead of growing a string character by character.
word_cloud = " ".join(filtered_corpus)
# Render the word cloud of the combined text on a white 1000x500 canvas.
wordcloud = WordCloud(width = 1000, height = 500,background_color ='white',min_font_size = 10).generate(word_cloud)
plt.figure(figsize=(20, 8))
plt.imshow(wordcloud)

In [None]:
# Model inputs (cleaned review strings) and targets (class labels) as arrays.
x = df1['Lemmatized_Text'].values
y = df1['Scores'].values
print(x.shape)
print(y.shape)

### splitting dataset into Train & Test

In [None]:
# Hold out 20% of the rows for testing; stratify=y keeps the class proportions
# of y in both splits, and random_state fixes the shuffle.
xtrain, xtest, ytrain, ytest = train_test_split(x,y, test_size=0.2,stratify=y,random_state=2)
print(xtrain.shape, xtest.shape)
print(ytrain.shape, ytest.shape)

### vectorization

In [None]:
# Fit the TF-IDF vectorizer on the training texts only, then apply the same
# fitted vectorizer to the test texts. From here on xtrain/xtest hold the
# TF-IDF matrices rather than the text arrays.
vectorizer = TfidfVectorizer()
xtrain = vectorizer.fit_transform(xtrain)
xtest = vectorizer.transform(xtest)

## ML Model Training


In [None]:
# Fit a logistic-regression classifier (default constructor arguments) on the
# training features. `log_reg` is also the model used by sentiment_prediction.
log_reg = LogisticRegression()
log_reg.fit(xtrain,ytrain)

In [None]:
# Predict on both splits with the logistic-regression model.
xtrain_pred = log_reg.predict(xtrain)
xtest_pred = log_reg.predict(xtest)

# Share of correctly classified rows on the training and the test split.
train_score, test_score = (
    accuracy_score(truth, predicted)
    for truth, predicted in ((ytrain, xtrain_pred), (ytest, xtest_pred))
)
print("Acccuracy training: ",train_score)
print('Acccuracy testing: ',test_score)

In [None]:
# Fit a multinomial naive Bayes classifier on the training features.
nb = MultinomialNB()
nb.fit(xtrain,ytrain)

In [None]:
# Predict on both splits with the naive Bayes model.
xtrain_pred = nb.predict(xtrain)
xtest_pred = nb.predict(xtest)

# Micro-averaged precision over all classes. pos_label is not passed: it only
# applies to average='binary', and 'positive' is not one of the integer
# class labels used here.
train_score = precision_score(ytrain, xtrain_pred, average='micro')
test_score = precision_score(ytest, xtest_pred, average='micro')
print("Training quality: ",train_score)
print('Testing quality: ',test_score)

In [None]:
# The only SVC import in the visible notebook is inside a commented-out cell,
# so bring the class into scope here.
from sklearn.svm import SVC

# Fit a support-vector classifier with an RBF kernel on the training features.
svc = SVC(kernel='rbf')
svc.fit(xtrain,ytrain)

In [None]:
# Predict on both splits with the support-vector classifier.
xtrain_pred = svc.predict(xtrain)
xtest_pred = svc.predict(xtest)

# Micro-averaged precision over all classes. pos_label is not passed: it only
# applies to average='binary', and 'positive' is not one of the integer
# class labels used here.
train_score = precision_score(ytrain, xtrain_pred, average='micro')
test_score = precision_score(ytest, xtest_pred, average='micro')
print("Training quality: ",train_score)
print('Testing quality: ',test_score)

In [None]:
# Show the second row of the test feature matrix as a dense array.
xtest[1].toarray()

In [None]:
def expression(predict_input):
    """Print the sentiment name that belongs to a predicted class label.

    Labels 1, 2 and 3 print the negative, neutral and positive message
    respectively; any other value prints 'Wrong Sentiment'.

    Args:
        predict_input: Predicted class label, compared with ``==``.
    """
    labelled_messages = (
        (1, 'Negative sentiment'),
        (2, 'Neutral sentiment'),
        (3, 'Positive sentiment'),
    )
    # Compare against the labels in order and stop at the first match.
    for label, message in labelled_messages:
        if predict_input == label:
            print(message)
            return
    print('Wrong Sentiment')
def sentiment_prediction(input_text):
    """Print the predicted sentiment for one text or for each text in a list.

    Each text is cleaned with ``Lemmatization``, vectorized with the fitted
    ``vectorizer``, classified with ``log_reg`` and reported through
    ``expression``.

    Args:
        input_text: A single string, or a list of strings. An empty list
            prints nothing.
    """
    # Treat a lone text as a one-element batch so both cases share one path.
    texts = input_text if isinstance(input_text, list) else [input_text]
    for text in texts:
        processed_text = Lemmatization(text)
        # transform() takes an iterable of documents, hence the list.
        transformed_input = vectorizer.transform([processed_text])
        prediction = log_reg.predict(transformed_input)
        # One document in, one label out: pass the label itself rather than
        # the array that wraps it.
        expression(prediction[0])


In [None]:
# Three sample reviews, each wrapped in a one-element list.
txt1 = ["This oatmeal is fine. But the previous one was much better"]
txt2 = ["Worst coffee ever drink was starbucks "]
txt3 = ["That tea is so tasty"]

# Print the predicted sentiment of each sample in turn.
for sample in (txt1, txt2, txt3):
    sentiment_prediction(sample)

In [None]:
# from sklearn.svm import SVC

# params= {
#     'kernel':['linear','rbf','poly'],
#     'C':[0.1,1,10],
#     'gamma':[0.1,1,'auto']
# }

# grid_svc = GridSearchCV(SVC(),params,cv=5,)

# grid_svc.fit(xtrain,ytrain)
\

\
# xtrain_pred = grid_svc.predict(xtrain)
# xtest_pred = grid_svc.predict(xtest)

# train_score = accuracy_score(ytrain,xtrain_pred)
# test_score = accuracy_score(ytest,xtest_pred)
# best_params = grid_svc.best_params_
# print('Best params of SVC: ',best_params)
# print("Acccuracy training: ",train_score)
# print('Acccuracy testing: ',test_score)
