# Customer Product Review: Sentiment Analysis
-----
-----

In [None]:
!pip install scikit-plot

In [None]:
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from scipy.sparse import hstack
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import ConfusionMatrixDisplay, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC


In [None]:
from wordcloud import WordCloud, STOPWORDS
import seaborn as sns


In [None]:
import scikitplot as skplt


In [None]:
import pandas as pd

# Location of the raw review export, relative to the notebook's working directory.
REVIEWS_CSV = "245_1_part3.csv"
dataset = pd.read_csv(REVIEWS_CSV)

In [None]:
# Preview the first rows of the raw data to see which columns are available.
dataset.head()

In [None]:
# (number of rows, number of columns) of the raw data.
dataset.shape

In [None]:
# Missing-value count per column, across all columns of the raw data.
dataset.isnull().sum()

## Data Cleaning:
------

### Selecting Required Columns:

In [None]:
# Keep only the columns used by the rest of the analysis.
required_columns = [
    'brand',
    'manufacturer',
    'reviews.didPurchase',
    'reviews.rating',
    'reviews.text',
]
dataset = dataset[required_columns]

### Checking Null values:

In [None]:
# Missing-value count per column, now restricted to the selected columns.
dataset.isnull().sum()

### Imputing null values in 'reviews.didPurchase' & removing all remaining null values:

In [None]:
# A missing purchase flag becomes its own category instead of being dropped.
purchase_flag = dataset['reviews.didPurchase']
dataset['reviews.didPurchase'] = purchase_flag.fillna(value='Not Avialable')

In [None]:
# Drop every row that still has a missing value in any of the kept columns.
dataset = dataset.dropna(axis=0, how='any')

## Visualizing The Result:
----

### Review Ratings Distribution:
-----

In [None]:
# Number of reviews per star rating.
rating_column = dataset['reviews.rating']
data = rating_column.value_counts()

In [None]:
# Bar chart of review counts per rating. Axis labels and a title are set so the
# figure can be read on its own when the notebook is skimmed.
ax_ratings = sns.barplot(x=data.index, y=data.values)
ax_ratings.set_xlabel('Review rating', fontsize=12)
ax_ratings.set_ylabel('No. of Reviews', fontsize=12)
ax_ratings.set_title('Review Ratings Distribution', fontsize=12)
plt.show()

**Ratings need to be mapped to sentiment labels: 1 and 2 -> Unhappy, 3 -> Ok, 4 and 5 -> Happy.**

### Fake Reviews Provided by User:
-----

In [None]:
# Count of reviews per purchase-flag value.
# The column is passed as `x=` explicitly: in current seaborn releases the first
# positional parameter of countplot is `data`, not `x`.
ax_plt = sns.countplot(x=dataset['reviews.didPurchase'])
ax_plt.set_xlabel(xlabel="User's Reviews", fontsize=12)
ax_plt.set_ylabel(ylabel='No. of Reviews', fontsize=12)
ax_plt.set_title('Accurate No. of Reviews', fontsize=12)
ax_plt.tick_params(labelsize=11)

**Genuine reviews are few in number. As you can see, people who did not even purchase the product have provided reviews; these are all fake reviews, and they far outnumber the reviews from people who actually purchased the product.**

### Wordcloud
----

In [None]:
stopwords = set(STOPWORDS)


def wordcloud(data, title=None):
    """Draw a word cloud for ``data`` with matplotlib.

    Parameters
    ----------
    data : str or iterable
        Either a single text, or an iterable of documents (for example a
        pandas Series of review texts). The items of an iterable are converted
        with ``str`` and joined with spaces. Any other object falls back to
        ``str(data)``.
    title : str, optional
        Figure title; no title is drawn when it is omitted or empty.
    """
    # str() of a pandas Series is its display repr, which pandas truncates for
    # long Series and long cell values and which ends with a Name/dtype footer.
    # Joining the documents feeds the full review text to the word cloud.
    if isinstance(data, str):
        text = data
    else:
        try:
            text = " ".join(str(document) for document in data)
        except TypeError:
            # Not iterable: keep the original behaviour.
            text = str(data)

    cloud = WordCloud(
        background_color='white',
        stopwords=stopwords,
        max_words=250,
        max_font_size=30,
        scale=2,
        random_state=5,  # fixed seed so the layout is the same on every run
    ).generate(text)

    fig = plt.figure(1, figsize=(15, 15))
    plt.axis('off')
    if title:
        fig.suptitle(title, fontsize=20)
        fig.subplots_adjust(top=2.3)
    plt.imshow(cloud)
    plt.show()


wordcloud(dataset['reviews.text'])

## Feature Transformation:
-----

In [None]:
# The review text is both the corpus the vectorizers are fitted on (`data`)
# and the text that is transformed into features (`train_data`).
review_text = dataset['reviews.text']
data = review_text
train_data = review_text

# Collapse the 1-5 star rating into three sentiment classes.
rating_to_sentiment = {
    1: 'Unhappy',
    2: 'Unhappy',
    3: 'Ok',
    4: 'Happy',
    5: 'Happy',
}
y_target = dataset['reviews.rating'].map(rating_to_sentiment)

In [None]:
# Word-level TF-IDF features: unigrams only, English stop words removed,
# vocabulary capped at 10,000 terms.
vectorize_word = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    stop_words='english',
    ngram_range=(1, 1),
    max_features=10000,
)
# fit() returns the vectorizer itself, so fitting and transforming can be chained.
train_features_word = vectorize_word.fit(data).transform(train_data)

In [None]:
# Character-level TF-IDF features: character 2- to 6-grams, vocabulary capped
# at 50,000 n-grams.
# `stop_words` is intentionally not passed: scikit-learn only applies it when
# analyzer == 'word', so with analyzer='char' it had no effect on the features.
vectorize_char = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='char',
    ngram_range=(2, 6),
    max_features=50000,
)
vectorize_char.fit(data)
train_features_char = vectorize_char.transform(train_data)

# Final feature matrix: character n-gram columns first, then the word columns.
train_features = hstack([train_features_char, train_features_word])

In [None]:
# Hold out 30% of the reviews for testing; the seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    train_features,
    y_target,
    test_size=0.3,
    shuffle=True,
    random_state=101,
)

## SVC Model:
----

In [None]:
# Linear SVM with class weights balanced by class frequency.
lsvm = LinearSVC(class_weight='balanced')
lsvm.fit(X_train, y_train)
# fit() returns the estimator itself; `l` is the name the later cells use.
l = lsvm

### Linear SVC Metric Over Training:

In [None]:
# Predictions of the fitted Linear SVC on the data it was trained on.
pred_train = l.predict(X_train)

In [None]:
# Accuracy and per-class precision/recall/F1 on the training split.
svc_train_accuracy = accuracy_score(y_train, pred_train)
print("Accuracy Train: {}".format(svc_train_accuracy))
svc_train_report = classification_report(y_train, pred_train)
print(svc_train_report)

### Linear SVC Metric Over Test:

In [None]:
# Predictions of the fitted Linear SVC on the held-out test split.
pred_test=l.predict(X_test)

In [None]:
# Accuracy and per-class precision/recall/F1 on the held-out test split.
svc_test_accuracy = accuracy_score(y_test, pred_test)
print("Accuracy Test : {}".format(svc_test_accuracy))
svc_test_report = classification_report(y_test, pred_test)
print(svc_test_report)

In [None]:
# Row-normalized confusion matrix for the Linear SVC on the test split.
# scikit-learn's own display is used instead of scikit-plot, whose import fails
# in this environment (see the ImportError recorded at the end of the notebook).
# normalize='true' divides each row by the number of samples of that true class.
svc_cm_display = ConfusionMatrixDisplay.from_predictions(
    y_test,
    pred_test,
    normalize='true',
    cmap='Blues',
    values_format='.2f',
)
svc_cm_display.ax_.set_title('Linear SVC: Normalized Confusion Matrix')
plt.show()

## SGD Classifier:
----

In [None]:
# Linear model trained with stochastic gradient descent.
# `n_iter` is not accepted by current scikit-learn releases; `max_iter` is the
# replacement. tol=None disables early stopping, so exactly 300 passes over the
# training data are made, as `n_iter=300` did. The seed makes the shuffling,
# and therefore the fitted model, repeatable.
svm1 = SGDClassifier(
    class_weight='balanced',
    n_jobs=-1,
    max_iter=300,
    tol=None,
    random_state=101,
)
svm1.fit(X_train, y_train)

### SGD Metric Over Training:

In [None]:
# SGD classifier: predictions and metrics on the training split.
pred_train_sgd = svm1.predict(X_train)
sgd_train_accuracy = accuracy_score(y_train, pred_train_sgd)
print("Accuracy Train: {}".format(sgd_train_accuracy))
sgd_train_report = classification_report(y_train, pred_train_sgd)
print(sgd_train_report)

### SGD Metric Over Test:

In [None]:
# SGD classifier: predictions and metrics on the held-out test split.
pred_test_sgd = svm1.predict(X_test)
sgd_test_accuracy = accuracy_score(y_test, pred_test_sgd)
print("Accuracy Test: {}".format(sgd_test_accuracy))
sgd_test_report = classification_report(y_test, pred_test_sgd)
print(sgd_test_report)

In [None]:
# Row-normalized confusion matrix for the SGD classifier on the test split.
# scikit-learn's own display is used instead of scikit-plot, whose import fails
# in this environment (see the ImportError recorded at the end of the notebook).
# normalize='true' divides each row by the number of samples of that true class.
sgd_cm_display = ConfusionMatrixDisplay.from_predictions(
    y_test,
    pred_test_sgd,
    normalize='true',
    cmap='Blues',
    values_format='.2f',
)
sgd_cm_display.ax_.set_title('SGD Classifier: Normalized Confusion Matrix')
plt.show()

In [84]:
! pip install scikit-plot


# NOTE(review): this repeats the scikit-plot install/import from the top of the
# notebook. In this environment the import raises the ImportError recorded in
# the output below ("cannot import name 'interp' from 'scipy'").
import scikitplot as skplt


Defaulting to user installation because normal site-packages is not writeable


ImportError: cannot import name 'interp' from 'scipy' (C:\Users\Udhaya\AppData\Local\Packages\PythonSoftwareFoundation.Python.3.13_qbz5n2kfra8p0\LocalCache\local-packages\Python313\site-packages\scipy\__init__.py)

In [83]:
from sklearn.linear_model import SGDClassifier
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
