# Women Cloth Reviews Prediction

In [146]:
import pandas as pd

In [147]:
import numpy as np

In [148]:
import seaborn as sns

In [149]:
import matplotlib.pyplot as plt

IMPORT DATASET

In [150]:
df = pd.read_csv("/content/train.csv")

In [151]:
print(df.head())

      Id  Age               Review_Title  \
0  17274   34      Cute fall/holiday top   
1   5921   35                        NaN   
2  16479   40               Disappointed   
3   1925   28         Gorgeous detailing   
4   5691   39  Cute and comfortable tee!   

                                              Review  Pos_Feedback_Cnt  \
0  Love this top! the quality is magnificent and ...                 1   
1                                                NaN                 0   
2  Sleeves were tight, was difficult to put on ?....                15   
3  I never write reviews but this clothe is so fa...                 3   
4  Love this tshirt! casual but can be clotheed u...                 0   

         Division Department Product_Category  Rating  Recommended  
0         General       Tops          Blouses       5            1  
1         General       Tops          Blouses       5            1  
2         General       Tops          Blouses       2            0  
3  General Pet

In [152]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14091 entries, 0 to 14090
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   Id                14091 non-null  int64 
 1   Age               14091 non-null  int64 
 2   Review_Title      11732 non-null  object
 3   Review            13588 non-null  object
 4   Pos_Feedback_Cnt  14091 non-null  int64 
 5   Division          14080 non-null  object
 6   Department        14080 non-null  object
 7   Product_Category  14080 non-null  object
 8   Rating            14091 non-null  int64 
 9   Recommended       14091 non-null  int64 
dtypes: int64(5), object(5)
memory usage: 1.1+ MB


In [153]:
df.shape

(14091, 10)

In [154]:
df.isna().sum()

Unnamed: 0,0
Id,0
Age,0
Review_Title,2359
Review,503
Pos_Feedback_Cnt,0
Division,11
Department,11
Product_Category,11
Rating,0
Recommended,0


In [155]:
df[df['Review']==""]=np.NaN

In [156]:
df['Review'].fillna("NO Review",inplace=True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['Review'].fillna("NO Review",inplace=True)


In [157]:
df.isna().sum()

Unnamed: 0,0
Id,0
Age,0
Review_Title,2359
Review,0
Pos_Feedback_Cnt,0
Division,11
Department,11
Product_Category,11
Rating,0
Recommended,0


In [158]:
df['Review']

Unnamed: 0,Review
0,Love this top! the quality is magnificent and ...
1,NO Review
2,"Sleeves were tight, was difficult to put on ?...."
3,I never write reviews but this clothe is so fa...
4,Love this tshirt! casual but can be clotheed u...
...,...
14086,The pattern and fabric on this clothe are very...
14087,"Like the previous reviewer stated, it's more l..."
14088,This sweater is so lovely.. i like the fact th...
14089,"I just love this top, it has a flattering cut,..."


DEFINE TARGET(X) AND FEATURE (Y)

In [159]:
df.columns

Index(['Id', 'Age', 'Review_Title', 'Review', 'Pos_Feedback_Cnt', 'Division',
       'Department', 'Product_Category', 'Rating', 'Recommended'],
      dtype='object')

In [160]:
X = df['Review']

In [161]:
Y = df['Rating']

In [162]:
df['Rating'].value_counts()

Unnamed: 0_level_0,count
Rating,Unnamed: 1_level_1
5,7907
4,3001
3,1748
2,933
1,502


TRAIN TEST SPLIT

In [163]:
from sklearn.model_selection import train_test_split

In [164]:
x_train,x_test,y_train,y_test = train_test_split(X,Y,train_size=0.7,stratify=Y,random_state=2529)


In [165]:
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((9863,), (4228,), (9863,), (4228,))

**Get Feature Text Conversion To Tokens**

In [166]:
from sklearn.feature_extraction.text import CountVectorizer


In [167]:
cv = CountVectorizer(lowercase=True,analyzer='word',ngram_range=(2,3),stop_words='english',max_features=5000)

In [168]:
X_train = cv.fit_transform(x_train)

In [169]:
cv.get_feature_names_out()

array(['00p 0p', '0p fit', '10 12', ..., 'years come', 'yes runs',
       'yoga trousers'], dtype=object)

In [170]:
X_train.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [171]:
X_test = cv.fit_transform(x_test)

In [172]:
cv.get_feature_names_out()

array(['00p 0p', '10 12', '10 bought', ..., 'years come', 'yellow color',
       'zip way'], dtype=object)

In [173]:
X_test.toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

# **Get Model Train**

In [174]:
from sklearn.naive_bayes import MultinomialNB

In [175]:
model = MultinomialNB()

In [176]:
model.fit(X_train,y_train)

# ***Get Model Prediction***

In [177]:
y_pred = model.predict(X_test)

In [178]:
y_pred.shape

(4228,)

In [179]:
y_pred

array([5, 1, 5, ..., 4, 3, 1])

# ***Get Probability Of Each Predicted Classes***

In [180]:
model.predict_proba(X_test)

array([[0.08793455, 0.00159988, 0.00466671, 0.08679694, 0.81900191],
       [0.57812424, 0.02263402, 0.32873689, 0.00118997, 0.06931489],
       [0.07878753, 0.1825612 , 0.04035759, 0.01366008, 0.6846336 ],
       ...,
       [0.08713932, 0.00121738, 0.07228384, 0.75996438, 0.07939507],
       [0.00137501, 0.03638211, 0.84933924, 0.05132236, 0.06158128],
       [0.43823015, 0.15946341, 0.32705153, 0.00463564, 0.07061927]])

Get Model Evaluation

In [181]:
from sklearn.metrics import confusion_matrix,classification_report

In [182]:
print(confusion_matrix(y_test,y_pred))

[[   9   13   14   20   95]
 [  26   18   33   59  144]
 [  64   50   78   84  248]
 [ 103   73  129  161  434]
 [ 303  283  290  398 1099]]


In [183]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           1       0.02      0.06      0.03       151
           2       0.04      0.06      0.05       280
           3       0.14      0.15      0.15       524
           4       0.22      0.18      0.20       900
           5       0.54      0.46      0.50      2373

    accuracy                           0.32      4228
   macro avg       0.19      0.18      0.18      4228
weighted avg       0.37      0.32      0.35      4228



# ***Recategories Rating AS Poor(0) And Good(1)***

In [184]:
df['Rating'].value_counts()

Unnamed: 0_level_0,count
Rating,Unnamed: 1_level_1
5,7907
4,3001
3,1748
2,933
1,502


# ***Re-Rating 123 as 0 and 45 as 1***

In [185]:
df.replace({'Rating':{1:0,2:0,3:0,4:1,5:1}},inplace=True)


In [186]:
Y = df['Rating']

In [187]:
X = df['Review']

# ***Train Test Split***

In [188]:
from sklearn.model_selection import train_test_split

In [189]:
x_train,x_test,y_train,y_test = train_test_split(X,Y,train_size=0.7,stratify=Y,random_state=2529)

In [190]:
x_train.shape,x_test.shape,y_train.shape,y_test.shape

((9863,), (4228,), (9863,), (4228,))

***Get Feature Text Conversion To Tokens***

In [192]:
from sklearn.feature_extraction.text import CountVectorizer

In [191]:
cv = CountVectorizer(lowercase=True,analyzer='word',ngram_range=(2,3),stop_words='english',max_features=5000)

In [193]:
X_train = cv.fit_transform(x_train)

In [194]:
X_test = cv.fit_transform(x_test)

# ***Get Model Re-Train***

In [195]:
from sklearn.naive_bayes import MultinomialNB

In [196]:
model = MultinomialNB()

In [197]:
model.fit(X_train,y_train)

# ***Get Model Prediction***

In [198]:
y_pred = model.predict(X_test)

In [199]:
y_pred.shape

(4228,)

In [200]:
y_pred

array([1, 1, 1, ..., 1, 1, 1])

# ***Get Model Evaluation***

In [201]:
from sklearn.metrics import confusion_matrix,classification_report

In [202]:
print(confusion_matrix(y_test,y_pred))

[[ 203  752]
 [ 614 2659]]


In [204]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.25      0.21      0.23       955
           1       0.78      0.81      0.80      3273

    accuracy                           0.68      4228
   macro avg       0.51      0.51      0.51      4228
weighted avg       0.66      0.68      0.67      4228

