In [2]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
import joblib
from sklearn.model_selection import GridSearchCV, cross_val_predict, cross_val_score

## * Read raw data and, pre-prcocessing, train model and save all these processes

skip blocks which have * at the beginning of their title. These block contain processes like pre-process data and train model which are time consuming. Run these block only of you want apply pre-process and training process again. 

### * Read raw data

In [9]:
df = pd.read_csv("./data.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 23486 entries, 0 to 23485
Data columns (total 11 columns):
 #   Column                   Non-Null Count  Dtype 
---  ------                   --------------  ----- 
 0   Unnamed: 0               23486 non-null  int64 
 1   Clothing ID              23486 non-null  int64 
 2   Age                      23486 non-null  int64 
 3   Title                    19676 non-null  object
 4   Review Text              22641 non-null  object
 5   Rating                   23486 non-null  int64 
 6   Recommended IND          23486 non-null  int64 
 7   Positive Feedback Count  23486 non-null  int64 
 8   Division Name            23472 non-null  object
 9   Department Name          23472 non-null  object
 10  Class Name               23472 non-null  object
dtypes: int64(6), object(5)
memory usage: 2.0+ MB


In [11]:
df.shape

(23486, 11)

In [12]:
df.isna().sum()

Unnamed: 0                    0
Clothing ID                   0
Age                           0
Title                      3810
Review Text                 845
Rating                        0
Recommended IND               0
Positive Feedback Count       0
Division Name                14
Department Name              14
Class Name                   14
dtype: int64

**We remove all records which their Review Text is blan because we want to predict Recommendation and Rating according to this columns.**

In [13]:
df = df.dropna(subset=["Review Text"]) 

In [14]:
df.isna().sum()

Unnamed: 0                    0
Clothing ID                   0
Age                           0
Title                      2966
Review Text                   0
Rating                        0
Recommended IND               0
Positive Feedback Count       0
Division Name                13
Department Name              13
Class Name                   13
dtype: int64

In [15]:
df.shape

(22641, 11)

**We should apply pre-processing on the Review Text. In pre-processing we use a pre-prepared dictionary from NLTK library which helps us to tokenize text and remove stop words (and, ...) and lemmatize words. Lemmatization is the process of change all words to its root word. For example fishes, fishing, fished are all change to fish.**

In [16]:
import re 
import nltk
nltk.download('wordnet')

def preprocessing(text):
    text = text.lower()
    tokenizer = nltk.tokenize.TreebankWordTokenizer()
    tokens = tokenizer.tokenize(text)
    
    stemmer = nltk.stem.WordNetLemmatizer()
    text = " ".join(stemmer.lemmatize(token) for token in tokens)
    text = re.sub("[^a-z']"," ", text)
    return text

df["Clean_Review"] = df["Review Text"].apply(preprocessing)

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Ali\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


### * Save pro-processed data

We save the cleaned data to prevent pre-process steps for the next runs.

In [17]:
df.to_csv("cleaned_data.csv")

In [18]:
df.shape

(22641, 12)

### * Vectorize review text

In [19]:
df = pd.read_csv("./cleaned_data.csv")
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name,Clean_Review
0,0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates,absolutely wonderful silky and sexy and comf...
1,1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses,love this dress it 's sooo pretty i happene...
2,2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses,i had such high hope for this dress and really...
3,3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants,i love love love this jumpsuit it 's fun ...
4,4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses,this shirt is very flattering to all due to th...


Machine learninng algorithms does not recognize words and texts. They only know vectors and numbers; That's why we transform clean texts' words to vectors. Then we can feed them to our models.

In [20]:
vectorizer = CountVectorizer()

X = vectorizer.fit_transform(df['Clean_Review'])

### * Create a function to find the best hyperparameters for each model and also define suitable models

In [21]:
def tune_model(model, model_param, train, test):
    grse = GridSearchCV(model, model_param, scoring="accuracy", cv=5)
    
    grse.fit(train, test)
    
    print("best score is: {}".format(grse.best_score_))
    print("best estimatior is: {}".format(grse.best_estimator_))    
    
    return grse.best_score_, grse.best_estimator_

We define three models. SVM, Random Forest and Naive Baysian. We also define a dictionary to save some values for these algorithms hyper-parameters. Then we can tune our models hyper-paramters and choose best set of hyper-paramteres for them. This process might be time consuming which is why we save the models for later usages.

In [22]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

alpha = np.array([0.1, 1, 10, 100])
models = [SVC(), RandomForestClassifier(), MultinomialNB()]
models_param = [
    {"C": (1, 10, 20, 50), "gamma": ("scale", "auto", 0.1, 1)},
    {"n_estimators": (70, 80, 90, 100), 'criterion': ("gini", "entropy"), "max_depth": (5, 6, 7, 10, 25)},
    {"alpha": alpha}
]

### * Recommend prediction

First we train our models for predicting Recommendation. Our target is boolean therefor our model should predict whether the products is recomended according to its text or not.  

In [23]:
y_recom = df['Recommended IND']

In [24]:
X_train_recom, X_test_recom, y_train_recom, y_test_recom = train_test_split(X, y_recom, test_size=0.33, random_state=1234)

In [25]:
models_recom_acc = []
models_recom_holder = []

In [26]:
for ind, model in enumerate(models):
    accuracy, model_fited = tune_model(model, models_param[ind], X_train_recom, y_train_recom)
    models_recom_acc.append(accuracy)
    models_recom_holder.append(model_fited)

best score is: 0.8866769208232622
best estimatior is: SVC(C=10)
best score is: 0.8191047890910379
best estimatior is: RandomForestClassifier(max_depth=25, n_estimators=70)
best score is: 0.8868743318117278
best estimatior is: MultinomialNB()


In [27]:
models_recom_holder

[SVC(C=10),
 RandomForestClassifier(max_depth=25, n_estimators=70),
 MultinomialNB()]

### * Save Recommendation predictor Models

In [29]:
svc_recom_file = 'svc_recom.sav'
joblib.dump(models_recom_holder[0], svc_recom_file)

rf_recom_file = 'rf_recom.sav'
joblib.dump(models_recom_holder[1], rf_recom_file)

nb_recom_file = 'nb_recom.sav'
joblib.dump(models_recom_holder[2], nb_recom_file)

['nb_recom.sav']

#### * Evaluate recommendation predictor models

In [31]:
model_recom_cross_vall_train = []
model_recom_cross_vall_test = []

for ind, model in enumerate(models_recom_holder):
    print("Model %i" % (ind))
    
    train_predict = cross_val_score(model, X_train_recom, y_train_recom, cv=3)
    model_recom_cross_vall_train.append(train_predict)
    print("Accuracy on train data {}".format(train_predict))
    
    test_predict = cross_val_score(model, X_test_recom, y_test_recom, cv=3)
    model_recom_cross_vall_test.append(train_predict)
    print("Accuracy on test data {}".format(test_predict))

Model 0
Accuracy on train data [0.88669171 0.8789557  0.88568038]
Accuracy on test data [0.88197511 0.88317945 0.87831325]
Model 1
Accuracy on train data [0.81886494 0.81863133 0.81922468]
Accuracy on test data [0.82175833 0.82296267 0.82248996]
Model 2
Accuracy on train data [0.89104212 0.8789557  0.8840981 ]
Accuracy on test data [0.86391008 0.87354476 0.87068273]


### * Rating predictor

Here we train our models 

In [32]:
y_rating = df['Rating']
X_train_rating, X_test_rating, y_train_rating, y_test_rating = train_test_split(X, y_rating, test_size=0.33, random_state=1234)

In [33]:
models_rating_acc = []
models_rating_holder = []

In [34]:
for ind, model in enumerate(models):
    accuracy, model_fited = tune_model(model, models_param[ind], X_train_rating, y_train_rating)
    models_rating_acc.append(accuracy)
    models_rating_holder.append(model_fited)

best score is: 0.6246944998121086
best estimatior is: SVC(C=10)
best score is: 0.551585471264128
best estimatior is: RandomForestClassifier(max_depth=25, n_estimators=80)
best score is: 0.6266721523579019
best estimatior is: MultinomialNB()


In [35]:
svc_rating_file = 'svc_rating.sav'
joblib.dump(models_rating_holder[0], svc_rating_file)

rf_rating_file = 'rf_rating.sav'
joblib.dump(models_rating_holder[1], rf_rating_file)

nb_rating_file = 'nb_rating.sav'
joblib.dump(models_rating_holder[2], nb_rating_file)

['nb_rating.sav']

In [36]:
model_rating_cross_vall_train = []
model_rating_cross_vall_test = []

for ind, model in enumerate(models_rating_holder):
    print("Model %i" % (ind))
    
    train_predict = cross_val_score(model, X_train_rating, y_train_rating, cv=3)
    model_rating_cross_vall_train.append(train_predict)
    print("Accuracy on train data {}".format(train_predict))
    
    test_predict = cross_val_score(model, X_test_rating, y_test_rating, cv=3)
    model_rating_cross_vall_test.append(train_predict)
    print("Accuracy on test data {}".format(test_predict))

Model 0
Accuracy on train data [0.61755982 0.62697785 0.62005538]
Accuracy on test data [0.61340827 0.60939382 0.6184739 ]
Model 1
Accuracy on train data [0.55111726 0.55122627 0.55102848]
Accuracy on test data [0.56122039 0.56162184 0.56345382]
Model 2
Accuracy on train data [0.61973502 0.63034019 0.61728639]
Accuracy on test data [0.59654757 0.59092734 0.59638554]


## Load saved models

In [3]:
models_recom_holder = []
models_rating_holder = []

models_recom_holder.append(joblib.load("./svc_recom.sav"))
models_recom_holder.append(joblib.load("./rf_recom.sav"))
models_recom_holder.append(joblib.load("./nb_recom.sav"))

models_rating_holder.append(joblib.load("./svc_rating.sav"))
models_rating_holder.append(joblib.load("./rf_rating.sav"))
models_rating_holder.append(joblib.load("./nb_rating.sav"))

### Read pre processed data

In [4]:
df = pd.read_csv("./cleaned_data.csv")
df.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name,Clean_Review
0,0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates,absolutely wonderful silky and sexy and comf...
1,1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses,love this dress it 's sooo pretty i happene...
2,2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses,i had such high hope for this dress and really...
3,3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants,i love love love this jumpsuit it 's fun ...
4,4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses,this shirt is very flattering to all due to th...


### Vectorize texts and split train and test data

In [5]:
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['Clean_Review'])

y_recom = df['Recommended IND']
X_train_recom, X_test_recom, y_train_recom, y_test_recom = train_test_split(X, y_recom, test_size=0.33, random_state=1234)

y_rating = df['Rating']
X_train_rating, X_test_rating, y_train_rating, y_test_rating = train_test_split(X, y_rating, test_size=0.33, random_state=1234)

**Now we have loaded models to our models holder list. We also prepared our text data and split them into train and test sets. So now we only need to check their accuracy on train and test data. We use cross validation score. Which is split data into some batchs and check accuracy on them. This process helps us to be sure that our model accuracy is not because of data bias. for example we can be sure that our train and test are equaly hard for the model to predict and our model result is more reliable when we use cross validation.**

## Check Recommendation predictor models accuracy.

In [7]:
model_recom_cross_vall_train = {}
model_recom_cross_vall_test = {}

### Check SVM algorithm accuracy

In [9]:
print("SVM evaluation in recommendation prediction: ")
train_predict = cross_val_score(models_recom_holder[0], X_train_recom, y_train_recom, cv=3)
model_recom_cross_vall_train["SVM"] = train_predict.mean()
print("Accuracy on train data {} ({} %)".format(train_predict.mean(), train_predict.mean() * 100))

test_predict = cross_val_score(models_recom_holder[0], X_test_recom, y_test_recom, cv=3)
model_recom_cross_vall_test["SVM"] = test_predict.mean()
print("Accuracy on test data {} ({} %)".format(test_predict.mean(), test_predict.mean() * 100))

SVM evaluation in recommendation prediction: 
Accuracy on train data 0.8837759301348592 (88.37759301348592 %)
Accuracy on test data 0.8811559364716998 (88.11559364716997 %)


### Check Random forest algorithm accuracy

In [10]:
print("Random Forest evaluation in recommendation prediction: ")
train_predict = cross_val_score(models_recom_holder[1], X_train_recom, y_train_recom, cv=3)
model_recom_cross_vall_train["RF"] = train_predict.mean()
print("Accuracy on train data {} ({} %)".format(train_predict.mean(), train_predict.mean() * 100))

test_predict = cross_val_score(models_recom_holder[1], X_test_recom, y_test_recom, cv=3)
model_recom_cross_vall_test["RF"] = test_predict.mean()
print("Accuracy on test data {} ({} %)".format(test_predict.mean(), test_predict.mean() * 100))

Random Forest evaluation in recommendation prediction: 
Accuracy on train data 0.8190388276183157 (81.90388276183157 %)
Accuracy on test data 0.8229389658191174 (82.29389658191174 %)


### Check Naive bayes algorithm accuracy

In [15]:
print("Naive bayes evaluation in recommendation prediction: ")
train_predict = cross_val_score(models_recom_holder[2], X_train_recom, y_train_recom, cv=3)
model_recom_cross_vall_train["NB"] = train_predict.mean()
print("Accuracy on train data {} ({} %)".format(train_predict.mean(), train_predict.mean() * 100))

test_predict = cross_val_score(models_recom_holder[2], X_test_recom, y_test_recom, cv=3)
model_recom_cross_vall_test["NB"] = train_predict.mean()
print("Accuracy on test data {} ({} %)".format(test_predict.mean(), test_predict.mean() * 100))

Naive bayes evaluation in recommendation prediction: 
Accuracy on train data 0.8846986391007493 (88.46986391007493 %)
Accuracy on test data 0.8693791894461292 (86.93791894461292 %)


## Check Rating predictor models accuracy

In [12]:
model_rating_cross_vall_train = {}
model_rating_cross_vall_test = {}

### Check SVM algorithm accuracy

In [13]:
train_predict = cross_val_score(models_rating_holder[0], X_train_rating, y_train_rating, cv=3)
model_rating_cross_vall_train["SVM"] = train_predict.mean()
print("Accuracy on train data {} ({} %)".format(train_predict.mean(), train_predict.mean() * 100))

test_predict = cross_val_score(models_rating_holder[0], X_test_rating, y_test_rating, cv=3)
model_rating_cross_vall_test["SVM"] = test_predict.mean()
print("Accuracy on test data {} ({} %)".format(test_predict.mean(), test_predict.mean() * 100))

Accuracy on train data 0.6215310153073528 (62.15310153073528 %)
Accuracy on test data 0.6137586610324611 (61.37586610324611 %)


### Check Random foresr algorithm accuracy

In [16]:
train_predict = cross_val_score(models_rating_holder[1], X_train_rating, y_train_rating, cv=3)
model_rating_cross_vall_train["RF"] = train_predict.mean()
print("Accuracy on train data {} ({} %)".format(train_predict.mean(), train_predict.mean() * 100))

test_predict = cross_val_score(models_rating_holder[1], X_test_rating, y_test_rating, cv=3)
model_rating_cross_vall_test["RF"] = test_predict.mean()
print("Accuracy on test data {} ({} %)".format(test_predict.mean(), test_predict.mean() * 100))

Accuracy on train data 0.551124042456085 (55.1124042456085 %)
Accuracy on test data 0.5626337814794572 (56.26337814794572 %)


### Check Naive bayes algorithm accuracy

In [17]:
train_predict = cross_val_score(models_rating_holder[2], X_train_rating, y_train_rating, cv=3)
model_rating_cross_vall_train["NB"] = train_predict.mean()
print("Accuracy on train data {} ({} %)".format(train_predict.mean(), train_predict.mean() * 100))

test_predict = cross_val_score(models_rating_holder[2], X_test_rating, y_test_rating, cv=3)
model_rating_cross_vall_test["NB"] = test_predict.mean()
print("Accuracy on test data {} ({} %)".format(test_predict.mean(), test_predict.mean() * 100))

Accuracy on train data 0.6224538676805932 (62.24538676805932 %)
Accuracy on test data 0.5946201506145014 (59.46201506145014 %)


## Details:

**As it can be seen from both predictors results, the recommendation predictors have a very reliable and better accuracy in compare to rating models. Its beacuse the recommendation prediction is a binary (boolean) classification which is very easier than multi-class classification (such as rating prediction). The recommendation predictors only need to determine whether the record have been recommended or not judging by its review text. Whereas the rating predictors should be abled to predict the rating of the record which is a number between 1 and 5. Therefor we have 5 class in this problem; That's why the rating models accuracy is much lower than recommendation models.**