In [1]:
from src.text_processing import normalize_corpus, tokenize_text
import pandas as pd
import numpy as np
import pprint
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_predict, GridSearchCV
from sklearn.metrics import accuracy_score, mean_squared_error
from sklearn.pipeline import Pipeline
import time

  from numpy.core.umath_tests import inner1d


### Load and Inspect Data

In [2]:
""" Load and inspect data """
# Data source: https://www.kaggle.com/nicapotato/womens-ecommerce-clothing-reviews
# The first CSV column is used as the row index.
df = pd.read_csv("womens_clothing_ecommerce_reviews.csv", index_col=0)

# Standardize column names: spaces become underscores, everything lower-case
df.columns = [name.replace(" ", "_").lower() for name in df.columns]

# Report the number of missing values per column (before any filtering)
missing_per_column = df.isna().sum()
print(missing_per_column)

# Keep only rows whose review text is present; other columns may still hold NaN
keep_index = df.review_text.notna()
df = df.loc[keep_index, :]

clothing_id                   0
age                           0
title                      3810
review_text                 845
rating                        0
recommended_ind               0
positive_feedback_count       0
division_name                14
department_name              14
class_name                   14
dtype: int64


In [7]:
# Peek at the first five rows to see what each column looks like
df.head(n=5)

Unnamed: 0,clothing_id,age,title,review_text,rating,recommended_ind,positive_feedback_count,division_name,department_name,class_name
0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


### Pull out `review_text` and `rating`

In [3]:
# Question: using only review_text, how accurately can we predict the rating?
# Restrict the frame to the single feature column and the target column.
data = df[["review_text", "rating"]]

# Hold out a 10% validation set for performance benchmarking.
# The split is seeded (random_state=0); each part is copied so later
# modifications do not touch the frame it was sliced from.
train_data, validation_data = (
    part.copy()
    for part in train_test_split(data, test_size=0.10, random_state=0)
)

### Preprocess Text
* Expand contractions
* Lemmatize
* Remove stopwords (open question: should we remove them at all?)

In [4]:
# Target: the rating column of the training split
train_target = train_data["rating"]
# Features: the raw review text run through the project's normalization helper
train_text_norm = normalize_corpus(train_data["review_text"])

### Fit a Vectorizer and Transform

In [5]:
# CountVectorizer: bag-of-words term counts over single words (unigrams)
count_vectorizer_options = {
    "ngram_range": (1, 1),
    "stop_words": "english",
    "max_df": 0.95,  # upper document-frequency cutoff for the vocabulary
    "min_df": 0.05,  # lower document-frequency cutoff for the vocabulary
}
count_vectorizer = CountVectorizer(**count_vectorizer_options)

# Learn the vocabulary from the training text and encode it in one pass
train_count_vectorized = count_vectorizer.fit_transform(train_text_norm)

In [19]:
# What does the data look like?
# Convert the vectorizer output to a dense array so it can be inspected
train_count_vectorized_array = train_count_vectorized.toarray()
print("Here is the array shape \n", train_count_vectorized_array.shape, "\n")
# Show the features that were used.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(count_vectorizer, "get_feature_names_out"):
    features = count_vectorizer.get_feature_names_out().tolist()
else:
    features = count_vectorizer.get_feature_names()
print("Here are the features extracted \n", features, "\n")
# Show a snip of the data: the first five documents, one column per feature
print("Here is what goes into the model \n",
      pd.DataFrame(data=train_count_vectorized_array[:5, :], columns=features), "\n")

Here is the array shape 
 (20376, 85) 

Here are the features extracted 
 ['arm', 'beautiful', 'big', 'bit', 'black', 'buy', 'color', 'come', 'comfortable', 'cut', 'cute', 'definitely', 'design', 'dress', 'fabric', 'fall', 'feel', 'fit', 'flatter', 'flattering', 'good', 'great', 'high', 'jean', 'large', 'lb', 'length', 'like', 'little', 'long', 'look', 'loose', 'love', 'make', 'material', 'medium', 'model', 'need', 'nice', 'online', 'order', 'pair', 'pant', 'perfect', 'perfectly', 'person', 'petite', 'picture', 'pretty', 'price', 'purchase', 'quality', 'really', 'retailer', 'return', 'review', 'right', 'run', 'sale', 'say', 'shirt', 'short', 'shoulder', 'size', 'skirt', 'sleeve', 'small', 'soft', 'store', 'style', 'summer', 'super', 'sweater', 'think', 'tight', 'time', 'true', 'try', 'usually', 'waist', 'want', 'way', 'wear', 'white', 'work'] 

Here is what goes into the model 
    arm  beautiful  big  bit  black  buy  color  come  comfortable  cut  ...   \
0    0          0    0    0 

In [8]:
# Cross-validate Predict
# Seed the forest so the scores below are reproducible from run to run,
# matching the seeded train/validation split and the seeded pipeline model.
model = RandomForestRegressor(n_estimators=10, max_depth=5, random_state=0)
# Out-of-fold predictions for every training row (continuous values)
predict = cross_val_predict(model, train_count_vectorized, train_target)
# Round each continuous prediction to an integer so it can be compared
# against the integer ratings with an exact-match accuracy
predict_processed = [int(round(i)) for i in predict]

# Analyze accuracy (on the rounded predictions)
print("Accuracy: %.2f percent" % (accuracy_score(train_target, predict_processed) * 100))
# Analyze Root Mean Squared Error (on the raw, unrounded predictions)
print("RMSE: %.2f " % np.sqrt(mean_squared_error(train_target, predict)))

Accuracy: 38.36 percent
RMSE: 1.02 


In [9]:
# TfidfVectorizer: same vocabulary settings as the count vectorizer,
# but with TF-IDF weights instead of raw counts
tfidf_vectorizer_options = {
    "ngram_range": (1, 1),
    "stop_words": "english",
    "max_df": 0.95,  # upper document-frequency cutoff for the vocabulary
    "min_df": 0.05,  # lower document-frequency cutoff for the vocabulary
}
tfidf_vectorizer = TfidfVectorizer(**tfidf_vectorizer_options)

# Learn the vocabulary and IDF weights from the training text, then encode it
train_tfidf_vectorized = tfidf_vectorizer.fit_transform(train_text_norm)


In [20]:
# What does the data look like?
# Convert the TF-IDF vectorizer output (train_tfidf_vectorized) to a dense
# array so it can be inspected
train_tfidf_vectorized_array = train_tfidf_vectorized.toarray()
print("Here is the array shape \n", train_tfidf_vectorized_array.shape, "\n")
# Show the features that were used.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer `get_feature_names_out()` and fall back only on older releases.
if hasattr(tfidf_vectorizer, "get_feature_names_out"):
    features = tfidf_vectorizer.get_feature_names_out().tolist()
else:
    features = tfidf_vectorizer.get_feature_names()
print("Here are the features extracted \n", features, "\n")
# Show a snip of the data: the first five documents, one column per feature
print("Here is what goes into the model \n",
      pd.DataFrame(data=train_tfidf_vectorized_array[:5, :], columns=features), "\n")

Here is the array shape 
 (20376, 85) 

Here are the features extracted 
 ['arm', 'beautiful', 'big', 'bit', 'black', 'buy', 'color', 'come', 'comfortable', 'cut', 'cute', 'definitely', 'design', 'dress', 'fabric', 'fall', 'feel', 'fit', 'flatter', 'flattering', 'good', 'great', 'high', 'jean', 'large', 'lb', 'length', 'like', 'little', 'long', 'look', 'loose', 'love', 'make', 'material', 'medium', 'model', 'need', 'nice', 'online', 'order', 'pair', 'pant', 'perfect', 'perfectly', 'person', 'petite', 'picture', 'pretty', 'price', 'purchase', 'quality', 'really', 'retailer', 'return', 'review', 'right', 'run', 'sale', 'say', 'shirt', 'short', 'shoulder', 'size', 'skirt', 'sleeve', 'small', 'soft', 'store', 'style', 'summer', 'super', 'sweater', 'think', 'tight', 'time', 'true', 'try', 'usually', 'waist', 'want', 'way', 'wear', 'white', 'work'] 

Here is what goes into the model 
    arm  beautiful  big       bit  black  buy     color  come  comfortable  \
0  0.0        0.0  0.0  0.00000

In [10]:
# Cross-validate Predict
# Seed the forest so the scores below are reproducible from run to run,
# matching the seeded train/validation split and the seeded pipeline model.
model = RandomForestRegressor(n_estimators=10, max_depth=5, random_state=0)
# Out-of-fold predictions for every training row (continuous values)
predict = cross_val_predict(model, train_tfidf_vectorized, train_target)
# Round each continuous prediction to an integer so it can be compared
# against the integer ratings with an exact-match accuracy
predict_processed = [int(round(i)) for i in predict]

# Analyze accuracy (on the rounded predictions)
print("Accuracy: %.2f percent" % (accuracy_score(train_target, predict_processed) * 100))
# Analyze Root Mean Squared Error (on the raw, unrounded predictions)
print("RMSE: %.2f " % np.sqrt(mean_squared_error(train_target, predict)))

Accuracy: 37.39 percent
RMSE: 1.02 


### Tune all relevant parameters using Grid-Search and Pipeline

In [11]:
# Tune the vectorizer and the model together by chaining them in one pipeline,
# so the vectorizer is refit inside every cross-validation fold.
pipeline_steps = [
    ("vect", TfidfVectorizer(max_features=10000)),
    ("model", RandomForestRegressor(random_state=0)),
]
pipeline = Pipeline(pipeline_steps)

# Grid keys follow the "<step name>__<parameter name>" convention.
# Every list currently holds a single candidate; add values to widen the search.
parameters = {
    "vect__ngram_range": [(1, 2)],  # <- Notice (1, 2) vs (1, 1)
    "vect__max_df": [1.],
    "vect__min_df": [.04],
    "model__n_estimators": [50],
    "model__max_depth": [30]
}

# 5-fold cross-validated search, run on 3 parallel workers
grid = GridSearchCV(pipeline, param_grid=parameters, cv=5, n_jobs=3)

# Time the full search so readers know what a re-run costs (seconds)
start = time.time()
grid.fit(train_text_norm, train_target)
stop = time.time()
print("total time: ", stop-start)

total time:  354.0645890235901


In [14]:
# Show the parameter combination the grid search selected
pprint.pprint(grid.best_params_)

# Generate out-of-fold predictions with the best pipeline found by the search
model = grid.best_estimator_
predict = cross_val_predict(model, train_text_norm, train_target, cv=5, n_jobs=3)
# Round the continuous predictions to integers for the exact-match accuracy
predict_processed = [int(round(value)) for value in predict]

# Accuracy uses the rounded predictions; RMSE uses the raw predictions
accuracy_pct = accuracy_score(train_target, predict_processed) * 100
rmse = np.sqrt(mean_squared_error(train_target, predict))
print("Accuracy: %.2f percent" % accuracy_pct)
print("RMSE: %.2f " % rmse)

{'model__max_depth': 30,
 'model__n_estimators': 50,
 'vect__max_df': 1.0,
 'vect__min_df': 0.04,
 'vect__ngram_range': (1, 2)}
Accuracy: 48.48 percent
RMSE: 0.93 


## TODOs
### Can you...
* Analyze the validation performance?
* Improve the score? (Try a different model or different parameters; try without removing stop words.)