### Importing the Data

In [1]:
# data path
data_path = 'data/Womens Clothing E-Commerce Reviews.csv'

In [2]:
import pandas as pd
reviews_df = pd.read_csv(data_path)
reviews_df.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [3]:
reviews_df.shape

(23486, 11)

In [4]:
reviews_df = reviews_df[['Review Text', 'Rating']]
reviews_df.head(4)

Unnamed: 0,Review Text,Rating
0,Absolutely wonderful - silky and sexy and comf...,4
1,Love this dress! it's sooo pretty. i happene...,5
2,I had such high hopes for this dress and reall...,3
3,"I love, love, love this jumpsuit. it's fun, fl...",5


### Preprocessing

#### Remove NaN values

In [5]:
reviews_df.isnull().sum()

Review Text    845
Rating           0
dtype: int64

In [6]:
reviews_df.dropna(inplace=True)
reviews_df.isnull().sum()

Review Text    0
Rating         0
dtype: int64

#### Convert to lower case

In [7]:
reviews_df['preprocessed'] = reviews_df['Review Text'].apply(lambda x: x.lower())
reviews_df.head(4)

Unnamed: 0,Review Text,Rating,preprocessed
0,Absolutely wonderful - silky and sexy and comf...,4,absolutely wonderful - silky and sexy and comf...
1,Love this dress! it's sooo pretty. i happene...,5,love this dress! it's sooo pretty. i happene...
2,I had such high hopes for this dress and reall...,3,i had such high hopes for this dress and reall...
3,"I love, love, love this jumpsuit. it's fun, fl...",5,"i love, love, love this jumpsuit. it's fun, fl..."


#### Remove non-alphabetical characters

In [8]:
import re

def remove_nonalpha(text):
    words = text.split()
    words = [re.sub(r'[^a-zA-Z]+', '', word) for word in words]
    text = ' '.join(words)
    
    # remove extra spaces between words
    text = re.sub(' +', ' ', text)
    return text

In [9]:
reviews_df['preprocessed'] = reviews_df['preprocessed'].apply(remove_nonalpha)
reviews_df.head(4)

Unnamed: 0,Review Text,Rating,preprocessed
0,Absolutely wonderful - silky and sexy and comf...,4,absolutely wonderful silky and sexy and comfor...
1,Love this dress! it's sooo pretty. i happene...,5,love this dress its sooo pretty i happened to ...
2,I had such high hopes for this dress and reall...,3,i had such high hopes for this dress and reall...
3,"I love, love, love this jumpsuit. it's fun, fl...",5,i love love love this jumpsuit its fun flirty ...


#### Remove stopwords

In [10]:
from nltk.corpus import stopwords
stop = stopwords.words('english')

reviews_df['preprocessed'] = reviews_df['preprocessed'].apply(
    lambda x: ' '.join([word for word in x.split() if word not in stop])
)
reviews_df.head(4)

Unnamed: 0,Review Text,Rating,preprocessed
0,Absolutely wonderful - silky and sexy and comf...,4,absolutely wonderful silky sexy comfortable
1,Love this dress! it's sooo pretty. i happene...,5,love dress sooo pretty happened find store im ...
2,I had such high hopes for this dress and reall...,3,high hopes dress really wanted work initially ...
3,"I love, love, love this jumpsuit. it's fun, fl...",5,love love love jumpsuit fun flirty fabulous ev...


#### Lemmatization

In [11]:
import spacy
nlp = spacy.load('en_core_web_sm')

def lemmatize(text):
    doc = nlp(text)
    tokens = [token for token in doc]
    return " ".join([token.lemma_ for token in doc])

In [12]:
reviews_df['preprocessed'] = reviews_df['preprocessed'].apply(lemmatize)
reviews_df.head(4)

Unnamed: 0,Review Text,Rating,preprocessed
0,Absolutely wonderful - silky and sexy and comf...,4,absolutely wonderful silky sexy comfortable
1,Love this dress! it's sooo pretty. i happene...,5,love dress sooo pretty happen find store I m g...
2,I had such high hopes for this dress and reall...,3,high hope dress really want work initially ord...
3,"I love, love, love this jumpsuit. it's fun, fl...",5,love love love jumpsuit fun flirty fabulous ev...


### Get Positive and Negative Text

In [13]:
# drop rating of 3 (neutral rating)
reviews_df = reviews_df[reviews_df['Rating'] != 3]
reviews_df['sentiment'] = reviews_df['Rating'] >= 4 # True for Positive, False for Negative
reviews_df['sentiment'].replace(True, 1, inplace=True)
reviews_df['sentiment'].replace(False, 0, inplace=True)
reviews_df

Unnamed: 0,Review Text,Rating,preprocessed,sentiment
0,Absolutely wonderful - silky and sexy and comf...,4,absolutely wonderful silky sexy comfortable,1
1,Love this dress! it's sooo pretty. i happene...,5,love dress sooo pretty happen find store I m g...,1
3,"I love, love, love this jumpsuit. it's fun, fl...",5,love love love jumpsuit fun flirty fabulous ev...,1
4,This shirt is very flattering to all due to th...,5,shirt flatter due adjustable front tie perfect...,1
5,"I love tracy reese dresses, but this one is no...",2,love tracy reese dress one petite foot tall us...,0
...,...,...,...,...
23477,I'm so impressed with the beautiful color comb...,4,I m impressed beautiful color combination embr...,1
23478,I was surprised at the positive reviews for th...,1,surprised positive review product terrible cut...,0
23479,So i wasn't sure about ordering this skirt bec...,5,be not sure order skirt could not see person f...,1
23481,I was very happy to snag this dress at such a ...,5,happy snag dress great price easy slip flatter...,1


### Get training and test data

In [14]:
from sklearn.model_selection import train_test_split

X = reviews_df['preprocessed']
y = reviews_df['sentiment']

seed = 20
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)

print("Training data:", X_train.shape)
print('Validation data:', X_test.shape)

Training data: (14863,)
Validation data: (4955,)


### Using TF-IDF Vectorizer

In [15]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer()
X_train_tf = vectorizer.fit_transform(X_train)
X_test_tf = vectorizer.transform(X_test)

print("Training data:", X_train_tf.shape)
print('Validation data:', X_test_tf.shape)

Training data: (14863, 12369)
Validation data: (4955, 12369)


#### Model Training and Evaluation

In [16]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

linear = LogisticRegression(random_state=seed)
tree = DecisionTreeClassifier(random_state=seed)
forest = RandomForestClassifier(random_state=seed)
boost = GradientBoostingClassifier(random_state=seed)
svm = LinearSVC(random_state=seed)

In [17]:
from sklearn.metrics import accuracy_score

def get_accuracy(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

In [18]:
models = [linear, tree, forest, boost, svm]
model_names = [
    'Logistic Regression',
    'Decision Tree',
    'Random Forest',
    'Gradient Boosting',
    'Support Vector Machine'
]

In [19]:
for model, model_name in zip(models, model_names):
    acc = get_accuracy(model, X_train_tf, X_test_tf, y_train, y_test)
    print(model_name)
    print('==========================')
    print('Accuracy: %.2f%%'%(acc*100))
    print()

Logistic Regression
Accuracy: 92.82%

Decision Tree
Accuracy: 87.12%

Random Forest
Accuracy: 89.42%

Gradient Boosting
Accuracy: 90.88%

Support Vector Machine
Accuracy: 93.56%



### Using Count Vectorizer

In [20]:
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = CountVectorizer()
X_train_cv = vectorizer.fit_transform(X_train)
X_test_cv = vectorizer.transform(X_test)

print("Training data:", X_train_cv.shape)
print('Validation data:', X_test_cv.shape)

Training data: (14863, 12369)
Validation data: (4955, 12369)


#### Model Training and Evaluation

In [21]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

linear = LogisticRegression(random_state=seed)
tree = DecisionTreeClassifier(random_state=seed)
forest = RandomForestClassifier(random_state=seed)
boost = GradientBoostingClassifier(random_state=seed)
svm = LinearSVC(random_state=seed)

In [22]:
from sklearn.metrics import accuracy_score

def get_accuracy(model, X_train, X_test, y_train, y_test):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    return accuracy_score(y_test, y_pred)

In [23]:
models = [linear, tree, forest, boost, svm]
model_names = [
    'Logistic Regression',
    'Decision Tree',
    'Random Forest',
    'Gradient Boosting',
    'Support Vector Machine'
]

In [24]:
for model, model_name in zip(models, model_names):
    acc = get_accuracy(model, X_train_cv, X_test_cv, y_train, y_test)
    print(model_name)
    print('==========================')
    print('Accuracy: %.2f%%'%(acc*100))
    print()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Logistic Regression
Accuracy: 93.48%

Decision Tree
Accuracy: 88.15%

Random Forest
Accuracy: 89.55%

Gradient Boosting
Accuracy: 90.49%

Support Vector Machine
Accuracy: 92.17%



