In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [2]:
#import data
reviews = pd.read_csv("reviews.csv")

In [3]:
#print column names
reviews.head()


   clothing_id  age                review_title  \
0         1095   39  Cute,looks like a dress on   
1         1095   28       So cute, great print!   
2          699   37              So flattering!   
3         1072   36                  Effortless   
4         1094   32              You need this!   

                                         review_text  recommended  \
0  If you are afraid of the jumpsuit trend but li...         True   
1  I love fitted top dresses like this but i find...         True   
2  I love these cozy, fashionable leggings. they ...         True   
3  Another reviewer said it best, "i love the way...         True   
4  Rompers are my fav so i'm biased writing this ...         True   

    division_name department_name review_date    rating  
0         General         Dresses  2019-07-08  Liked it  
1         General         Dresses  2019-05-17  Loved it  
2       Initmates        Intimate  2019-06-24  Loved it  
3  General Petite         Dresses  2019-12-06 

In [4]:
 
#print .info
reviews.info()


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   clothing_id      5000 non-null   int64 
 1   age              5000 non-null   int64 
 2   review_title     4174 non-null   object
 3   review_text      4804 non-null   object
 4   recommended      5000 non-null   bool  
 5   division_name    4996 non-null   object
 6   department_name  4996 non-null   object
 7   review_date      5000 non-null   object
 8   rating           5000 non-null   object
dtypes: bool(1), int64(2), object(6)
memory usage: 317.5+ KB
None


In [5]:
#look at the counts of recommended
print(reviews["recommended"].value_counts())
 

True     4166
False     834
Name: recommended, dtype: int64


In [6]:
#create binary dictionary
binary_dict = {True: 1, False:0}
 

In [7]:
#transform column
reviews["recommended"] = reviews["recommended"].map(binary_dict)
 

In [8]:
#print your transformed column
print(reviews["recommended"].value_counts())


1    4166
0     834
Name: recommended, dtype: int64


In [9]:
#look at the counts of rating
print(reviews["rating"].value_counts())
 

Loved it     2798
Liked it     1141
Was okay      564
Not great     304
Hated it      193
Name: rating, dtype: int64


In [10]:

#create dictionary
rating_dict = {"Loved it":5, "Liked it": 4, "Was Okay":3, "Not great":2, "Hated it":1}
 

In [11]:
#transform rating column
reviews["rating"] = reviews["rating"].map(rating_dict)
#print your transformed column values
print(reviews["rating"].value_counts())

5.0    2798
4.0    1141
2.0     304
1.0     193
Name: rating, dtype: int64


In [12]:
#get the number of categories in a feature
print(reviews["department_name"].value_counts())
 

Tops        2196
Dresses     1322
Bottoms      848
Intimate     378
Jackets      224
Trend         28
Name: department_name, dtype: int64


In [13]:
#perform get_dummies
one_hot = pd.get_dummies(reviews["department_name"])
 

In [14]:
#join the new columns back onto the original
reviews = reviews.join(one_hot)


In [15]:
#print column names
print(reviews.columns)


Index(['clothing_id', 'age', 'review_title', 'review_text', 'recommended',
       'division_name', 'department_name', 'review_date', 'rating', 'Bottoms',
       'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend'],
      dtype='object')


In [16]:
#transform review_date to date-time data
reviews["review_date"] = pd.to_datetime(reviews["review_date"])
#print review_date data type 
print(reviews["review_date"].dtypes)


datetime64[ns]


In [17]:

#get numerical columns
reviews = reviews[['clothing_id', 'age', 'recommended', 'rating', 'Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend']].copy()

In [18]:
#reset index
reviews = reviews.set_index(reviews["clothing_id"])
#instantiate standard scaler
scaler = StandardScaler()

In [19]:
#fit transform data
model = scaler.fit_transform(reviews)
print(model)

[[ 0.85669131 -0.34814459  0.44742824 ... -0.21656679 -0.88496718
  -0.07504356]
 [ 0.85669131 -1.24475223  0.44742824 ... -0.21656679 -0.88496718
  -0.07504356]
 [-1.06545809 -0.51116416  0.44742824 ... -0.21656679 -0.88496718
  -0.07504356]
 ...
 [ 0.81300609 -0.59267395  0.44742824 ... -0.21656679 -0.88496718
  -0.07504356]
 [ 0.55574873 -1.24475223  0.44742824 ... -0.21656679 -0.88496718
  -0.07504356]
 [-0.33251728  1.68960003  0.44742824 ... -0.21656679  1.12998541
  -0.07504356]]
