In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [2]:
reviews = pd.read_csv('reviews.csv')
reviews.head(3)

Unnamed: 0,clothing_id,age,review_title,review_text,recommended,division_name,department_name,review_date,rating
0,1095,39,"Cute,looks like a dress on",If you are afraid of the jumpsuit trend but li...,True,General,Dresses,2019-07-08,Liked it
1,1095,28,"So cute, great print!",I love fitted top dresses like this but i find...,True,General,Dresses,2019-05-17,Loved it
2,699,37,So flattering!,"I love these cozy, fashionable leggings. they ...",True,Initmates,Intimate,2019-06-24,Loved it


In [3]:
reviews.columns

Index(['clothing_id', 'age', 'review_title', 'review_text', 'recommended',
       'division_name', 'department_name', 'review_date', 'rating'],
      dtype='object')

In [4]:
reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 9 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   clothing_id      5000 non-null   int64 
 1   age              5000 non-null   int64 
 2   review_title     4174 non-null   object
 3   review_text      4804 non-null   object
 4   recommended      5000 non-null   bool  
 5   division_name    4996 non-null   object
 6   department_name  4996 non-null   object
 7   review_date      5000 non-null   object
 8   rating           5000 non-null   object
dtypes: bool(1), int64(2), object(6)
memory usage: 317.5+ KB


In [6]:
reviews['recommended'].value_counts()

recommended
True     4166
False     834
Name: count, dtype: int64

In [7]:
binary_dict = {True: 1, False: 0}

In [8]:
reviews['binary_recommended'] = reviews['recommended'].map(binary_dict)
reviews['binary_recommended'].value_counts()

binary_recommended
1    4166
0     834
Name: count, dtype: int64

In [9]:
reviews['rating'].value_counts()

rating
Loved it     2798
Liked it     1141
Was okay      564
Not great     304
Hated it      193
Name: count, dtype: int64

In [10]:
reting_dict = {'Loved it': 5, 'Liked it': 4, 'Was okey': 3, 'Not great': 2, 'Hated it': 1}

In [11]:
reviews['rating'] = reviews['rating'].map(reting_dict)
reviews['rating'].value_counts()

rating
5.0    2798
4.0    1141
2.0     304
1.0     193
Name: count, dtype: int64

In [12]:
reviews['department_name'].value_counts()

department_name
Tops        2196
Dresses     1322
Bottoms      848
Intimate     378
Jackets      224
Trend         28
Name: count, dtype: int64

In [13]:
ohe = pd.get_dummies(reviews['department_name'])
reviews = reviews.join(ohe)
reviews.head(3)

Unnamed: 0,clothing_id,age,review_title,review_text,recommended,division_name,department_name,review_date,rating,binary_recommended,Bottoms,Dresses,Intimate,Jackets,Tops,Trend
0,1095,39,"Cute,looks like a dress on",If you are afraid of the jumpsuit trend but li...,True,General,Dresses,2019-07-08,4.0,1,False,True,False,False,False,False
1,1095,28,"So cute, great print!",I love fitted top dresses like this but i find...,True,General,Dresses,2019-05-17,5.0,1,False,True,False,False,False,False
2,699,37,So flattering!,"I love these cozy, fashionable leggings. they ...",True,Initmates,Intimate,2019-06-24,5.0,1,False,False,True,False,False,False


In [14]:
reviews.columns

Index(['clothing_id', 'age', 'review_title', 'review_text', 'recommended',
       'division_name', 'department_name', 'review_date', 'rating',
       'binary_recommended', 'Bottoms', 'Dresses', 'Intimate', 'Jackets',
       'Tops', 'Trend'],
      dtype='object')

In [15]:
reviews['review_date'].dtype

dtype('O')

In [16]:
reviews['review_date'] = pd.to_datetime(reviews['review_date'])
reviews['review_date'].dtype

dtype('<M8[ns]')

# Escalar datos

In [17]:
reviews = reviews[['clothing_id', 'age', 'recommended', 'rating', 'Bottoms', 'Dresses', 'Intimate', 'Jackets', 'Tops', 'Trend']].copy()
reviews.head(3)

Unnamed: 0,clothing_id,age,recommended,rating,Bottoms,Dresses,Intimate,Jackets,Tops,Trend
0,1095,39,True,4.0,False,True,False,False,False,False
1,1095,28,True,5.0,False,True,False,False,False,False
2,699,37,True,5.0,False,False,True,False,False,False


In [18]:
reviews = reviews.set_index('clothing_id')
reviews.head(3)

Unnamed: 0_level_0,age,recommended,rating,Bottoms,Dresses,Intimate,Jackets,Tops,Trend
clothing_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1095,39,True,4.0,False,True,False,False,False,False
1095,28,True,5.0,False,True,False,False,False,False
699,37,True,5.0,False,False,True,False,False,False


In [19]:
scaler = StandardScaler()

In [20]:
encoder = scaler.fit(reviews)

In [23]:
reviews_scaled = encoder.transform(reviews)
reviews_scaled = pd.DataFrame(reviews_scaled, index=reviews.index, columns=reviews.columns)
reviews_scaled.head(3)

Unnamed: 0_level_0,age,recommended,rating,Bottoms,Dresses,Intimate,Jackets,Tops,Trend
clothing_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
1095,-0.348145,0.447428,-0.336531,-0.451928,1.667977,-0.285977,-0.216567,-0.884967,-0.075044
1095,-1.244752,0.447428,0.59013,-0.451928,1.667977,-0.285977,-0.216567,-0.884967,-0.075044
699,-0.511164,0.447428,0.59013,-0.451928,-0.599529,3.496786,-0.216567,-0.884967,-0.075044
