# Importando bibliotecas

In [113]:
import re
import pandas as pd
import numpy as np

# Carregando os dados

In [114]:
df = pd.read_csv("./generated_data/df_raw_scrapping.csv")
df = df.drop("Unnamed: 0", axis=1)

In [115]:
df.head()

Unnamed: 0,product_id,product_name,product_type,product_colors,product_price,product_composition_shell,product_composition_pocket,scrapy_datetime
0,1024256001,Slim Jeans,men_jeans_slim,Black,$ 19.99,"Cotton 99%, Spandex 1%","Polyester 65%, Cotton 35%",2022-12-15 20:56:55
1,1024256001,Slim Jeans,men_jeans_slim,Light denim blue,$ 19.99,"Cotton 99%, Spandex 1%","Polyester 65%, Cotton 35%",2022-12-15 20:56:55
2,1024256001,Slim Jeans,men_jeans_slim,Light denim blue,$ 19.99,"Cotton 99%, Spandex 1%","Polyester 65%, Cotton 35%",2022-12-15 20:56:55
3,1024256001,Slim Jeans,men_jeans_slim,Denim blue,$ 19.99,"Cotton 99%, Spandex 1%","Polyester 65%, Cotton 35%",2022-12-15 20:56:55
4,1024256001,Slim Jeans,men_jeans_slim,Dark blue,$ 19.99,"Cotton 99%, Spandex 1%","Polyester 65%, Cotton 35%",2022-12-15 20:56:55


In [116]:
df.product_composition_shell.str.contains("Spandex").sum()

666

In [117]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 849 entries, 0 to 848
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype 
---  ------                      --------------  ----- 
 0   product_id                  849 non-null    int64 
 1   product_name                849 non-null    object
 2   product_type                849 non-null    object
 3   product_colors              849 non-null    object
 4   product_price               849 non-null    object
 5   product_composition_shell   849 non-null    object
 6   product_composition_pocket  605 non-null    object
 7   scrapy_datetime             849 non-null    object
dtypes: int64(1), object(7)
memory usage: 53.2+ KB


In [118]:
df.shape

(849, 8)

# Limpando os dados

### product_name

In [119]:
df["product_name"] = df["product_name"].apply(lambda x: x.replace(" ", "_").lower())

### product_colors

In [120]:
df["product_colors"] = df["product_colors"].apply(lambda x: x.replace(" ", "_").lower())

### product_price

In [121]:
df["product_price"] = df["product_price"].apply(lambda x: x.replace("$", "").strip())
df["product_price"] = df["product_price"].astype(float)

### product_composition_shell

In [122]:
df.product_composition_shell.unique()

array(['Cotton 99%, Spandex 1%', 'Cotton 98%, Spandex 2%', 'Cotton 100%',
       'Cotton 90%, Elastomultiester 8%, Spandex 2%',
       'Cotton 89%, Elastomultiester 9%, Spandex 2%',
       'Cotton 79%, Polyester 20%, Spandex 1%',
       'Cotton 78%, Polyester 21%, Spandex 1%',
       'Cotton 77%, Polyester 21%, Spandex 2%',
       'Cotton 80%, Polyester 19%, Spandex 1%',
       'Lyocell 55%, Cotton 30%, Rayon 15%', 'Rayon 50%, Lyocell 50%'],
      dtype=object)

Colunas que quero criar:

    * Cotton
    * Spandex
    * Elastomultiester
    * Polyester
    * Lyocell
    * Rayon

In [123]:
# dataframe auxiliar
df_aux = df.product_composition_shell.str.split(",", expand=True)

In [124]:
# criar um dataframe de referência
df_ref = pd.DataFrame(index=np.arange(len(df)), columns=["cotton", "spandex", 
                                                         "elastomultiester", "polyester", "lyocell", "rayon"])

In [125]:
#cotton
df_cotton = df_aux.loc[df_aux[0].str.contains("Cotton", na=True), 0]
df_cotton.name = "cotton"
df_ref = pd.concat([df_ref, df_cotton], axis=1)
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated(keep="last")]

In [126]:
#spandex
df_spandex = pd.Series(dtype=str)

df_aux = df_aux.fillna("string vazia")

for i in df_aux.index:
    spandex_in_1 = "Spandex" in df_aux.loc[i][1]
    spandex_in_2 = "Spandex" in df_aux.loc[i][2]
    if spandex_in_1:
        df_spandex.loc[i] = df_aux.loc[i][1]
    elif spandex_in_2:
        df_spandex.loc[i] = df_aux.loc[i][2]
    else:
        df_spandex.loc[i] = np.nan
        
df_spandex.name = "spandex"

df_ref = pd.concat([df_ref, df_spandex], axis=1)
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated(keep="last")]

In [127]:
#elastomultiester
df_elastomultiester = pd.Series(dtype=str)

for i in df_aux.index:
    elastomultiester_in_1 = "Elastomultiester" in df_aux.loc[i][1]
    if elastomultiester_in_1:
        df_elastomultiester.loc[i] = df_aux.loc[i][1]
    else:
        df_elastomultiester.loc[i] = np.nan

df_elastomultiester.name = "elastomultiester"

df_ref = pd.concat([df_ref, df_elastomultiester], axis=1)
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated(keep="last")]

In [128]:
#polyester
df_polyester = pd.Series(dtype=str)

for i in df_aux.index:
    polyester_in_1 = "Polyester" in df_aux.loc[i][1]
    if polyester_in_1:
        df_polyester.loc[i] = df_aux.loc[i][1]
    else:
        df_polyester.loc[i] = np.nan
        
df_polyester.name = "polyester"

df_ref = pd.concat([df_ref, df_polyester], axis=1)
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated(keep="last")]

In [129]:
#lyocell
df_lyocell = pd.Series(dtype=str)

for i in df_aux.index:
    lyocell_in_0 = "Lyocell" in df_aux.loc[i][0]
    lyocell_in_1 = "Lyocell" in df_aux.loc[i][1]
    if lyocell_in_0:
        df_lyocell.loc[i] = df_aux.loc[i][0]
    elif lyocell_in_1:
        df_lyocell.loc[i] = df_aux.loc[i][1]
    else:
        df_lyocell.loc[i] = np.nan
        
df_lyocell.name = "lyocell"

df_ref = pd.concat([df_ref, df_lyocell], axis=1)
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated(keep="last")]

In [130]:
#rayon
df_rayon = pd.Series(dtype=str)

for i in df_aux.index:
    rayon_in_0 = "Rayon" in df_aux.loc[i][0]
    rayon_in_2 = "Rayon" in df_aux.loc[i][2]
    if rayon_in_0:
        df_rayon.loc[i] = df_aux.loc[i][0]
    elif rayon_in_2:
        df_rayon.loc[i] = df_aux.loc[i][2]
    else:
        df_rayon.loc[i] = np.nan
        
df_rayon.name = "rayon"

df_ref = pd.concat([df_ref, df_rayon], axis=1)
df_ref = df_ref.iloc[:, ~df_ref.columns.duplicated(keep="last")]

In [131]:
df_ref.head()

Unnamed: 0,cotton,spandex,elastomultiester,polyester,lyocell,rayon
0,Cotton 99%,Spandex 1%,,,,
1,Cotton 99%,Spandex 1%,,,,
2,Cotton 99%,Spandex 1%,,,,
3,Cotton 99%,Spandex 1%,,,,
4,Cotton 99%,Spandex 1%,,,,


In [132]:
#joining df_ref and df
df = pd.concat([df, df_ref], axis=1)

In [133]:
#drop product_composition column
df = df.drop("product_composition_shell", axis=1)

In [134]:
#get only the number composition
columns_composition = df_ref.columns
for column in columns_composition:
    df[column] = df[column].apply(lambda x: int(re.search('\d+', x).group(0))/100 if pd.notnull(x) else x)

### product_composition_pocket

In [135]:
#drop product_composition_pocker because what really matter is the shell
df = df.drop("product_composition_pocket", axis=1)

### scrapy_datetime

In [137]:
#reordering the columns
column_order = ['product_id', 'product_name', 'product_type', 'product_colors',
       'product_price', 'cotton', 'spandex',
       'elastomultiester', 'polyester', 'lyocell', 'rayon', 'scrapy_datetime']

df = df[column_order]

In [138]:
#convert to datetime type
df["scrapy_datetime"] = pd.to_datetime(df["scrapy_datetime"], format="%Y-%m-%d %H:%M:%S")

# Final DataFrame

In [139]:
df.head()

Unnamed: 0,product_id,product_name,product_type,product_colors,product_price,cotton,spandex,elastomultiester,polyester,lyocell,rayon,scrapy_datetime
0,1024256001,slim_jeans,men_jeans_slim,black,19.99,0.99,0.01,,,,,2022-12-15 20:56:55
1,1024256001,slim_jeans,men_jeans_slim,light_denim_blue,19.99,0.99,0.01,,,,,2022-12-15 20:56:55
2,1024256001,slim_jeans,men_jeans_slim,light_denim_blue,19.99,0.99,0.01,,,,,2022-12-15 20:56:55
3,1024256001,slim_jeans,men_jeans_slim,denim_blue,19.99,0.99,0.01,,,,,2022-12-15 20:56:55
4,1024256001,slim_jeans,men_jeans_slim,dark_blue,19.99,0.99,0.01,,,,,2022-12-15 20:56:55


In [140]:
df.shape

(849, 12)

In [141]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 849 entries, 0 to 848
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   product_id        849 non-null    int64         
 1   product_name      849 non-null    object        
 2   product_type      849 non-null    object        
 3   product_colors    849 non-null    object        
 4   product_price     849 non-null    float64       
 5   cotton            843 non-null    float64       
 6   spandex           666 non-null    float64       
 7   elastomultiester  48 non-null     float64       
 8   polyester         51 non-null     float64       
 9   lyocell           6 non-null      float64       
 10  rayon             6 non-null      float64       
 11  scrapy_datetime   849 non-null    datetime64[ns]
dtypes: datetime64[ns](1), float64(7), int64(1), object(3)
memory usage: 79.7+ KB


In [142]:
df.isna().sum()

product_id            0
product_name          0
product_type          0
product_colors        0
product_price         0
cotton                6
spandex             183
elastomultiester    801
polyester           798
lyocell             843
rayon               843
scrapy_datetime       0
dtype: int64

In [143]:
#verifying columns that has lyocell or rayon and cleaning them
df[df.rayon.notnull()]

Unnamed: 0,product_id,product_name,product_type,product_colors,product_price,cotton,spandex,elastomultiester,polyester,lyocell,rayon,scrapy_datetime
749,1063141001,loose_jeans,men_jeans_loose,denim_blue,44.99,,,,,0.55,0.15,2022-12-15 20:56:55
782,1044249001,loose_pull-on_jeans,men_jeans_loose,light_denim_blue,44.99,,,,,0.5,0.5,2022-12-15 20:56:55
783,1044249001,loose_pull-on_jeans,men_jeans_loose,dark_denim_blue,44.99,,,,,0.5,0.5,2022-12-15 20:56:55
804,1051644001,loose_jeans,men_jeans_loose,denim_blue,44.99,,,,,0.55,0.15,2022-12-15 20:56:55
805,1044249002,loose_pull-on_jeans,men_jeans_loose,light_denim_blue,44.99,,,,,0.5,0.5,2022-12-15 20:56:55
806,1044249002,loose_pull-on_jeans,men_jeans_loose,dark_denim_blue,44.99,,,,,0.5,0.5,2022-12-15 20:56:55


In [144]:
df.loc[749, "cotton"] = 0.3
df.loc[804, "cotton"] = 0.3

In [145]:
#dropping duplicates
df = df.drop_duplicates()

In [146]:
df.shape

(624, 12)

In [147]:
#reseting index
df = df.reset_index(drop=True)

In [151]:
#fillna with 0, meaning that it doesn't compose the product
df = df.fillna(0)

In [152]:
df.head()

Unnamed: 0,product_id,product_name,product_type,product_colors,product_price,cotton,spandex,elastomultiester,polyester,lyocell,rayon,scrapy_datetime
0,1024256001,slim_jeans,men_jeans_slim,black,19.99,0.99,0.01,0.0,0.0,0.0,0.0,2022-12-15 20:56:55
1,1024256001,slim_jeans,men_jeans_slim,light_denim_blue,19.99,0.99,0.01,0.0,0.0,0.0,0.0,2022-12-15 20:56:55
2,1024256001,slim_jeans,men_jeans_slim,denim_blue,19.99,0.99,0.01,0.0,0.0,0.0,0.0,2022-12-15 20:56:55
3,1024256001,slim_jeans,men_jeans_slim,dark_blue,19.99,0.99,0.01,0.0,0.0,0.0,0.0,2022-12-15 20:56:55
4,1024256001,slim_jeans,men_jeans_slim,dark_denim_blue,19.99,0.99,0.01,0.0,0.0,0.0,0.0,2022-12-15 20:56:55


In [154]:
df.to_csv("generated_data/cleaned_data.csv")