In [1]:
import pandas as pd
import re
import datetime

## Loading and Displaying initial data

In [2]:
df = pd.read_csv('../Data/amazon_0512.csv')
df.head(2)

Unnamed: 0,name,price_current,price_original,reviews_number,seller,score,Marca,Series,Color,Alto del producto,...,Número de puertos USB 2.0,Número de puertos USB 3.0,Tipo de unidad óptica,Sistema operativo,Lithium Battery Energy Content,Number of Lithium Ion Cells,Unnamed: 18,Peso del envío,Número de modelo del producto,Producto en Amazon.com.mx desde
0,\r\n \r\n \r...,"$14,999.00",,156 calificaciones,Huawei,4.7 de 5 estrellas,Huawei,Matebook D 15,Gris,17 millimeters,...,2.0,1.0,,Windows 10 Home,42 watt_hours,2.0,,2.3 Kg,6901443370764.0,20 de febrero de 2020
1,\r\n \r\n \r...,"$17,999.00","$19,999.00",,Asus,,Asus,X512FA-BR1412T,,,...,2.0,2.0,Ninguno,Windows 10,1 kilowatt_hours,1.0,,2.5 Kg,,10 de marzo de 2020


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 512 entries, 0 to 511
Data columns (total 39 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   name                                 512 non-null    object 
 1   price_current                        220 non-null    object 
 2   price_original                       41 non-null     object 
 3   reviews_number                       469 non-null    object 
 4   seller                               512 non-null    object 
 5   score                                469 non-null    object 
 6   Marca                                512 non-null    object 
 7   Series                               475 non-null    object 
 8   Color                                293 non-null    object 
 9   Alto del producto                    469 non-null    object 
 10  Ancho del producto                   468 non-null    object 
 11  Tamaño de la pantalla           

## Defining functions for transformation and cleaning

In [4]:
def del_na(column):
    return column[column.notna()]

In [5]:
def only_first(cell, cast='int'):
    if cast == 'int':
        return int(round(float(cell.split()[0]), 0) )
    else:
        return float(cell.split()[0])

In [6]:
def to_watts(cell):
    cell = cell.split(' ')
    watts = float(cell[0])
    if 'kilowatt' in cell[1]:
        watts *=  1000
    elif 'milliamp' in cell[1]:
        volts = 12
        watts = (watts*volts)/1000
    
    return int(watts)

In [7]:
def cast_int(value):
    try:
        return int(value)
    except:
        return None        

In [8]:
def rename_col(dict_name):
    df.rename(columns = dict_name, inplace = True)

In [9]:
months_list = ["enero", "febrero", "marzo", 
                "abril", "mayo", "junio", 
                "julio", "agosto", "septiembre", 
                "octubre", "noviembre", "diciembre"]
month_dict = {}
month_value = 1

for month in months_list:
    month_dict[month] = month_value
    month_value += 1

def to_date(date_text):
    date_text = date_text.split()
    day = int(date_text[0])
    month = month_dict[date_text[2]]
    year = int(date_text[4])

    return datetime.date(year, month, day)

## Transforming data

In [10]:
# Deleted empty column
df.drop(df.columns[35], axis='columns', inplace=True)

In [11]:
# Delete '/n' of each name
space = re.compile('^\s+\n*(.*?.)\s+\n\s*')

df.name = df.name.apply( 
    lambda x: space.sub(r'\1', x) )

In [12]:
# price_current
df.price_current = del_na(df.price_current).apply(
    lambda x: float( x.replace('$', '').replace(',', '')) )

In [13]:
# price_original
df.price_original = del_na(df.price_original).apply(
    lambda x: float(x.replace('$', '').replace(',', '')) )

In [14]:
# reviews_number
df.reviews_number = del_na(df.reviews_number).apply(
    lambda x: only_first(x.replace(',', ''), 'int') )

In [15]:
# score
df.score = del_na(df.score).apply(
    only_first, args = ('float',) )

In [16]:
# alto
rename_col({'Alto del producto' : 'height_cm'})

df.height_cm = del_na(df.height_cm).apply( 
    lambda x:  only_first(x, 'float')   if 'cent' in x    else only_first(x) / 100 )

In [17]:
# ancho
rename_col({'Ancho del producto' : 'width_cm'})

df.width_cm = del_na(df.width_cm).apply( 
    lambda x: only_first(x, 'float')    if 'cent' in x    else only_first(x) / 100 )

In [18]:
# screen_size_in
rename_col({'Tamaño de la pantalla' : 'screen_size_in'})
df.screen_size_in = del_na(df.screen_size_in).apply(
    lambda x: round(only_first(x, 'float'), 0))
df.screen_size_in = df.screen_size_in.astype('category')

In [19]:
# proc_speed_ghz
rename_col({'Velocidad del procesador' : 'proc_speed_ghz'})

df.proc_speed_ghz = del_na(df.proc_speed_ghz).apply(only_first, args = ('float',) )

In [20]:
# ram
rename_col({'Tamaño de RAM': 'ram'})

df.ram = del_na(df.ram).apply(only_first)

In [21]:
# ram_max
rename_col({'Memoria máxima compatible': 'ram_max'})

df.ram_max = del_na(df.ram_max).apply(only_first)

In [22]:
# memory
rename_col({'Tamaño de la unidad de disco duro' : 'memory',})

df.memory = del_na(df.memory).apply(only_first)

In [23]:
# gpu_ram
rename_col({'Tamaño de RAM de la tarjeta gráfica' : 'gpu_ram',})

df.gpu_ram = del_na(df.gpu_ram).apply(only_first)

In [24]:
# usb_2
rename_col({'Número de puertos USB 2.0': 'usb_2',})

df.usb_2 = del_na(df.usb_2).apply(int)

In [25]:
# usb_3
rename_col({'Número de puertos USB 3.0': 'usb_3',})

df.usb_3 = del_na(df.usb_3).apply(int)

In [26]:
# battey
rename_col({'Lithium Battery Energy Content' : 'battery_wh',})

df.battery_wh = del_na(df.battery_wh).apply(to_watts)

In [27]:
# battery_cells
rename_col({'Number of Lithium Ion Cells' : 'battery_cells',})
df.battery_cells = del_na(df.battery_cells).apply(int)
df.battery_cells = df.battery_cells.astype('category')

In [28]:
# weight_kg
rename_col({'Peso del envío' : 'weight_kg'})

df.weight_kg = del_na(df.weight_kg).apply(only_first, args=('float',) )

In [29]:
# new column with resolution X of scree from screen resolution
df['resolution_x'] = del_na(df['Resolución de la pantalla']).apply(
    lambda x: x.replace(' ', '').split('x')[0]
    ).apply(cast_int)

In [30]:
# new column with resolution Y of screen from screen resolution
df['resolution_y'] = del_na(df['Resolución de la pantalla']).apply( 
    lambda x: x.split('x')[1].split()[0] if len(x.split('x')) > 1 else None 
    ).apply(cast_int)

In [31]:
# Ordering columns ['resolution_x' & 'resolution_y'] after column index 12
df = df[df.columns.to_list()[:13] + ['resolution_x', 'resolution_y'] + df.columns.to_list()[12:-2]]

In [32]:
# Drop old column of screen resolution
del df['Resolución de la pantalla']

In [33]:
rename_col({'Tipo de memoria del equipo' : 'ram_type'})
df.ram_type = del_na(df.ram_type).apply(lambda x: x.split()[0])
df.ram_type = df.ram_type.astype('category')

In [34]:
# since
rename_col({'Producto en Amazon.com.mx desde' : 'since'})
df.since = df.since.apply(to_date)
df.since = df.since.astype('datetime64[ns]')

## Transforming DTypes

In [35]:
# Perform types of DataFrame
df = df.convert_dtypes()


In [36]:
rename_col({'Marca' : 'brand'})
df.brand = df.brand.astype('category')

In [37]:
df.seller = df.seller.astype('category')

In [38]:
rename_col({'Marca del procesador' : 'proc_brand'})
df.proc_brand = df.proc_brand.astype('category')

In [39]:
rename_col({'Interfaz de la unidad de disco duro' : 'memory_interface'})
df.memory_interface = df.memory_interface.astype('category')

In [40]:
rename_col({'Sistema operativo' : 'os'})
df.os = df.os.astype('category')

In [41]:
rename_col({'Interfaz de la tarjeta gráfica' : 'gpu_interface'})
df.gpu_interface = df.gpu_interface.astype('category')

## Showing clean data and exporting processed data

In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 512 entries, 0 to 511
Data columns (total 39 columns):
 #   Column                             Non-Null Count  Dtype         
---  ------                             --------------  -----         
 0   name                               512 non-null    string        
 1   price_current                      220 non-null    float64       
 2   price_original                     41 non-null     float64       
 3   reviews_number                     469 non-null    Int64         
 4   seller                             512 non-null    category      
 5   score                              469 non-null    float64       
 6   brand                              512 non-null    category      
 7   Series                             475 non-null    string        
 8   Color                              293 non-null    string        
 9   height_cm                          469 non-null    float64       
 10  width_cm                           468

In [43]:
df.head(2)

Unnamed: 0,name,price_current,price_original,reviews_number,seller,score,brand,Series,Color,height_cm,...,Tipo de conexión inalámbrica,usb_2,usb_3,Tipo de unidad óptica,os,battery_wh,battery_cells,weight_kg,Número de modelo del producto,since
0,"Huawei MateBook D 15"" - AMD Ryzen, Windows 10-...",14999.0,,156.0,Huawei,4.7,Huawei,Matebook D 15,Gris,0.17,...,Bluetooth,2,1,,Windows 10 Home,42,2.0,2.3,6901443370764.0,2020-02-20
1,"Asus Laptop VivoBook 15.6"", Core i7, 8GB RAM,...",17999.0,19999.0,,Asus,,Asus,X512FA-BR1412T,,,...,,2,2,Ninguno,Windows 10,1000,1.0,2.5,,2020-03-10


In [44]:
df.to_csv('../Data/amazon_clean_' + datetime.datetime.today().strftime('%m%d') + '.csv')