In [1]:
import pandas as pd
import re
import datetime

## Loading and Displaying initial data

In [2]:
df = pd.read_csv('AmazonScraper/amazon_0507.csv')
df.head(2)

Unnamed: 0,name,price_current,price_original,reviews_number,seller,score,Marca,Series,Color,Alto del producto,...,Número de puertos USB 2.0,Número de puertos USB 3.0,Tipo de unidad óptica,Sistema operativo,Lithium Battery Energy Content,Number of Lithium Ion Cells,Unnamed: 18,Peso del envío,Número de modelo del producto,Producto en Amazon.com.mx desde
0,\n \n \n ...,"$14,999.00",,139 calificaciones,Huawei,4.7 de 5 estrellas,Huawei,Matebook D 15,Gris,17 millimeters,...,2.0,1.0,,Windows 10 Home,42 watt_hours,2.0,,2.3 Kg,6901443370764,20 de febrero de 2020
1,\n \n \n ...,"$20,999.00",,55 calificaciones,Huawei,4.3 de 5 estrellas,HUAWEI,Matebook 13,,15 millimeters,...,,2.0,Ninguno,Windows 10,41.80 watt_hours,1.0,,2.2 Kg,Wright W19GL,14 de enero de 2020


In [3]:
df.info()



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151 entries, 0 to 150
Data columns (total 39 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   name                                 151 non-null    object 
 1   price_current                        107 non-null    object 
 2   price_original                       36 non-null     object 
 3   reviews_number                       132 non-null    object 
 4   seller                               151 non-null    object 
 5   score                                132 non-null    object 
 6   Marca                                151 non-null    object 
 7   Series                               142 non-null    object 
 8   Color                                89 non-null     object 
 9   Alto del producto                    134 non-null    object 
 10  Ancho del producto                   134 non-null    object 
 11  Tamaño de la pantalla           

## Defining functions for transformation and cleaning

In [4]:
def del_na(column):
    return column[column.notna()]

def only_first(cell, cast='int'):
    if cast == 'int':
        return int(cell.split()[0])
    else:
        return float(cell.split()[0])

def to_watts(cell):
    cell = cell.split(' ')
    watts = float(cell[0])
    if 'kilowatt' in cell[1]:
        watts *=  1000
    elif 'milliamp' in cell[1]:
        volts = 12
        watts = (watts*volts)/1000
    
    return int(watts)

## Renaming columns to make transformation easier

In [5]:
# Deleted empty column
df.drop(df.columns[35], axis='columns', inplace=True)

# Rename some columns
df.rename(columns = {
    'Alto del producto' : 'height_cm',
    'Ancho del producto' : 'width_cm',
    'Tamaño de la pantalla' : 'screen_size_in',
    'Velocidad del procesador' : 'proc_speed_ghz',
    'Tamaño de RAM': 'ram',
    'Memoria máxima compatible': 'ram_max',
    'Tamaño de la unidad de disco duro' : 'memory',
    'Tamaño de RAM de la tarjeta gráfica' : 'gpu_ram',
    'Número de puertos USB 2.0': 'usb_2',
    'Número de puertos USB 3.0': 'usb_3',
    'Lithium Battery Energy Content' : 'battery_wh',
    'Number of Lithium Ion Cells' : 'battery_cells',
    'Peso del envío' : 'weight_kg',
    }, inplace=True)

df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151 entries, 0 to 150
Data columns (total 38 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   name                                 151 non-null    object 
 1   price_current                        107 non-null    object 
 2   price_original                       36 non-null     object 
 3   reviews_number                       132 non-null    object 
 4   seller                               151 non-null    object 
 5   score                                132 non-null    object 
 6   Marca                                151 non-null    object 
 7   Series                               142 non-null    object 
 8   Color                                89 non-null     object 
 9   height_cm                            134 non-null    object 
 10  width_cm                             134 non-null    object 
 11  screen_size_in                  

## Transforming data

In [6]:
# Delete '/n' of each name
space = re.compile('^\s+\n*(.*?.)\s+\n\s*')
df.name = df.name.apply(lambda x: space.sub(r'\1', x))

# price_current
df.price_current = del_na(df.price_current).apply(
    lambda x: float( x.replace('$', '').replace(',', '')) )

# price_original
df.price_original = del_na(df.price_original).apply(
    lambda x: float(x.replace('$', '').replace(',', '')) )

# reviews_number
df.reviews_number = del_na(df.reviews_number).apply(
    lambda x: only_first(x.replace(',', ''), 'int') )

# score
df.score = del_na(df.score).apply(
    only_first, args=('float',) )

# alto
df.height_cm = del_na(df.height_cm).apply( 
    lambda x:  only_first(x, 'float')   if 'cent' in x    else only_first(x) / 100 )

# ancho
df.width_cm = del_na(df.width_cm).apply( 
    lambda x: only_first(x, 'float')    if 'cent' in x    else only_first(x) / 100 )

# screen_size_in
df.screen_size_in = del_na(df.screen_size_in).apply(only_first, args=('float',) )

# proc_speed_ghz
df.proc_speed_ghz = del_na(df.proc_speed_ghz).apply(only_first, args = ('float',) )

# ram
df.ram = del_na(df.ram).apply(only_first)

# ram_max
df.ram_max = del_na(df.ram_max).apply(only_first, args=('float',) )

# memory
df.memory = del_na(df.memory).apply(only_first)

# gpu_ram
df.gpu_ram = del_na(df.gpu_ram).apply(only_first)

# usb_2
df.usb_2 = del_na(df.usb_2).apply(int)

# usb_3
df.usb_3 = del_na(df.usb_3).apply(int)

# battey
df.battery_wh = del_na(df.battery_wh).apply(to_watts)

# battery_cells
df.battery_cells = del_na(df.battery_cells).apply(int)

# weight_kg
df.weight_kg = del_na(df.weight_kg).apply(only_first, args=('float',) )

## Showing clean data and exporting processed data

In [21]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 151 entries, 0 to 150
Data columns (total 38 columns):
 #   Column                               Non-Null Count  Dtype  
---  ------                               --------------  -----  
 0   name                                 151 non-null    object 
 1   price_current                        107 non-null    float64
 2   price_original                       36 non-null     float64
 3   reviews_number                       132 non-null    float64
 4   seller                               151 non-null    object 
 5   score                                132 non-null    float64
 6   Marca                                151 non-null    object 
 7   Series                               142 non-null    object 
 8   Color                                89 non-null     object 
 9   height_cm                            134 non-null    float64
 10  width_cm                             134 non-null    float64
 11  screen_size_in                  

In [22]:
df.head(2)

Unnamed: 0,name,price_current,price_original,reviews_number,seller,score,Marca,Series,Color,height_cm,...,Tipo de conexión inalámbrica,usb_2,usb_3,Tipo de unidad óptica,Sistema operativo,battery_wh,battery_cells,weight_kg,Número de modelo del producto,Producto en Amazon.com.mx desde
0,"Huawei MateBook D 15"" - AMD Ryzen, Windows 10-...",14999.0,,139.0,Huawei,4.7,Huawei,Matebook D 15,Gris,0.17,...,Bluetooth,2.0,1.0,,Windows 10 Home,42.0,2.0,2.3,6901443370764,20 de febrero de 2020
1,HUAWEI Matebook 13 New- Computadora portátil u...,20999.0,,55.0,Huawei,4.3,HUAWEI,Matebook 13,,0.15,...,,,2.0,Ninguno,Windows 10,41.0,1.0,2.2,Wright W19GL,14 de enero de 2020


In [9]:
df.to_csv('amazon_clean_' + datetime.datetime.today().strftime('%m%d') + '.csv')

In [20]:
df.usb_2 = del_na(df.usb_2).astype(int)