In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors
import re
import datetime

%matplotlib inline


In [2]:
# Load the cleaned Amazon product data, using the first CSV column as the index.
df = pd.read_csv('../Data/amazon_clean_0512.csv', index_col=0)
# Preview the first two rows to check that the file was parsed as expected.
df.head(2)

Unnamed: 0,name,price_current,price_original,reviews_number,seller,score,brand,Series,Color,height_cm,...,Tipo de conexión inalámbrica,usb_2,usb_3,Tipo de unidad óptica,os,battery_wh,battery_cells,weight_kg,Número de modelo del producto,since
0,"Huawei MateBook D 15"" - AMD Ryzen, Windows 10-...",14999.0,,156.0,Huawei,4.7,Huawei,Matebook D 15,Gris,0.17,...,Bluetooth,2.0,1.0,,Windows 10 Home,42.0,2.0,2.3,6901443370764.0,2020-02-20
1,"Asus Laptop VivoBook 15.6"", Core i7, 8GB RAM,...",17999.0,19999.0,,Asus,,Asus,X512FA-BR1412T,,,...,,2.0,2.0,Ninguno,Windows 10,1000.0,1.0,2.5,,2020-03-10


## Dropping columns that are not useful for this analysis

In [3]:
# Remove the columns that are not used in this analysis.
df = df.drop(
    [
        'Tecnología de la memoria',
        'Tipo de unidad óptica',
        'Tipo de conexión inalámbrica',
        'Número de modelo del producto',
        'usb_3',
        'usb_2',
        'Series',
        'Color',
        'Descripción del disco duro',
        'Tipo de RAM para gráficos',
        'memory_interface',
        'gpu_interface',
        'resolution_x',
        'resolution_y',
        'gpu_ram',
        'Descripción de la tarjeta gráfica',
        'Coprocesador de gráficos',
        'Tipo de procesador',
    ],
    axis='columns',
)

In [4]:
# Inspect the remaining columns: dtypes and non-null counts after the drop.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 512 entries, 0 to 511
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   name            512 non-null    object 
 1   price_current   220 non-null    float64
 2   price_original  41 non-null     float64
 3   reviews_number  469 non-null    float64
 4   seller          512 non-null    object 
 5   score           469 non-null    float64
 6   brand           512 non-null    object 
 7   height_cm       469 non-null    float64
 8   width_cm        468 non-null    float64
 9   screen_size_in  509 non-null    float64
 10  proc_brand      488 non-null    object 
 11  proc_speed_ghz  448 non-null    float64
 12  ram             506 non-null    float64
 13  ram_type        368 non-null    object 
 14  ram_max         342 non-null    float64
 15  memory          429 non-null    float64
 16  os              496 non-null    object 
 17  battery_wh      508 non-null    flo

## Converting dtypes

In [5]:
# Let pandas pick NA-aware dtypes for every column (e.g. object -> string,
# whole-number floats -> Int64, as the later df.info() output shows).
df = df.convert_dtypes()


In [6]:
# Normalise brand names to lower case, then store them as a categorical.
df['brand'] = df['brand'].str.lower().astype('category')

In [7]:
# Normalise seller names to lower case, then store them as a categorical.
df['seller'] = df['seller'].str.lower().astype('category')


In [8]:
# Processor brand is stored as a categorical.
df['proc_brand'] = df['proc_brand'].astype('category')


In [9]:
# Operating system is stored as a categorical.
df['os'] = df['os'].astype('category')


In [10]:
# df.screen_size_in = df.screen_size_in.astype('category')

In [11]:
# RAM type is stored as a categorical.
df['ram_type'] = df['ram_type'].astype('category')

In [12]:
# df.ram = df.ram.astype('category')

In [13]:
# Parse the 'since' column into datetime values.
df['since'] = pd.to_datetime(df['since'])

In [14]:
# Check the dtypes after the conversions above.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 512 entries, 0 to 511
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   name            512 non-null    string        
 1   price_current   220 non-null    float64       
 2   price_original  41 non-null     float64       
 3   reviews_number  469 non-null    Int64         
 4   seller          512 non-null    category      
 5   score           469 non-null    float64       
 6   brand           512 non-null    category      
 7   height_cm       469 non-null    float64       
 8   width_cm        468 non-null    float64       
 9   screen_size_in  509 non-null    Int64         
 10  proc_brand      488 non-null    category      
 11  proc_speed_ghz  448 non-null    float64       
 12  ram             506 non-null    Int64         
 13  ram_type        368 non-null    category      
 14  ram_max         342 non-null    Int64         
 15  memory

## Working with NAs and duplicates

#### Helper functions to simplify the handling of NaNs

In [15]:
def repl_na(column, value=0):
    """Fill the missing entries of ``column`` with ``value`` and return the Series.

    Parameters
    ----------
    column : pandas.Series
        Series whose missing entries are filled. It is modified in place.
    value : scalar or Series, default 0
        Replacement passed straight to ``Series.fillna``.

    Returns
    -------
    pandas.Series
        The same ``column`` object, so the result can be assigned back to its
        DataFrame, e.g. ``df['col'] = repl_na(df['col'])``.

    Notes
    -----
    Mutating a Series selected from a DataFrame (``df.col`` / ``df['col']``)
    is not guaranteed to update the DataFrame; under pandas Copy-on-Write it
    never does. Callers should use the return value rather than rely on the
    in-place side effect.
    """
    column.fillna(value, inplace=True)
    return column

In [16]:
def most_common(column):
    """Return the most frequent non-null value in ``column``.

    Returns ``None`` when ``column`` has no non-null values to count.
    """
    # value_counts() skips missing values and sorts by descending frequency,
    # so the first index label is the most frequent value.
    counts = column.value_counts()
    if len(counts) == 0:
        return None
    return counts.index[0]

In [17]:
def repl_common(column):
    """Fill the missing entries of ``column`` in place with its most frequent value.

    The fill value comes from ``most_common`` and is applied with ``repl_na``.
    """
    fill_value = most_common(column)
    repl_na(column, fill_value)

#### Filling some fields with custom values

In [18]:
# Missing review counts become 0. The filled Series is assigned back because
# an in-place fillna on `df.reviews_number` may act on a copy (it always does
# under pandas Copy-on-Write) and leave `df` unchanged.
df['reviews_number'] = df['reviews_number'].fillna(0)

In [19]:
# Missing scores become 0. The filled Series is assigned back because an
# in-place fillna on `df.score` may act on a copy (it always does under
# pandas Copy-on-Write) and leave `df` unchanged.
df['score'] = df['score'].fillna(0)


In [20]:
# Missing current prices are filled with the median current price. The result
# is assigned back because an in-place fillna on `df.price_current` may act on
# a copy (it always does under pandas Copy-on-Write) and leave `df` unchanged.
df['price_current'] = df['price_current'].fillna(df['price_current'].median())

In [21]:
# Where the original price is missing, use the current price of the same row
# (fillna aligns the two Series on the index). The result is assigned back
# because an in-place fillna on `df.price_original` may act on a copy (it
# always does under pandas Copy-on-Write) and leave `df` unchanged.
df['price_original'] = df['price_original'].fillna(df['price_current'])

In [22]:
# Missing heights are filled with the mean height. The result is assigned back
# because an in-place fillna on `df.height_cm` may act on a copy (it always
# does under pandas Copy-on-Write) and leave `df` unchanged.
df['height_cm'] = df['height_cm'].fillna(df['height_cm'].mean())

In [23]:
# Missing widths are filled with the mean width. The result is assigned back
# because an in-place fillna on `df.width_cm` may act on a copy (it always
# does under pandas Copy-on-Write) and leave `df` unchanged.
df['width_cm'] = df['width_cm'].fillna(df['width_cm'].mean())

#### The remaining columns with NaNs will be replaced by the most common value

In [24]:
# List the columns that still contain at least one missing value.
df.columns[df.isna().any()]

Index(['screen_size_in', 'proc_brand', 'proc_speed_ghz', 'ram', 'ram_type',
       'ram_max', 'memory', 'os', 'battery_wh', 'battery_cells'],
      dtype='object')

In [25]:
# Fill every column that still has missing values with its most frequent
# value. The filled Series is assigned back to the DataFrame: mutating the
# Series returned by `df[column]` in place may act on a copy (it always does
# under pandas Copy-on-Write) and leave `df` unchanged.
for column in df.columns[df.isna().any()]:
    df[column] = df[column].fillna(most_common(df[column]))

In [26]:
# Confirm that no column has missing values left.
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 512 entries, 0 to 511
Data columns (total 21 columns):
 #   Column          Non-Null Count  Dtype         
---  ------          --------------  -----         
 0   name            512 non-null    string        
 1   price_current   512 non-null    float64       
 2   price_original  512 non-null    float64       
 3   reviews_number  512 non-null    Int64         
 4   seller          512 non-null    category      
 5   score           512 non-null    float64       
 6   brand           512 non-null    category      
 7   height_cm       512 non-null    float64       
 8   width_cm        512 non-null    float64       
 9   screen_size_in  512 non-null    Int64         
 10  proc_brand      512 non-null    category      
 11  proc_speed_ghz  512 non-null    float64       
 12  ram             512 non-null    Int64         
 13  ram_type        512 non-null    category      
 14  ram_max         512 non-null    Int64         
 15  memory

## Detecting and managing outliers

#### Configuration for BoxPlots

In [27]:
def boxplot_config(ax, x, labels=None, title='', xlabel='', ylabel=''):
    """Draw a styled box plot of ``x`` on ``ax``, with a distinct colour per box.

    Parameters
    ----------
    ax : matplotlib.axes.Axes
        Axes to draw on.
    x : array-like or sequence of array-likes
        Data forwarded to ``Axes.boxplot``.
    labels : sequence of str, optional
        Labels forwarded to ``Axes.boxplot``.
    title, xlabel, ylabel : str, optional
        Axes title and axis labels.

    Returns
    -------
    None
        The plot is drawn on ``ax`` as a side effect.
    """
    # Customisation of symbols and lines
    boxprops = dict(linestyle='--', linewidth=2, color='darkgoldenrod')
    flierprops = dict(marker='o', markerfacecolor='green', markersize=6, label='outlier')
    medianprops = dict(linestyle='-.', linewidth=2.5, color='firebrick', label='median')
    meanpointprops = dict(marker='D', markeredgecolor='black', markerfacecolor='skyblue', label='mean')

    # NOTE(review): Matplotlib 3.9 renamed the `labels` keyword of boxplot to
    # `tick_labels`; confirm the target Matplotlib version before switching.
    bbplot = ax.boxplot(x,
                        patch_artist=True,  # enable fill with color
                        labels=labels,
                        showmeans=True,
                        boxprops=boxprops,
                        flierprops=flierprops,
                        medianprops=medianprops,
                        meanprops=meanpointprops
                        )

    # Assign each box a colour spread evenly over the 'summer' colormap.
    # linspace also copes with zero boxes (empty) and one box ([0.0]).
    boxes = bbplot['boxes']
    for frac, patch in zip(np.linspace(0, 1, len(boxes)), boxes):
        patch.set_facecolor(plt.cm.summer(frac))

    # Adding labels
    ax.set_title(title)
    ax.set_xlabel(xlabel)
    ax.set_ylabel(ylabel)

    # Every box contributes its own 'median', 'mean' and 'outlier' artists, so
    # a plain ax.legend() repeats each entry once per box. Keep one handle per
    # label instead.
    handles, names = ax.get_legend_handles_labels()
    unique = dict(zip(names, handles))
    ax.legend(list(unique.values()), list(unique.keys()))


## Showing filtered data and exporting processed data


# Index the data by the `since` date so it can be resampled over time; the
# column itself is kept as well. set_index already returns a new DataFrame,
# so no extra copy is needed.
df_time = df.set_index('since', drop=False)
df_time.head(2)

# Yearly median of the numeric columns only: on pandas >= 2.0 taking the
# median over string/category columns raises a TypeError. The explicit
# YearEnd offset is the year-end frequency formerly spelled 'Y', an alias that
# newer pandas versions deprecate.
df_time.select_dtypes(include='number').resample(pd.offsets.YearEnd()).median()

# Export the processed data; the file name carries today's month and day.
df.to_csv('../Data/amazon_filtered_' + datetime.datetime.today().strftime('%m%d') + '.csv')