In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.colors
import re
import datetime

%matplotlib inline


In [None]:
# Load the cleaned dataset; the first CSV column becomes the row index.
# The path is relative to the notebook's working directory.
df = pd.read_csv('../Data/amazon_clean_0512.csv', index_col=0)
df.head(2)

## Dropping columns that are not useful for this case

In [None]:
# Discard the columns that are not used by the rest of this notebook.
df = df.drop(
    [
        'Tecnología de la memoria',
        'Tipo de unidad óptica',
        'Tipo de conexión inalámbrica',
        'Número de modelo del producto',
        'usb_3',
        'usb_2',
        'Series',
        'Color',
        'Descripción del disco duro',
        'Tipo de RAM para gráficos',
        'memory_interface',
        'gpu_interface',
        'resolution_x',
        'resolution_y',
        'gpu_ram',
        'Descripción de la tarjeta gráfica',
        'Coprocesador de gráficos',
        'Tipo de procesador',
    ],
    axis='columns',
)

In [None]:
# Inspect the remaining columns, their dtypes and non-null counts.
df.info()

## Converting dtypes

In [None]:
# Let pandas infer the best nullable extension dtype for every column.
df = df.convert_dtypes()


In [None]:
# Normalise brand names to lower case, then store them as a categorical.
df['brand'] = df['brand'].str.lower().astype('category')

In [None]:
# Normalise seller names to lower case, then store them as a categorical.
df['seller'] = df['seller'].str.lower().astype('category')


In [None]:
# Store the processor brand as a categorical.
df['proc_brand'] = df['proc_brand'].astype('category')


In [None]:
# Store the operating system as a categorical.
df['os'] = df['os'].astype('category')


In [None]:
# Store the RAM type as a categorical.
df['ram_type'] = df['ram_type'].astype('category')

In [None]:
# Parse the `since` column into datetime values.
df['since'] = pd.to_datetime(df['since'])

In [None]:
# Confirm the dtypes after the conversions above.
df.info()

In [None]:
# Summary statistics with pandas' default column selection.
df.describe()

In [None]:
# Summary (count, unique, top, freq) of the categorical columns.
df.describe(include='category')

## Working with NAs and duplicates

#### Helper functions to simplify the handling of NaNs

In [None]:
def most_common(column):
    """Return the most frequent non-null value of ``column``.

    ``value_counts`` sorts values by descending frequency and ignores nulls,
    so the first label of its result is the most common value. ``None`` is
    returned when the column holds no non-null values.
    """
    frequencies = column.value_counts()
    return frequencies.first_valid_index()

In [None]:
def repl_na(column, default = True, value = -1):
    """Fill the missing values of ``column`` in place.

    Parameters
    ----------
    column : pandas.Series
        Series whose missing entries are replaced; it is modified in place.
    default : bool, default True
        When True, numeric columns (any integer/float dtype, nullable or not,
        but not booleans) are filled with their median and categorical columns
        with their most frequent value (see ``most_common``). Columns of any
        other dtype are left untouched. When False, ``value`` is used
        whatever the dtype.
    value : scalar or Series, default -1
        Replacement used when ``default`` is False; passed straight to
        ``Series.fillna``.

    Returns
    -------
    None
    """
    if not default:
        column.fillna(value, inplace=True)
        return

    dtype = column.dtype
    # Check the dtype *kind* rather than its exact name: `convert_dtypes()`
    # yields nullable dtypes such as 'Float64', which a comparison against
    # the string 'float64' does not match.
    is_number = pd.api.types.is_numeric_dtype(dtype) and not pd.api.types.is_bool_dtype(dtype)

    if is_number:
        # NOTE(review): for integer dtypes a fractional median presumably
        # cannot be stored as-is - confirm whether rounding is wanted.
        column.fillna(column.median(), inplace=True)
    elif dtype == 'category':
        column.fillna(most_common(column), inplace=True)

#### Filling some fields with custom values

In [None]:
# Where the original price is missing, fall back to the current price.
# Assign the result back explicitly: mutating the Series returned by
# `df.price_original` in place is not guaranteed to update `df`
# (it does not under pandas Copy-on-Write).
df['price_original'] = df['price_original'].fillna(df['price_current'])

#### NaNs in the remaining columns will be replaced by the most common value for categorical columns and by the median for numeric columns

In [None]:
# Names of the columns that still contain at least one missing value.
df.columns[df.isna().any()]

In [None]:
# Fill every column that still has missing values using the default strategy
# of `repl_na`. The fill is applied to a copy which is then assigned back,
# because in-place changes made to the Series returned by `df[column]` are
# not guaranteed to reach `df` (they do not under pandas Copy-on-Write).
for column in df.columns[df.isna().any()]:
    filled = df[column].copy()
    repl_na(filled)
    df[column] = filled


In [None]:
# Check the non-null counts after filling missing values.
df.info()

In [None]:
# Summary statistics after filling missing values.
df.describe()

In [None]:
# Categorical summary after filling missing values.
df.describe(include='category')

## Detecting and managing outliers

#### Configuration for plotting

In [None]:
def plot_color(color_fractions, patches_to_paint, color_to_paint=plt.cm.Blues):
    """Colour each patch according to its fraction on a colormap.

    ``color_fractions`` is rescaled linearly between its own minimum and
    maximum, the rescaled value is looked up in ``color_to_paint`` (a
    colormap callable) and the resulting colour becomes the face colour of
    the patch at the same position in ``patches_to_paint``.
    """
    lowest, highest = color_fractions.min(), color_fractions.max()
    scale = matplotlib.colors.Normalize(lowest, highest)

    for patch, fraction in zip(patches_to_paint, color_fractions):
        patch.set_facecolor(color_to_paint(scale(fraction)))

In [None]:
def boxplot_config(ax, x, labels=None, title='', x_label='', y_label=''):
    """Draw a styled box plot of ``x`` on ``ax``.

    Boxes are filled with colours taken from the ``tab20`` colormap, the mean
    is shown with a diamond marker, and a legend identifies the outlier,
    median and mean symbols. ``labels`` is forwarded to ``ax.boxplot``;
    ``title``, ``x_label`` and ``y_label`` annotate the axes.
    """
    # Customisation of symbols and lines
    styles = {
        'boxprops': {'linestyle': '--', 'linewidth': 1, 'color': 'black'},
        'flierprops': {'marker': 'o', 'markerfacecolor': 'skyblue',
                       'markersize': 6, 'label': 'outlier'},
        'medianprops': {'linestyle': '-.', 'linewidth': 2.5, 'color': 'green',
                        'label': 'median'},
        'meanprops': {'marker': 'D', 'markeredgecolor': 'black',
                      'markerfacecolor': 'skyblue', 'label': 'mean'},
    }

    # NOTE(review): newer matplotlib releases appear to rename `labels` to
    # `tick_labels` - confirm against the version in use.
    artists = ax.boxplot(
        x,
        patch_artist=True,  # boxes are patches, so they can be filled
        labels=labels,
        showmeans=True,
        **styles,
    )

    # One evenly spaced colour fraction per box
    boxes = artists['boxes']
    n_boxes = len(boxes)
    plot_color(np.arange(n_boxes) / n_boxes, boxes, plt.cm.tab20)

    # Titles, axis labels and legend
    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.legend()


In [None]:
def histogram_config(ax, x, title='', x_label='', y_label=''):
    """Draw a histogram of ``x`` on ``ax`` with bars shaded by their height.

    Each bar's colour is taken from the default colormap of ``plot_color``
    using the bar's count relative to the tallest bar. Tick labels are
    rotated 90 degrees on the x axis and 45 degrees on the y axis.
    """
    counts, _bin_edges, bars = ax.hist(x, edgecolor='k', linewidth=1)

    # Relative bar height drives the colour intensity
    plot_color(counts / counts.max(), bars)

    ax.set_title(title)
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    for axis_name, angle in (('x', 90), ('y', 45)):
        ax.tick_params(axis=axis_name, labelrotation=angle)


In [None]:
def _print_plots(dataframe, dtype, plot_cols, plot_func):
    """Draw one plot per column of the selected dtype on a grid of subplots.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Source of the columns to plot.
    dtype : str or list of str
        Passed to ``DataFrame.select_dtypes(include=...)`` to pick columns.
    plot_cols : int
        Number of subplot columns in the grid; rows are added as needed.
    plot_func : callable
        Called as ``plot_func(axis, series, title=column_name)`` per column.

    Columns are drawn last-to-first, filling the grid row by row. Nothing is
    drawn when no column matches ``dtype``.
    """
    columns_names = dataframe.select_dtypes(include=dtype).columns
    plots_quantity = len(columns_names)

    if plots_quantity == 0:
        # plt.subplots cannot build a grid with zero rows.
        return

    plot_rows = int(np.ceil(plots_quantity / plot_cols))
    figure_size = (plot_cols * 10, plot_rows * 5)

    # squeeze=False keeps the axes array 2-D even when the grid has a single
    # row or column, so every layout can be handled uniformly.
    _, axes = plt.subplots(plot_rows, plot_cols, figsize=figure_size, squeeze=False)
    flat_axes = axes.ravel()

    for axis, name in zip(flat_axes, columns_names[::-1]):
        plot_func(axis, dataframe[name], title=name)

    # Hide the grid cells left over when the last row is not full.
    for axis in flat_axes[plots_quantity:]:
        axis.set_visible(False)

In [None]:
def dynamic_plot(dataframe, dtype = 'number', plot_cols = 2):
    """Plot every column of ``dataframe`` that matches ``dtype``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to plot.
    dtype : str or iterable of str, default 'number'
        Column selection, as accepted by ``DataFrame.select_dtypes``.
        ``'category'`` (or ``['category']``) draws histograms; ``'number'``
        (or ``['number']``) and any name containing ``int`` or ``float``
        draw box plots. Any other selection draws nothing.
    plot_cols : int, default 2
        Number of subplot columns in the figure.
    """
    # Treat a single name and a list of names uniformly, so that
    # ['category'] and ['number'] behave like their string forms.
    names = [dtype] if isinstance(dtype, str) else list(dtype)
    joined = ''.join(names).lower()

    wants_category = names == ['category']
    wants_number = names == ['number'] or 'int' in joined or 'float' in joined

    if wants_category:
        _print_plots(dataframe, dtype, plot_cols, histogram_config)
    elif wants_number:
        _print_plots(dataframe, dtype, plot_cols, boxplot_config)

### Plotting

In [None]:
# Numeric columns that will be box-plotted below.
df.select_dtypes(include = 'number').columns

In [None]:
# Box plots of every numeric column (default dtype='number').
dynamic_plot(df)

### Hunting Outliers & duplicates

In [None]:
# Remove the rows whose current price exceeds 40000, then renumber the index.
df = df.drop(index=df[df.price_current > 40000].index).reset_index(drop=True)

In [None]:
# Heights above 3 are treated as outliers and replaced by the column median.
# `.loc` writes to `df` directly; chained `df.col[mask] = ...` may act on a copy.
df.loc[df.height_cm > 3, 'height_cm'] = df.height_cm.median()

In [None]:
# Widths outside the 18-45 range are treated as outliers and replaced by the
# column median. `.loc` writes to `df` directly; chained assignment may not.
df.loc[(df.width_cm > 45) | (df.width_cm < 18), 'width_cm'] = df.width_cm.median()

In [None]:
# Values above 500 are divided by 1000 (presumably recorded in MHz rather
# than GHz - confirm), then the remaining value 64 is replaced by the median.
# `.loc` writes to `df` directly; chained `df.col[mask] = ...` may act on a copy.
speed_too_high = df.proc_speed_ghz > 500
df.loc[speed_too_high, 'proc_speed_ghz'] = df.loc[speed_too_high, 'proc_speed_ghz'] / 1000
df.loc[df.proc_speed_ghz == 64, 'proc_speed_ghz'] = df.proc_speed_ghz.median()

In [None]:
# RAM values above 128 are treated as outliers and replaced by the median.
# `.loc` writes to `df` directly; chained `df.col[mask] = ...` may act on a copy.
df.loc[df.ram > 128, 'ram'] = df.ram.median()

In [None]:
# Maximum-RAM values above 128 are treated as outliers and replaced by the median.
# `.loc` writes to `df` directly; chained `df.col[mask] = ...` may act on a copy.
df.loc[df.ram_max > 128, 'ram_max'] = df.ram_max.median()

In [None]:
# Battery capacities above 100 are treated as outliers and replaced by the median.
# `.loc` writes to `df` directly; chained `df.col[mask] = ...` may act on a copy.
df.loc[df.battery_wh > 100, 'battery_wh'] = df.battery_wh.median()

In [None]:
# Cell counts above 6 are treated as outliers and replaced by the median.
# `.loc` writes to `df` directly; chained `df.col[mask] = ...` may act on a copy.
df.loc[df.battery_cells > 6, 'battery_cells'] = df.battery_cells.median()

In [None]:
# Weights above 7 are treated as outliers and replaced by the median.
# `.loc` writes to `df` directly; chained `df.col[mask] = ...` may act on a copy.
df.loc[df.weight_kg > 7, 'weight_kg'] = df.weight_kg.median()

In [None]:
# Categorical columns that will be plotted as histograms below.
df.select_dtypes(include='category').columns

In [None]:
# Histograms of every categorical column.
dynamic_plot(df, 'category')

In [None]:
def drop_contains(dataframe, columns_names):
    """Collapse text variants onto a shorter canonical value, in place.

    For each column in ``columns_names`` the first half (rounded up) of its
    distinct non-null values, in order of appearance, is taken as the set of
    canonical values. Every cell whose text contains a canonical value
    (case-insensitive, literal match) is overwritten with that canonical
    value. Canonical values are applied in order, so a later one can
    overwrite the result of an earlier one.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame to modify in place.
    columns_names : iterable of str
        Names of the text or categorical columns to process.
    """
    for column in columns_names:
        # Missing values cannot be used as a search pattern.
        uniques = dataframe[column].dropna().unique()
        canonical_count = (len(uniques) + 1) // 2  # ceil(len / 2)

        for unique in uniques[:canonical_count]:
            # regex=False: values are plain text, not patterns, so characters
            # such as '+' or '(' must match literally.
            # na=False: missing cells never match and keep the mask boolean.
            matches = dataframe[column].str.contains(
                unique, case=False, regex=False, na=False
            )
            # Write through .loc so the frame itself is updated rather than
            # a temporary Series.
            dataframe.loc[matches, column] = unique

In [None]:
# Categorical columns available for clean-up.
df.select_dtypes(include='category').columns

In [None]:
# Merge value variants in these four categorical columns (modifies df in place).
drop_contains(df, ['seller', 'brand', 'proc_brand', 'os'])

## Showing filtered data and exporting processed data


### Plotting numeric variables

In [None]:
# Box plots of the numeric columns after the outlier treatment.
dynamic_plot(df)

### Plotting categorical variables

In [None]:
# Histograms of the categorical columns after the clean-up.
dynamic_plot(df, 'category')

In [None]:
# Preview the processed data before exporting it.
df.head(2)

In [None]:
# Export the processed data; the file name carries today's month and day (MMDD).
df.to_csv(f"../Data/amazon_filtered_{datetime.datetime.today():%m%d}.csv")

In [None]:
# Independent copy of the data indexed by the `since` timestamps, for resampling.
df_time = df.set_index(df.since).copy()
df_time.head(2)

In [None]:
# Yearly mean of the numeric columns. numeric_only=True skips the
# categorical/text columns, which cannot be averaged.
# NOTE(review): recent pandas releases appear to prefer the 'YE' alias over
# 'Y' - confirm against the version in use.
df_time.resample('Y').mean(numeric_only=True)

In [None]:
# Yearly median of the numeric columns. numeric_only=True skips the
# categorical/text columns, which have no median.
# NOTE(review): recent pandas releases appear to prefer the 'YE' alias over
# 'Y' - confirm against the version in use.
df_time.resample('Y').median(numeric_only=True)