In [9]:
%run helpers.py

import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt

In [7]:
# %load features.py
# This file contains all functions related to transforming and cleaning
# the features
import numpy as np

MIN_MAX_VALUES = {
    0:  (0,  500), 1 : (0,  220), 2: (0, 300),  3: (0, 320),
    5:  (0, 2300),                              8: (0, 100),  9: (0, 1000),
    10: (0,    6),                             13: (0, 150),
                   16: (0,  180),                            19: (0,  210),
                   21: (0, 1000),              23: (0, 500),
                   26: (0,  250),                            29: (0,  500),
}

def build_poly(x, degree):
    """polynomial basis functions for input data x, for j=0 up to j=degree."""
    # this function should return the matrix formed
    # by applying the polynomial basis to the input data
    return np.array([np.concatenate([[1.0]] + [[xi ** d for d in range(1, degree+1)] for xi in row]) for row in x])

def standardize(x):
    """Standardize the original data set."""
    mean_x = np.mean(x)
    x = x - mean_x
    std_x = np.std(x)
    x = x / std_x
    return x, mean_x, std_x

def standardize_all(x):
    x = np.apply_along_axis(lambda xi: standardize(xi)[0], 0, x)
    return x

def clamp(data, min_value, max_value, mode='clamp'):
    if mode == 'clamp':
        data[data > max_value] = max_value
        data[data < min_value] = min_value
    else:
        data[data > max_value] = np.NaN
        data[data < min_value] = np.NaN

def remove_errors(data):
    data = data.copy()
    data[data == -999.0] = np.NaN
    return data

def remove_outliers(data, mode='clamp'):
    data = data.copy()

    for key, value in MIN_MAX_VALUES.items():
        clamp(data[:, key], value[0], value[1], mode)

    return data

def remove_nan_features(x):
    x = x[:, ~np.any(np.isnan(x), axis=0)]
    return x

def remove_nan_samples(x, y):
    mask = np.any(np.isnan(x), axis=1)
    return x[~mask], y[~mask]


In [10]:
y, x, ids = load_csv_data('data/train.csv', sub_sample=True)

# Raw Data

*"How much data dowe have in total?"*

In [11]:
print("- There are {} data points in the dataset".format(x.shape[0]))
print("- There are {} features in x".format(x.shape[1]))

- There are 5000 data points in the dataset
- There are 30 features in x


In [12]:
def describe_features(x):
    mean_x = np.nanmean(x, axis=0)
    var_x = np.nanvar(x, axis=0)
    max_x = np.nanmax(x, axis=0)
    min_x = np.nanmin(x, axis=0)
    nan_x = np.sum(np.isnan(x), axis=0)
    
    print(f'Shape: {x.s}')
    print('ID |    Mean    |    Var     |    Min     |    Max     |   #NaN   |')
    print('---|------------|------------|------------|------------|----------|')
    
    for f in range(0, x.shape[1]):
        mean = '{0:.3f}'.format(mean_x[f]).rjust(10)
        var = '{0:.3f}'.format(var_x[f]).rjust(10)
        maxi = '{0:.3f}'.format(max_x[f]).rjust(10)
        mini = '{0:.3f}'.format(min_x[f]).rjust(10)
        nani = str(nan_x[f]).rjust(8)
        identifier = str(f).rjust(2)
    
        print('{} | {} | {} | {} | {} | {} |'.format(identifier, mean, var, mini, maxi, nani))


*"What does the data look like?"*

In [13]:
describe_features(x)

Shape: (5000, 30)
ID |    Mean    |    Var     |    Min     |    Max     |   #NaN   |
---|------------|------------|------------|------------|----------|
 0 |    -66.570 | 178522.382 |   -999.000 |    977.515 |        0 |
 1 |     50.266 |   1227.770 |      0.011 |    256.371 |        0 |
 2 |     81.007 |   1712.009 |      9.463 |    658.494 |        0 |
 3 |     57.243 |   3945.247 |      0.000 |    585.661 |        0 |
 4 |   -703.584 | 208562.497 |   -999.000 |      7.544 |        0 |
 5 |   -596.237 | 432395.561 |   -999.000 |   3056.908 |        0 |
 6 |   -704.530 | 207232.544 |   -999.000 |     15.228 |        0 |
 7 |      2.376 |      0.627 |      0.256 |      5.423 |        0 |
 8 |     18.975 |    495.589 |      0.000 |    257.324 |        0 |
 9 |    157.925 |  13633.526 |     46.227 |   1279.985 |        0 |
10 |      1.448 |      0.705 |      0.136 |      9.955 |        0 |
11 |     -0.159 |      1.403 |     -1.414 |      1.414 |        0 |
12 |   -704.155 | 207756.906 |

# Clean Data

In [15]:
def clean_data(x):
    
    x = remove_errors(x)
    x = remove_outliers(x)
    x = standardize_all(x)
    x = remove_nan_features(x)
    x = build_poly(x, degree)
    
x_clean = clean_data(x)
describe_features(x_clean)

NameError: name 'clean_data' is not defined

*"What does the clean data look like?"*

In [None]:
# describe
# x_clean[np.isnan(x_clean) == False]

In [None]:
def histo_feature(data, i, ax=None, b=None):
    
    indexes = ~np.isnan(data[:,i])
    
    if b == None:
        color = 'blue'
    elif b:
        indexes = indexes & (y == 1)
        color='green'
    else:
        indexes = indexes & (y == -1)
        color='red'
        
    if ax ==None:
        ax = plt
    
    ax.hist(data[:,i][indexes], 250, facecolor=color, alpha=0.75)
    
def histo_features(data, b=None):
    fig = plt.figure(figsize = (15,15))
    
    for i in range(0, 30):
        ax = fig.add_subplot(6,5,i+1)
        histo_feature(data, i, ax=ax, b=b)

In [None]:
# histo_features(x_clean)

In [None]:
# histo_features(x_clean, b=True)

In [None]:
# histo_features(x_clean, b=False)

In [None]:
def clamp(data, max_value):
    data[data > max_value] = np.NaN

def remove_outliers(data):
    data = data.copy()
    clamp(data[:, 0], 500)
    clamp(data[:, 1], 220)
    clamp(data[:, 2], 300)
    clamp(data[:, 3], 320)
    # 4
    clamp(data[:, 5], 2300)
    # 6
    # 7
    clamp(data[:, 8], 100)
    clamp(data[:, 9], 1000)
    clamp(data[:, 10], 6)
    # 11
    # 12
    clamp(data[:, 13], 150)
    # 14
    # 15
    clamp(data[:, 16], 180)
    # 17 
    # 18
    clamp(data[:, 19], 210)
    # 20
    clamp(data[:, 21], 1000)
    # 22
    clamp(data[:, 23], 500)
    # 24
    # 25
    clamp(data[:, 26], 250)
    # 27
    # 28
    clamp(data[:, 29], 500)
    
    return data

In [None]:
x_clamped = remove_outliers(x_clean)

In [None]:
histo_features(x_clamped)

In [None]:
# histo_features(x_clamped, b=True)

In [None]:
# histo_features(x_clamped, b=False)

In [None]:
def compare_features(data):
    fig = plt.figure(figsize = (15,150))
    
    for i in range(0, 60):
        ax = fig.add_subplot(30,2,i+1)
        histo_feature(data, int(i/2.0), ax=ax, b=(i%2==0))

In [None]:
compare_features(x_clamped)

### Normalization

In [None]:
def standardize(x):
    """Standardize the original data set."""
    mean_x = np.mean(x)
    x = x - mean_x
    std_x = np.std(x)
    x = x / std_x
    return x, mean_x, std_x

In [None]:
x_std = np.vectorize(lambda x: standardize(x))(x_clean)

In [None]:
x_test = np.apply_along_axis(lambda x: standardize(x)[0], 0, x_clean)
mask = np.all(np.isnan(x_test), axis=0)
x_test = x_test[:, ~mask]

# describe_features(x_test.dropna())

describe_features(x_test)
# d[:,~np.all(np.isnan(d), axis=0)]