In [2]:
%run helpers.py

import numpy as np
import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import warnings

warnings.filterwarnings('ignore')

In [3]:
# %load features.py
# This file contains all functions related to transforming and cleaning
# the features
import numpy as np

# Per-column (min, max) limits, keyed by feature column index.
# remove_outliers() hands each pair to clamp() for the matching column;
# columns that are not listed here are left untouched.
MIN_MAX_VALUES = {
    0:  (0,  500), 1 : (0,  220), 2: (0, 300),  3: (0, 320),
    5:  (0, 2300),                              8: (0, 100),  9: (0, 1000),
    10: (0,    6),                             13: (0, 150),
                   16: (0,  180),                            19: (0,  210),
                   21: (0, 1000),              23: (0, 500),
                   26: (0,  250),                            29: (0,  500),
}

def build_poly(x, degree):
    """Expand every row of `x` with a polynomial basis up to `degree`.

    Each output row starts with the constant term 1.0 and is followed, feature
    by feature, by that feature raised to the powers 1..degree
    (1, x1, x1**2, ..., x2, x2**2, ...).

    Args:
        x: iterable of rows, each row an iterable of numbers.
        degree: highest power to generate for every feature.

    Returns:
        A NumPy array with one expanded row per input row.
    """
    powers = range(1, degree + 1)
    expanded_rows = []
    for row in x:
        terms = [1.0]  # bias term
        for value in row:
            terms.extend(value ** power for power in powers)
        expanded_rows.append(terms)
    return np.array(expanded_rows)

def standardize(x):
    """Center `x` on its mean and scale it by its standard deviation.

    The mean and standard deviation are taken over all elements of `x`
    (no axis is given), so NaN entries propagate to the whole result.

    Returns:
        A tuple (standardized values, mean, standard deviation).
    """
    mean_x = np.mean(x)
    centered = x - mean_x
    std_x = np.std(centered)
    return centered / std_x, mean_x, std_x

def standardize_all(x):
    """Standardize `x` along axis 0, i.e. every column of a 2-D array separately.

    Only the standardized values are kept; the mean and standard deviation
    returned by standardize() are discarded.
    """
    def _scaled_only(column):
        return standardize(column)[0]

    return np.apply_along_axis(_scaled_only, 0, x)

def clamp(data, min_value, max_value, mode='clamp'):
    """Restrict `data` to the range [min_value, max_value], in place.

    Args:
        data: NumPy array to modify in place. Passing a view (for example a
            column slice of a larger array) edits the underlying array.
        min_value: lower bound.
        max_value: upper bound.
        mode: 'clamp' replaces out-of-range entries with the nearest bound;
            any other value replaces them with NaN.

    Returns:
        None. The result is written into `data`. NaN entries never compare
        as out of range, so they are left unchanged.
    """
    if mode == 'clamp':
        data[data > max_value] = max_value
        data[data < min_value] = min_value
    else:
        # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
        data[data > max_value] = np.nan
        data[data < min_value] = np.nan

def remove_errors(data, missing_value=-999.0):
    """Return a copy of `data` with the missing-value placeholder replaced by NaN.

    Args:
        data: NumPy float array; it is not modified.
        missing_value: placeholder that marks an unavailable measurement
            (defaults to -999.0).

    Returns:
        A new array in which every entry equal to `missing_value` is NaN.
    """
    cleaned = data.copy()
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    cleaned[cleaned == missing_value] = np.nan
    return cleaned

def remove_outliers(data, mode='clamp'):
    """Apply the per-column limits in MIN_MAX_VALUES to a copy of `data`.

    Args:
        data: 2-D array; it is copied, not modified.
        mode: forwarded to clamp() ('clamp' limits values to the bounds,
            anything else turns out-of-range values into NaN).

    Returns:
        The bounded copy.
    """
    bounded = data.copy()

    for column, (lower, upper) in MIN_MAX_VALUES.items():
        # clamp() works in place on the column slice taken from `bounded`.
        clamp(bounded[:, column], lower, upper, mode)

    return bounded

def remove_nan_features(x):
    """Drop every column of `x` that contains at least one NaN."""
    has_nan = np.isnan(x).any(axis=0)
    return x[:, ~has_nan]

def remove_nan_samples(x, y):
    """Drop every row of `x` that contains a NaN, together with its entry in `y`.

    Returns:
        A tuple (filtered x, filtered y).
    """
    complete = ~np.isnan(x).any(axis=1)
    return x[complete], y[complete]

def decompose_categorical(x):
    """One-hot encode the values of `x`.

    One 0/1 indicator is built per distinct value of `x` (in the sorted order
    returned by np.unique); the stacked indicators are transposed so that, for
    1-D input, each row corresponds to one input element.
    """
    values = np.asarray(x)
    indicators = []
    for category in np.unique(x):
        indicators.append(np.where(values == category, 1, 0))
    return np.array(indicators).T

def separate_features(x, indexes):
    """Split the columns of `x` into the selected ones and the rest.

    Returns:
        A tuple (columns at `indexes`, `x` with those columns removed).
    """
    selected = x[:, indexes]
    remaining = np.delete(x, indexes, axis=1)
    return selected, remaining


In [4]:
y, x, ids = load_csv_data('data/train.csv', sub_sample=True)

# Raw Data

*"How much data do we have in total?"*

In [9]:
print("- There are {} data points in the dataset".format(x.shape[0]))
print("- There are {} features in x".format(x.shape[1]))

- There are 5000 data points in the dataset
- There are 30 features in x


In [12]:
def describe_features(x):
    """Print a summary table with one line per column of the 2-D array `x`.

    For every column the NaN-ignoring mean, variance, minimum and maximum are
    shown with three decimals, followed by the number of NaN entries.
    """
    # Column statistics, in the order in which they are printed.
    stats = (
        np.nanmean(x, axis=0),
        np.nanvar(x, axis=0),
        np.nanmin(x, axis=0),
        np.nanmax(x, axis=0),
    )
    nan_counts = np.sum(np.isnan(x), axis=0)

    print(f'Shape: {x.shape}')
    print('ID |    Mean    |    Var     |    Min     |    Max     |   #NaN   |')
    print('---|------------|------------|------------|------------|----------|')

    for col in range(x.shape[1]):
        cells = [f'{col:>2}']
        cells.extend(f'{stat[col]:>10.3f}' for stat in stats)
        cells.append(f'{nan_counts[col]!s:>8}')
        print(' | '.join(cells) + ' |')


*"What does the data look like?"*

In [13]:
describe_features(x)

Shape: (5000, 30)
ID |    Mean    |    Var     |    Min     |    Max     |   #NaN   |
---|------------|------------|------------|------------|----------|
 0 |    -66.570 | 178522.382 |   -999.000 |    977.515 |        0 |
 1 |     50.266 |   1227.770 |      0.011 |    256.371 |        0 |
 2 |     81.007 |   1712.009 |      9.463 |    658.494 |        0 |
 3 |     57.243 |   3945.247 |      0.000 |    585.661 |        0 |
 4 |   -703.584 | 208562.497 |   -999.000 |      7.544 |        0 |
 5 |   -596.237 | 432395.561 |   -999.000 |   3056.908 |        0 |
 6 |   -704.530 | 207232.544 |   -999.000 |     15.228 |        0 |
 7 |      2.376 |      0.627 |      0.256 |      5.423 |        0 |
 8 |     18.975 |    495.589 |      0.000 |    257.324 |        0 |
 9 |    157.925 |  13633.526 |     46.227 |   1279.985 |        0 |
10 |      1.448 |      0.705 |      0.136 |      9.955 |        0 |
11 |     -0.159 |      1.403 |     -1.414 |      1.414 |        0 |
12 |   -704.155 | 207756.906 |

# Clean Data

*"What does the clean data look like?"*

### 1 - Simple Cleaning

In [56]:
def clean_data(x):
    """Run the simple cleaning pipeline on `x` and return the result.

    Steps, in order: replace error placeholders with NaN, bound outliers,
    standardize each column, then drop the columns that still contain NaN.
    No polynomial expansion is applied.
    """
    pipeline = (remove_errors, remove_outliers, standardize_all, remove_nan_features)
    for step in pipeline:
        x = step(x)
    return x
    
describe_features(clean_data(x))

Shape: (5000, 19)
ID |    Mean    |    Var     |    Min     |    Max     |   #NaN   |
---|------------|------------|------------|------------|----------|
 0 |      0.000 |      1.000 |     -1.439 |      4.862 |        0 |
 1 |      0.000 |      1.000 |     -1.878 |      5.796 |        0 |
 2 |      0.000 |      1.000 |     -0.941 |      4.360 |        0 |
 3 |      0.000 |      1.000 |     -2.677 |      3.848 |        0 |
 4 |      0.000 |      1.000 |     -0.934 |      4.100 |        0 |
 5 |     -0.000 |      1.000 |     -0.964 |      7.273 |        0 |
 6 |     -0.000 |      1.000 |     -1.611 |      5.613 |        0 |
 7 |     -0.000 |      1.000 |     -1.060 |      1.328 |        0 |
 8 |     -0.000 |      1.000 |     -0.901 |      5.673 |        0 |
 9 |     -0.000 |      1.000 |     -2.018 |      2.026 |        0 |
10 |      0.000 |      1.000 |     -1.752 |      1.752 |        0 |
11 |     -0.000 |      1.000 |     -0.977 |      6.437 |        0 |
12 |      0.000 |      1.000 |

In [54]:
def normalize(x):
    """Min-max scale `x`: its smallest element maps to 0 and its largest to 1."""
    lowest = x.min()
    value_range = x.max() - lowest
    return (x - lowest) / value_range

def normalize_all(x):
    """Min-max scale `x` along axis 0, i.e. every column of a 2-D array separately."""
    return np.apply_along_axis(normalize, 0, x)


# def build_poly_cross(x):
    
#     cross_x = [xi * xj for xi in row for j in row for row in x if (i!=j && i > j)]
#     return cross_x


def clean_data_2(x):
    """Run the min-max cleaning pipeline on `x`, then expand it to degree 2.

    Steps, in order: replace error placeholders with NaN, bound outliers,
    min-max scale each column, drop the columns that still contain NaN, and
    finally apply build_poly() with degree 2.
    """
    pipeline = (remove_errors, remove_outliers, normalize_all, remove_nan_features)
    for step in pipeline:
        x = step(x)
    return build_poly(x, 2)
    
describe_features(clean_data_2(x))

Shape: (5000, 39)
ID |    Mean    |    Var     |    Min     |    Max     |   #NaN   |
---|------------|------------|------------|------------|----------|
 0 |      1.000 |      0.000 |      1.000 |      1.000 |        0 |
 1 |      0.228 |      0.025 |      0.000 |      1.000 |        0 |
 2 |      0.077 |      0.010 |      0.000 |      1.000 |        0 |
 3 |      0.245 |      0.017 |      0.000 |      1.000 |        0 |
 4 |      0.077 |      0.012 |      0.000 |      1.000 |        0 |
 5 |      0.177 |      0.036 |      0.000 |      1.000 |        0 |
 6 |      0.067 |      0.020 |      0.000 |      1.000 |        0 |
 7 |      0.410 |      0.023 |      0.000 |      1.000 |        0 |
 8 |      0.192 |      0.016 |      0.000 |      1.000 |        0 |
 9 |      0.185 |      0.039 |      0.000 |      1.000 |        0 |
10 |      0.074 |      0.024 |      0.000 |      1.000 |        0 |
11 |      0.117 |      0.015 |      0.000 |      1.000 |        0 |
12 |      0.028 |      0.005 |

In [47]:
def build_poly_cross(x):
    """Degree-2 polynomial expansion of `x` plus all pairwise feature products.

    Every output row consists of build_poly(x, 2) for that row followed by
    row[i] * row[j] for each pair i < j, ordered by increasing i, then j.
    """
    pair_products = []
    for row in x:
        width = len(row)
        # Starting j at i + 1 visits each unordered pair exactly once.
        pair_products.append(
            [row[i] * row[j] for i in range(width) for j in range(i + 1, width)]
        )

    squared_terms = build_poly(x, 2)
    return np.concatenate([squared_terms, np.array(pair_products)], axis=1)

In [48]:
build_poly_cross([[1, 2, 3], [4, 5, 6]])

array([[ 1.,  1.,  1.,  2.,  4.,  3.,  9.,  2.,  3.,  6.],
       [ 1.,  4., 16.,  5., 25.,  6., 36., 20., 24., 30.]])

In [7]:
def histo_feature(data, i, ax=None, b=None, labels=None):
    """Draw a 250-bin histogram of the non-NaN values in column `i` of `data`.

    Args:
        data: 2-D array of features.
        i: index of the column to plot.
        ax: object providing a `hist` method (e.g. a matplotlib Axes);
            pyplot is used when omitted.
        b: row filter. None plots every row in blue, a truthy value plots only
            the rows labelled 1 in green, a falsy value plots only the rows
            labelled -1 in red.
        labels: one label per row of `data`, used only when `b` is not None.
            Defaults to the notebook-level `y`, which must then have the same
            length as `data`.
    """
    values = data[:, i]
    keep = ~np.isnan(values)

    # Identity checks: `== None` would invoke the operand's own __eq__.
    if b is None:
        color = 'blue'
    else:
        if labels is None:
            labels = y  # fall back to the labels loaded at the top of the notebook
        if b:
            keep = keep & (labels == 1)
            color = 'green'
        else:
            keep = keep & (labels == -1)
            color = 'red'

    if ax is None:
        ax = plt

    ax.hist(values[keep], 250, facecolor=color, alpha=0.75)
    
def histo_features(data, b=None):
    """Plot one histogram per column of `data` on a grid five plots wide.

    The number of plots follows data.shape[1], so the function also works on
    cleaned data with fewer columns; 30 columns give the original 6 x 5 grid.

    Args:
        data: 2-D array of features.
        b: row filter forwarded to histo_feature().
    """
    n_features = data.shape[1]
    n_cols = 5
    n_rows = -(-n_features // n_cols)  # ceiling division

    fig = plt.figure(figsize = (15,15))

    for i in range(n_features):
        ax = fig.add_subplot(n_rows, n_cols, i + 1)
        histo_feature(data, i, ax=ax, b=b)

In [None]:
def compare_features(data):
    """Plot, for every column of `data`, the two per-label histograms side by side.

    Each grid row holds one feature: on the left the rows selected with
    b=True (label 1), on the right those selected with b=False (label -1).
    The grid size follows data.shape[1]; 30 columns reproduce the original
    30 x 2 layout and 15 x 150 figure size.
    """
    n_features = data.shape[1]
    fig = plt.figure(figsize=(15, 5 * n_features))

    for feature in range(n_features):
        for side, selector in enumerate((True, False)):
            # Subplot positions are 1-based and run row by row.
            ax = fig.add_subplot(n_features, 2, 2 * feature + side + 1)
            histo_feature(data, feature, ax=ax, b=selector)

In [None]:
# NOTE(review): `x_clamped` is not defined in any cell visible above -- presumably
# the output of remove_outliers(); confirm and define it before running this cell.
compare_features(x_clamped)

### Normalization

In [None]:
def standardize(x):
    """Center `x` on its mean and scale it by its standard deviation.

    Same logic as the `standardize` defined in the features cell near the top
    of the notebook; running this cell re-binds that name.

    Returns:
        A tuple (standardized values, mean, standard deviation).
    """
    mean_x = np.mean(x)
    shifted = x - mean_x
    std_x = np.std(shifted)
    scaled = shifted / std_x
    return scaled, mean_x, std_x

In [None]:
# Standardize every feature column on its own. np.vectorize would call
# standardize() on each scalar element, which gives 0/0 = NaN everywhere and a
# tuple of arrays, so apply it along axis 0 and keep only the scaled values.
x_std = np.apply_along_axis(lambda column: standardize(column)[0], 0, x_clean)

In [None]:
# Standardize column by column (axis 0), keeping only the scaled values.
# NOTE(review): `x_clean` is not defined in any cell visible above -- confirm its origin.
x_test = np.apply_along_axis(lambda x: standardize(x)[0], 0, x_clean)
# Flag the columns that are NaN in every row after standardization ...
mask = np.all(np.isnan(x_test), axis=0)
# ... and keep only the remaining columns.
x_test = x_test[:, ~mask]

# describe_features(x_test.dropna())

describe_features(x_test)
# d[:,~np.all(np.isnan(d), axis=0)]

# Advanced Information

### Mass of Higgs Boson

Most certainly, it is important to keep the first feature, which indicates the mass of the potential particle that could be a Higgs Boson. Unfortunately, it is not always available: missing values are recorded as `-999.0`.

In [132]:
index = (x[:, 0] == -999.0)
y_test = y[index]
print(f'There are {y_test.shape[0]} cases where the mass of the Higgs Boson candidate is not available. \nOf these, {(y_test[y_test == 1]).shape[0]} signaled a Higgs Boson and {y_test[y_test == -1].shape[0]} did not.')

There are 840 cases where the mass of the Higgs Boson candidate is not available. 
Of these, 69 signaled a Higgs Boson and 771 did not.


In [115]:
avg_mass = np.mean(x[~index, 0])
print(f'When there is a candidate, its average mass is {avg_mass}')

When there is a candidate, its average mass is 121.70888798076925


### Jet Numbers

The categorical attribute `jet_number` determines which columns are `NaN`. Let's see what the table looks like for different jet numbers.

In [121]:
describe_features(remove_errors(x[x[:, 22] == 0]))

Shape: (1985, 30)
ID |    Mean    |    Var     |    Min     |    Max     |   #NaN   |
---|------------|------------|------------|------------|----------|
 0 |    120.976 |   2972.097 |     17.252 |    760.613 |      562 |
 1 |     60.833 |    996.625 |      0.110 |    239.881 |        0 |
 2 |     81.882 |   1400.835 |     13.423 |    432.568 |        0 |
 3 |     13.516 |    209.414 |      0.000 |     95.050 |        0 |
 4 |        nan |        nan |        nan |        nan |     1985 |
 5 |        nan |        nan |        nan |        nan |     1985 |
 6 |        nan |        nan |        nan |        nan |     1985 |
 7 |      2.669 |      0.490 |      0.480 |      5.398 |        0 |
 8 |     13.516 |    209.414 |      0.000 |     95.050 |        0 |
 9 |     75.376 |    441.799 |     46.227 |    204.230 |        0 |
10 |      1.429 |      0.358 |      0.240 |      5.126 |        0 |
11 |     -0.945 |      0.811 |     -1.414 |      1.414 |        0 |
12 |        nan |        nan |

In [122]:
describe_features(remove_errors(x[x[:, 22] == 1]))

Shape: (1540, 30)
ID |    Mean    |    Var     |    Min     |    Max     |   #NaN   |
---|------------|------------|------------|------------|----------|
 0 |    123.538 |   4873.741 |     18.891 |    977.515 |      172 |
 1 |     46.691 |   1296.879 |      0.073 |    256.371 |        0 |
 2 |     82.376 |   2238.650 |     14.910 |    658.494 |        0 |
 3 |     66.597 |   2381.826 |      0.172 |    480.343 |        0 |
 4 |        nan |        nan |        nan |        nan |     1540 |
 5 |        nan |        nan |        nan |        nan |     1540 |
 6 |        nan |        nan |        nan |        nan |     1540 |
 7 |      2.330 |      0.594 |      0.451 |      5.423 |        0 |
 8 |     17.022 |    315.651 |      0.043 |    228.007 |        0 |
 9 |    151.341 |   4397.689 |     78.569 |    670.062 |        0 |
10 |      1.426 |      0.644 |      0.183 |      7.827 |        0 |
11 |      0.198 |      1.218 |     -1.414 |      1.414 |        0 |
12 |        nan |        nan |

In [123]:
describe_features(remove_errors(x[x[:, 22] == 2]))

Shape: (1043, 30)
ID |    Mean    |    Var     |    Min     |    Max     |   #NaN   |
---|------------|------------|------------|------------|----------|
 0 |    120.992 |   2622.137 |     11.160 |    748.076 |       70 |
 1 |     38.508 |   1105.092 |      0.011 |    214.034 |        0 |
 2 |     78.116 |   1348.942 |      9.463 |    563.498 |        0 |
 3 |     99.726 |   4840.548 |      0.991 |    585.661 |        0 |
 4 |      2.572 |      3.201 |      0.001 |      7.544 |        0 |
 5 |    372.698 | 163239.320 |     20.724 |   3056.908 |        0 |
 6 |     -1.022 |     14.745 |    -14.127 |     15.228 |        0 |
 7 |      2.083 |      0.558 |      0.256 |      5.135 |        0 |
 8 |     17.635 |    437.350 |      0.080 |    210.603 |        0 |
 9 |    239.577 |   9337.024 |    115.669 |    816.777 |        0 |
10 |      1.488 |      1.124 |      0.136 |      9.955 |        0 |
11 |      0.557 |      0.953 |     -1.414 |      1.414 |        0 |
12 |      0.508 |      0.159 |

In [124]:
describe_features(remove_errors(x[x[:, 22] == 3]))

Shape: (432, 30)
ID |    Mean    |    Var     |    Min     |    Max     |   #NaN   |
---|------------|------------|------------|------------|----------|
 0 |    119.783 |   3463.394 |     22.564 |    587.733 |       36 |
 1 |     42.841 |   1392.126 |      0.014 |    215.655 |        0 |
 2 |     79.088 |   2106.957 |     12.672 |    485.700 |        0 |
 3 |    122.252 |   6841.529 |      3.216 |    551.744 |        0 |
 4 |      2.017 |      2.242 |      0.005 |      6.721 |        0 |
 5 |    350.845 | 123162.597 |     30.359 |   2037.086 |        0 |
 6 |     -0.249 |      8.676 |    -10.052 |      9.234 |        0 |
 7 |      1.896 |      0.703 |      0.408 |      4.842 |        0 |
 8 |     54.259 |   1192.737 |      0.718 |    257.324 |        0 |
 9 |    363.567 |  27694.750 |    158.959 |   1279.985 |        0 |
10 |      1.508 |      1.494 |      0.140 |      8.575 |        0 |
11 |      0.459 |      0.951 |     -1.414 |      1.414 |        0 |
12 |      0.397 |      0.156 | 

In [47]:
def split_categories(x, n, category):
    """Partition the rows of `x` into `n` groups.

    Args:
        x: 2-D array whose rows are the samples.
        n: number of groups; group ids 0 .. n-1 are collected.
        category: callable that maps one row of `x` to its group id.

    Returns:
        A list of `n` arrays; entry i holds the rows whose group id equals i
        (rows with any other id are not returned).
    """
    # Use the callable that was passed in rather than a notebook-level function.
    categories = np.apply_along_axis(category, 1, x)
    return [x[categories == i] for i in np.arange(n)]

In [48]:
def categorize(x):
    """Return the group id of one sample row `x`.

    Rows whose first entry is the -999.0 placeholder get id 0; every other row
    gets the value in column 22 plus one.
    """
    if x[0] != -999.0:
        return x[22] + 1
    return 0


split_categories(x, 5, categorize)

[array([[-999.   ,   86.317,   73.988, ..., -999.   , -999.   ,    0.   ],
        [-999.   ,   64.299,   64.676, ..., -999.   , -999.   ,    0.   ],
        [-999.   ,   66.497,   26.766, ..., -999.   , -999.   ,    0.   ],
        ...,
        [-999.   ,  133.017,   65.248, ..., -999.   , -999.   ,    0.   ],
        [-999.   ,   88.445,   54.259, ..., -999.   , -999.   ,    0.   ],
        [-999.   ,   78.589,   76.993, ..., -999.   , -999.   ,    0.   ]]),
 array([[ 148.436,   43.251,  118.888, ..., -999.   , -999.   ,    0.   ],
        [ 249.415,   77.191,  185.154, ..., -999.   , -999.   ,    0.   ],
        [  84.766,   34.015,   56.33 , ..., -999.   , -999.   ,    0.   ],
        ...,
        [  88.762,   44.889,   67.41 , ..., -999.   , -999.   ,    0.   ],
        [  65.259,   49.695,   59.832, ..., -999.   , -999.   ,    0.   ],
        [  81.153,   28.748,   63.335, ..., -999.   , -999.   ,    0.   ]]),
 array([[ 219.057,   72.461,  124.835, ..., -999.   , -999.   ,   50.3

In [21]:
def split(arr, cond):
    """Split `arr` with the mask `cond`.

    Returns:
        A two-element list: the entries where `cond` holds, then the entries
        selected by its inverse `~cond`.
    """
    selected = arr[cond]
    rejected = arr[~cond]
    return [selected, rejected]

a = np.array([1,3,5,7,2,4,6,8])
print(split(a, a<5))

a = np.array([[1,2,3],[4,5,6],[7,8,9],[2,4,7]])
print(split(a, a[:,0]<3))

[array([1, 3, 2, 4]), array([5, 7, 6, 8])]
[array([[1, 2, 3],
       [2, 4, 7]]), array([[4, 5, 6],
       [7, 8, 9]])]
