# Explore Data
Notebook for providing insights about the available training data

In [None]:
# ============================================================================================================
# Import section
# ============================================================================================================
import autoreload

%load_ext autoreload
%autoreload 2

import os
import pandas as pd
import numpy as np

from IPython.display import display
import seaborn as sns

In [None]:
# ============================================================================================================
# Load Data
# ============================================================================================================

# Resolve every location first: the notebook's working directory, the "data"
# folder beneath it, and the training CSV inside that folder.
src_dir = os.getcwd()
data_dir = os.path.join(src_dir, "data")
train_data_path = os.path.join(data_dir, "train.csv")

# Report the resolved locations, one labelled line per path.
for _label, _path in (
    ("Working directory: \t\t", src_dir),
    ("Path for data files: \t\t", data_dir),
    ("Path to trainign data file: \t", train_data_path),
):
    print(_label, _path)

In [None]:
# Read the training CSV into a DataFrame
pd_train = pd.read_csv(filepath_or_buffer=train_data_path)

# Preview the top of the table (the first 5 rows)
pd_train.head(n=5)

In [None]:
# Per the problem description:
#   - column "ID" refers to a customer
#   - column "TARGET" holds the values to be predicted
# Every other column is treated as a feature.

# Names of the feature columns (everything except the two columns above)
non_feature_cols = ('ID', 'TARGET')
l_feat_cols = [name for name in pd_train.columns if name not in non_feature_cols]

# Number of rows in the training table
n_samples = len(pd_train)

print("\nNumber of training samples available: ", n_samples)
print("Number of feature columns: ", len(l_feat_cols))


<div class="alert alert-block alert-info">
We note that there are a lot of possible features to use and that we have over 70k data samples available.

Let's see how the two classes are distributed in the dataset.
</div>

In [None]:
# Identify distribution of classes.
# The "TARGET" count column is built explicitly: since pandas 2.0, value_counts()
# names its result "count" and moves the original name onto the index, so
# pd.DataFrame(series.value_counts()) no longer exposes a "TARGET" column.
# rename_axis(None) keeps the index unnamed, as it was on older pandas versions.
target_counts = pd_train["TARGET"].value_counts().rename_axis(None)
target_vals = pd.DataFrame({"TARGET": target_counts})

# "Prozent" stores the fraction of all samples per class (count / n_samples);
# it is only scaled to a percentage when printed below.
target_vals["Prozent"] = target_vals["TARGET"] / n_samples

# Visualize values dataframe
display(target_vals)

# The index holds the class labels, so select rows by label (0 / 1) with .loc
print("Percentage of Satisfied customers: \t", np.around(target_vals.loc[0, "Prozent"]*100, decimals=2), "%")
print("Percentage of Unsatisfied customers: \t", np.around(target_vals.loc[1, "Prozent"]*100, decimals=2), "%")

<div class="alert alert-block alert-info">
This shows that the classes are highly imbalanced. This might represent a risk when training a model, as the model may infer that always predicting class "0" is a good approximation.

If the performance of the model is evaluated by accuracy alone, we might get scores of around 90% even if the model fails to identify class "1". The area under the ROC curve (AUC) is therefore a better performance metric.

As collecting more data is not an option, it might be helpful to oversample the minority class or undersample the majority class. 

Before proceeding, let's analyze the features in more detail.
</div>

In [None]:
# Keep only the feature columns of the training table
pd_train_features = pd_train.loc[:, l_feat_cols]
pd_train_features.head(n=5)

In [None]:
# Collect the feature columns that take only one, two or three distinct values
single_valued_column = []
doubled_valued_column = []
tripple_valued_column = []

# Number of distinct values -> list that collects the matching column names
columns_by_cardinality = {
    1: single_valued_column,
    2: doubled_valued_column,
    3: tripple_valued_column,
}

for col in l_feat_cols:
    # Show the value/count table for this column
    print("Value counts for column: ", col)
    temp_counts = pd.DataFrame(pd_train_features[col].value_counts())
    display(temp_counts)

    # One row per distinct value, so the row count is the column's cardinality
    n_values = len(temp_counts)

    # Columns with more than three distinct values are not recorded anywhere
    bucket = columns_by_cardinality.get(n_values)
    if bucket is not None:
        bucket.append(col)

In [None]:
# Summary of the scan above: how many columns landed in each bucket
for _label, _cols in (
    ("# of columns with 1 value:  ", single_valued_column),
    ("# of columns with 2 values: ", doubled_valued_column),
    ("# of columns with 3 values: ", tripple_valued_column),
):
    print(_label, len(_cols))

<div class="alert alert-block alert-info">
We are not interested in features which are constant across all training samples (single_valued_column), as these do not contribute to identifying separating hyperplanes in the feature space.

Therefore, we drop the columns listed in single_valued_column from our training dataset.

</div>

In [None]:
# Remove the constant (single-valued) feature columns; pd_train itself is left untouched
pd_train_filtered = pd_train.drop(columns=single_valued_column)
pd_train_filtered.head(n=5)