## Features analysis

This Jupyter notebook provides a simple feature analysis: it plots a histogram of each feature and normalizes the features to zero mean and unit standard deviation (or to the range 0~1, depending on which normalization you use).

After running, you will get a folder named **feature_analysis**, which includes the histogram figures for each feature and a boxplot of all features, with one sub-folder per normalization method.

First, let's import packages.

In [9]:
#------ import packages ------#
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
import os
#plt.style.use("seaborn")
#plt.style.use("ggplot")
#plt.style.use("bmh")
# Matplotlib 3.6 renamed the bundled seaborn styles to "seaborn-v0_8-*" and
# later releases dropped the old names, so use whichever spelling this
# installation provides instead of failing on newer versions.
for _style_name in ("seaborn-darkgrid", "seaborn-v0_8-darkgrid"):
    if _style_name in plt.style.available:
        plt.style.use(_style_name)
        break
else:
    # Neither spelling is listed: fall back to the original call so an
    # unknown style still raises rather than being silently ignored.
    plt.style.use("seaborn-darkgrid")
#plt.style.use("seaborn-deep")
#plt.style.use("tableau-colorblind10")

The function **without_normalization_analysis** analyzes the features in the original combined dataset (7151 data samples) and draws a histogram figure for each feature, plus one boxplot of all features.

In [54]:
def without_normalization_analysis(file_name, norm_tag, root_dir):
    """Plot a histogram per feature and one boxplot of the raw features.

    Reads the CSV at ``file_name``, drops the ``Row`` and ``y`` columns and
    treats every remaining column as a feature. One ``<feature>.jpg``
    histogram is saved per feature, plus a single ``boxplot.jpg`` covering
    all features.

    Args:
        file_name: Path of the CSV file to read.
        norm_tag: Label shown in parentheses in every figure title.
        root_dir: Directory the figures are saved into; it is not created
            here.
    """
    org_train_df = pd.read_csv(file_name)
    # Use `columns=` rather than a positional axis: pandas 2.0 removed the
    # positional `axis` argument of DataFrame.drop.
    feature_df = org_train_df.drop(columns=['Row', 'y'])  # remove ID and positions/targets
    all_features = feature_df.columns

    #---- plot histogram for each feature
    for cur_feature in all_features:
        plt.figure()
        feature_df[cur_feature].plot.hist(bins=12, alpha=0.5)
        plt.title('{} Histogram ({})'.format(cur_feature, norm_tag))
        plt.savefig(os.path.join(root_dir, '{}.jpg'.format(cur_feature)))
        # Close each figure so open figures do not pile up across features.
        plt.close()

    #---- plot boxplot for all features in one figure
    plt.figure()
    # NOTE(review): a figure already exists when boxplot() runs, so
    # figsize=(40, 20) may have no effect -- confirm against the saved image.
    feature_df.boxplot(grid=False, rot=45, fontsize=10, figsize=(40, 20), showfliers=False)
    plt.title('Boxplot of all features ({})'.format(norm_tag))
    plt.grid()
    plt.tight_layout()
    plt.savefig(os.path.join(root_dir, 'boxplot.jpg'))
    plt.close()

In [55]:
def minmax_normalization_analysis(file_name, norm_tag, root_dir):
    """Plot histograms and a boxplot of the min-max scaled features.

    Reads the CSV at ``file_name``, drops the ``Row`` and ``y`` columns,
    rescales every remaining column with ``MinMaxScaler`` and then saves one
    ``<feature>.jpg`` histogram per feature plus a single ``boxplot.jpg``
    covering all features.

    Args:
        file_name: Path of the CSV file to read.
        norm_tag: Label shown in parentheses in every figure title.
        root_dir: Directory the figures are saved into; it is not created
            here.
    """
    org_train_df = pd.read_csv(file_name)
    # Use `columns=` rather than a positional axis: pandas 2.0 removed the
    # positional `axis` argument of DataFrame.drop.
    org_feature_df = org_train_df.drop(columns=['Row', 'y'])  # remove ID and positions/targets
    all_features = org_feature_df.columns

    #---normalize features to be the range of [0,1]
    scaled_feature_np = MinMaxScaler().fit_transform(org_feature_df.to_numpy())
    # The scaler returns a bare array; restore the column names for plotting.
    feature_df = pd.DataFrame(data=scaled_feature_np, columns=all_features)

    #---- plot histogram for each feature
    for cur_feature in all_features:
        plt.figure()
        feature_df[cur_feature].plot.hist(bins=12, alpha=0.5)
        plt.title('{} Histogram ({})'.format(cur_feature, norm_tag))
        plt.savefig(os.path.join(root_dir, '{}.jpg'.format(cur_feature)))
        # Close each figure so open figures do not pile up across features.
        plt.close()

    #---- plot boxplot for all features in one figure
    plt.figure()
    # NOTE(review): a figure already exists when boxplot() runs, so
    # figsize=(40, 20) may have no effect -- confirm against the saved image.
    feature_df.boxplot(grid=False, rot=45, fontsize=10, figsize=(40, 20), showfliers=False)
    plt.title('Boxplot of all features ({})'.format(norm_tag))
    plt.grid()
    plt.tight_layout()
    plt.savefig(os.path.join(root_dir, 'boxplot.jpg'))
    plt.close()

In [56]:
def standardscaler_normalization_analysis(file_name, norm_tag, root_dir):
    """Plot histograms and a boxplot of the standardized features.

    Reads the CSV at ``file_name``, drops the ``Row`` and ``y`` columns,
    rescales every remaining column with ``StandardScaler`` and then saves
    one ``<feature>.jpg`` histogram per feature plus a single ``boxplot.jpg``
    covering all features.

    Args:
        file_name: Path of the CSV file to read.
        norm_tag: Label shown in parentheses in every figure title.
        root_dir: Directory the figures are saved into; it is not created
            here.
    """
    org_train_df = pd.read_csv(file_name)
    # Use `columns=` rather than a positional axis: pandas 2.0 removed the
    # positional `axis` argument of DataFrame.drop.
    org_feature_df = org_train_df.drop(columns=['Row', 'y'])  # remove ID and positions/targets
    all_features = org_feature_df.columns

    #---normalize features to zero mean and unit standard deviation
    scaled_feature_np = StandardScaler().fit_transform(org_feature_df.to_numpy())
    # The scaler returns a bare array; restore the column names for plotting.
    feature_df = pd.DataFrame(data=scaled_feature_np, columns=all_features)

    #---- plot histogram for each feature
    for cur_feature in all_features:
        plt.figure()
        feature_df[cur_feature].plot.hist(bins=12, alpha=0.5)
        plt.title('{} Histogram ({})'.format(cur_feature, norm_tag))
        plt.savefig(os.path.join(root_dir, '{}.jpg'.format(cur_feature)))
        # Close each figure so open figures do not pile up across features.
        plt.close()

    #---- plot boxplot for all features in one figure
    plt.figure()
    # NOTE(review): a figure already exists when boxplot() runs, so
    # figsize=(40, 20) may have no effect -- confirm against the saved image.
    feature_df.boxplot(grid=False, rot=45, fontsize=10, figsize=(40, 20), showfliers=False)
    plt.title('Boxplot of all features ({})'.format(norm_tag))
    plt.grid()
    plt.tight_layout()
    plt.savefig(os.path.join(root_dir, 'boxplot.jpg'))
    plt.close()

Below, we start to run our code.

In [57]:
#------ Start to run ------#
if __name__ == '__main__':

    # One output folder per normalization scheme.
    save_folder_list = ['feature_analysis/without_normalization',
                        'feature_analysis/standardscaler',
                        'feature_analysis/minmaxscaler']

    # Create whichever output folders are missing, reporting each outcome.
    for folder in save_folder_list:
        if os.path.exists(folder):
            print('There is no need to create any folders!')
            continue
        os.makedirs(folder)
        print('Congrats! Folder has been created!')

    input_file_name = 'processed_csv/processed_combined.csv'

    # (tag, analysis function) pairs, run in this order:
    #   - without normalization
    #   - normalization to be [0,1]
    #   - normalization to be zero mean and unit standard deviation (std)
    analysis_steps = [
        ('without_normalization', without_normalization_analysis),
        ('minmaxscaler', minmax_normalization_analysis),
        ('standardscaler', standardscaler_normalization_analysis),
    ]
    for norm_tag, run_analysis in analysis_steps:
        # Each scheme saves into the sub-folder that carries its tag.
        root_dir = 'feature_analysis/' + norm_tag
        run_analysis(input_file_name, norm_tag, root_dir)

    print('')
    print('>>>Congrats! Features analysis is done!')

Congrats! Folder has been created!
Congrats! Folder has been created!
Congrats! Folder has been created!

>>>Congrats! Features analysis is done!
