In [132]:
import pandas as pd
import numpy as np

In [133]:
!git clone https://github.com/aviana-michaelj/nemesis-ml.git

fatal: destination path 'nemesis-ml' already exists and is not an empty directory.


In [165]:
df = pd.read_csv('nemesis-ml/data/raw/Kaggle/train_sample.csv')
df=df.drop(columns = ['Unnamed: 0'])

In [161]:
df_type = pd.read_csv('nemesis-ml/data/raw/Kaggle/column_type.csv')

In [162]:
df.columns

Index(['Row_ID', 'Household_ID', 'Vehicle', 'Calendar_Year', 'Model_Year',
       'Blind_Make', 'Blind_Model', 'Blind_Submodel', 'Cat1', 'Cat2', 'Cat3',
       'Cat4', 'Cat5', 'Cat6', 'Cat7', 'Cat8', 'Cat9', 'Cat10', 'Cat11',
       'Cat12', 'OrdCat', 'Var1', 'Var2', 'Var3', 'Var4', 'Var5', 'Var6',
       'Var7', 'Var8', 'NVCat', 'NVVar1', 'NVVar2', 'NVVar3', 'NVVar4',
       'Claim_Amount'],
      dtype='object')

In [138]:
#function to get number of missing values in a column
def get_na_num(column): #input the whole column
    if column.dtype == np.object:
        return column.isnull().sum() + column[column == ''].count() + column[column == '?'].count()
    else:
        return column.isnull().sum()

In [139]:
#function to get number of valid values in a column
def get_valid_num(column): #input the whole column
    return len(column) - get_na_num(column)

In [140]:
#function to get minimum value in a column
def get_min(column): #input the whole column
    return min(column)

In [141]:
#function to get maximum value in a column
def get_max(column): #input the whole column
    return max(column)

In [142]:
#function to get mean in a column
def get_mean(column): #input the whole column
    return column.mean()

In [143]:
#function to get std in a column
def get_std(column): #input the whole column
    return column.std()

In [144]:
#function to get skewness in a column
def get_skew(column): #input the whole column
    return column.skew()

In [145]:
#function to get number of distict values in a column
def get_distinct_num(column): #input the whole column
    return len(column.unique().tolist())

In [146]:
#function to get count of each distict value in a column
def get_distinct_count(column): #input the whole column
    if get_distinct_num(column) > 5:
        print('Number of distict values is larger than 5. We stop updating the number of distinct values')
    else:
        return column.value_counts()

In [147]:
#function to get median in a column
import statistics
def get_median(column):
    if get_distinct_num(column) > 5:
        print('Number of distict values is larger than 5. We do not calculate median')
    else:
        return statistics.median(column)

In [148]:
#function to get mode and count for the mode in a column
def get_mode(column):
    return (column.mode()[0],column[column==column.mode()[0]].count())

In [149]:
#function to check whether a numerical column is continuous or discrete
def check_cont_or_dis(column):
    if get_distinct_num(column) > max(0.0001 * get_valid_num(column), 10):
        return ('continuous')
    else:
        return ('discrete')

In [150]:
# funtion to get column type
def column_type(column):
    return (df_type.loc[df_type['Variable'] == column, 'Type'].iloc[0])

In [151]:
#function to do basic variable screening and create basic statistical report
def Stats_Collection(df):
    for c in df:
        #exclude Target 
        if (column_type(c) != 'Flag_Continuous' and column_type(c) != 'Flag_Categorical'):
            print('Variable name: ',c)

            #Basic variable screening
            if get_na_num(df[c])/len(df[c]) > 0.5:
                print('More 50% missing values, drop this column\n')
                df = df.drop(columns=c)
                continue
            if (column_type(c) == 'Delete'):
                print('Column type is Delete, drop this column\n')
                df = df.drop(columns=c)
                continue
            if (column_type(c) == 'Continuous') and (get_min(df[c]) == get_max(df[c])):
                print('All same value, drop this column\n')
                df = df.drop(columns=c)
                continue
            if (column_type(c) == 'Ordinal' or column_type(c) == 'Nominal') and (get_mode(df[c])[1]/get_valid_num(df[c]) > 0.95):
                print('Mode contains more than 95% cases, drop this column\n')
                df = df.drop(columns=c)
                continue
            if (column_type(c) == 'Nominal') and (get_distinct_num(df[c]) > 100):
                print('More than 100 categories, drop this column\n')
                df = df.drop(columns=c)
                continue

            #Basic statistic report
            print('Variable type: ', column_type(c))
            print ('Number of missing values: ',get_na_num(df[c]))
            print ('Number of valid values: ',get_valid_num(df[c]))
            if column_type(c) == 'Continuous' or column_type(c) == 'Ordinal':
                print('Minimum value: ', get_min(df[c]))
                print('Maximum value: ', get_max(df[c]))
            if column_type(c) == 'Continuous':
                print('Mean: ',get_mean(df[c]))
                print('Standard Deviation: ',get_std(df[c]))
                print('Skewness: ',get_skew(df[c]))
                print('Number of distinct values: ',get_distinct_num(df[c]))
                print('Number of cases for each distinct value: ')
                print(get_distinct_count(df[c]))
            else:
                print('Number of categories: ', get_distinct_num(df[c]))
                print('The counts of each category: ')
                print(get_distinct_count(df[c]))
                print('Mode: ', get_mode(df[c])[0],'Count: ',get_mode(df[c])[1])                
                print()

In [158]:
Stats_Collection(df)

Variable name:  Row_ID
Variable name:  Household_ID
More than 100 categories, drop this column

Variable name:  Vehicle
Variable type:  Continuous
Number of missing values:  0
Number of valid values:  5000
Minimum value:  1
Maximum value:  14
Mean:  2.2438
Standard Deviation:  1.4993369075136003
Skewness:  2.0299344354564077
Number of distinct values:  14
Number of cases for each distinct value: 
Number of distict values is larger than 5. We stop updating the number of distinct values
None
Variable name:  Calendar_Year
Variable type:  Nominal
Number of missing values:  0
Number of valid values:  5000
Number of categories:  2
The counts of each category: 
2005    4026
2006     974
Name: Calendar_Year, dtype: int64
Mode:  2005 Count:  4026

Variable name:  Model_Year
Variable type:  Nominal
Number of missing values:  0
Number of valid values:  5000
Number of categories:  27
The counts of each category: 
Number of distict values is larger than 5. We stop updating the number of distinct va

Number of valid values:  5000
Minimum value:  -0.2661168
Maximum value:  8.8830814
Mean:  0.04335701476000005
Standard Deviation:  0.9567237370598799
Skewness:  3.172265712726865
Number of distinct values:  12
Number of cases for each distinct value: 
Number of distict values is larger than 5. We stop updating the number of distinct values
None
Variable name:  NVVar3
Variable type:  Continuous
Number of missing values:  0
Number of valid values:  5000
Minimum value:  -0.2723372000000001
Maximum value:  5.7033172
Mean:  0.011386864219999925
Standard Deviation:  0.9189255840034346
Skewness:  3.2891866042487012
Number of distinct values:  11
Number of cases for each distinct value: 
Number of distict values is larger than 5. We stop updating the number of distinct values
None
Variable name:  NVVar4
Variable type:  Continuous
Number of missing values:  0
Number of valid values:  5000
Minimum value:  -0.2514189
Maximum value:  6.3888025
Mean:  -0.060534660159999996
Standard Deviation:  0.65