In [1]:
import pandas as pd
import numpy as np
from itertools import chain, combinations
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

# Functions

In [2]:
def load_csv_data(filename):
    
    '''
    Loads CSV and converts values to float
    '''
    
    df = pd.read_csv(filename, dtype=float)
    
    return df

In [9]:
def load_excel_data(filename):
    
    '''
    Loads Excel file and converts values to float
    '''
    
    df = pd.read_excel(filename, convert_float=True)
    
    return df

In [3]:
def create_powerset_dict(df, depen_var):
    
    '''
    creates a dict containing the powerset of the dependent variables
    
    (Thanks to Stackoverflow #18826571 for the powerset generator)
    '''
    
    # get dependent variable names
    indep_vars = [i for i in df.columns.values.tolist() if i != depen_var]
    
    # create a powerset of all subsets of dependent variables
    powerset = chain.from_iterable(combinations(indep_vars, r) for r in range(len(indep_vars)+1))
    
    # iterate through the powerset and create a dict with each subset of dependent variables
    i = 0
    powerset_dict = {}
    for item in powerset:
        if len(item) > 0:
            powerset_dict[i] = {}
            powerset_dict[i]['indep_vars'] = list(item)
            powerset_dict[i]['depen_var'] = depen_var
            powerset_dict[i]['num_depen_vars'] = len(item)
            i += 1
    
    return powerset_dict

In [4]:
def prep_for_classification(df, depen_var, indep_vars):
    
    '''
    Drop blanks and create two dataframes and convert to Numpy arrays
    '''
    
    # convert blank spaces and missing values to NAN
    df.replace(r'\s+', np.nan, regex=True).replace('',np.nan)
    
    # drop entire row if any value is NAN
    df_drop_na = df.dropna(axis=0, how='any')
    
    # convert to Numpy array
    x = df[indep_vars].as_matrix()
    y = df[depen_var].as_matrix()

    return x, y

In [5]:
def linear_classification_analysis(x_train, y_train, x_test, y_test):
    
    '''
    Use SKLEARN to classify
    '''
    
    clf = LinearDiscriminantAnalysis()
    clf.fit(x_train, y_train)
    train_score = clf.score(x_train, y_train)
    test_score = clf.score(x_test, y_test)

    return train_score, test_score

In [6]:
def classification_iterator(train_df, test_df, powerset_dict):
    
    '''
    Iterate through each item in the dependent variables powerset and use as input for classification analysis
    '''
    
    for item in powerset_dict:
        
        # combine dependent and independent to get all variables
        indep_vars = powerset_dict[item]['indep_vars']
        depen_var = powerset_dict[item]['depen_var']
        all_vars = [depen_var] + indep_vars
        
        # prep training data for classification analysis
        train_df_subset = train_df[all_vars]
        x_train, y_train = prep_for_classification(train_df_subset, depen_var, indep_vars)
        
        # prep test data for classification analysis
        test_df_subset = test_df[all_vars]
        x_test, y_test = prep_for_classification(test_df_subset, depen_var, indep_vars)
        
        # run classification analysis are return hitrates
        training_hitrate, test_hitrate = linear_classification_analysis(x_train, y_train, x_test, y_test)
        
        # store hitrates
        powerset_dict[item]['training_hitrate'] = training_hitrate
        powerset_dict[item]['training_sample_size'] = len(train_df)
        powerset_dict[item]['test_hitrate'] = test_hitrate
        powerset_dict[item]['test_sample_size'] = len(test_df_subset)
        
    return powerset_dict        

# Main Program

In [16]:
train = load_excel_data('../Data/classification_analysis/ubtrain.xlsx')
test = load_excel_data('../Data/classification_analysis/ubtest.xlsx')

In [17]:
train.drop(['GROUP', 'ID'], axis=1, inplace=True)
test.drop(['GROUP', 'ID'], axis=1, inplace=True)

In [18]:
train.head()

Unnamed: 0,Age,Experience,Income,ZIP,Family,CCAvg,Education,Educgrad,Educprof,Mortgage,PersonalLoan,SecuritiesAccount,CDAccount,Online,CreditCard
0,25,1,49,91107,4,1.6,1,0,0,0,0,1,0,0,0
1,39,15,11,94720,1,1.0,1,0,0,0,0,0,0,0,0
2,35,9,100,94112,1,2.7,2,1,0,0,0,0,0,0,0
3,35,8,45,91330,4,1.0,2,1,0,0,0,0,0,0,1
4,37,13,29,92121,4,0.4,2,1,0,155,0,0,0,1,0


In [19]:
powerset_dict = create_powerset_dict(train, 'PersonalLoan')

In [22]:
%timeit -n 1 -r 1 classified = classification_iterator(train, test, powerset_dict)



1 loop, best of 1: 5min 51s per loop


In [36]:
whos

Variable                         Type         Data/Info
-------------------------------------------------------
LinearDiscriminantAnalysis       type         <class 'sklearn.discrimin<...>earDiscriminantAnalysis'>
chain                            type         <type 'itertools.chain'>
classification_iterator          function     <function classification_<...>erator at 0x7f97a59b6cf8>
combinations                     type         <type 'itertools.combinations'>
create_powerset_dict             function     <function create_powerset_dict at 0x7f97a59aef50>
df                               DataFrame              depen_var      <...>n[16383 rows x 7 columns]
linear_classification_analysis   function     <function linear_classifi<...>alysis at 0x7f97d0b35398>
load_data                        function     <function load_data at 0x7f97a59b68c0>
load_excel_data                  function     <function load_excel_data at 0x7f97a59b6c08>
np                               module       <module 'nump

In [25]:
len(powerset_dict)

16383

In [27]:
df = pd.DataFrame(powerset_dict).T

In [34]:
df.sort_values('num_depen_vars', inplace=True, ascending=True)
df.sort_values('test_hitrate', ascending=False, inplace=True)

In [35]:
df.head()

Unnamed: 0,depen_var,indep_vars,num_depen_vars,test_hitrate,test_sample_size,training_hitrate,training_sample_size
16148,PersonalLoan,"[Age, Income, ZIP, Family, CCAvg, Education, E...",11,0.9476,2500,0.9392,2500
16298,PersonalLoan,"[Age, Experience, Income, ZIP, Family, CCAvg, ...",12,0.9476,2500,0.938,2500
11189,PersonalLoan,"[Age, Income, Family, Educgrad, Educprof, Mort...",8,0.9476,2500,0.944,2500
15464,PersonalLoan,"[Age, Income, ZIP, Family, Education, Educgrad...",10,0.9476,2500,0.944,2500
16209,PersonalLoan,"[Experience, Income, ZIP, Family, CCAvg, Educa...",11,0.9476,2500,0.9392,2500
