In [2]:
import xgboost as xgb
import pandas as pd
import numpy as np

In [3]:
# Load the discrete-target dataset; the path is relative, so it resolves
# against the directory the notebook kernel is running in.
discrete = pd.read_csv('../data/processed/USC_Discrete_Target_Dataset.csv')

In [20]:
# Show (number of rows, number of columns) of the loaded frame.
discrete.shape

(14112, 14)

In [5]:
# Load the per-variable metadata table (the helpers below read its
# 'Variable' and 'Type' columns).
# NOTE(review): the name `column_type` is later rebound by `def column_type(...)`
# further down, which discards this DataFrame — consider giving the table its own name.
column_type = pd.read_csv('../data/processed/USC_Discret_column_type.csv')

In [6]:
#function to get number of missing values in a column
def get_na_num(column): #input the whole column
    """Count the missing entries in a column.

    Null values (NaN/None) always count as missing. For object-dtype
    columns, empty strings and the '?' placeholder count as missing too.

    Parameters
    ----------
    column : pandas.Series
        The whole column to inspect.

    Returns
    -------
    int
        Number of entries treated as missing.
    """
    missing = column.isnull().sum()
    # The `np.object` alias was deprecated in NumPy 1.20 and removed in 1.24;
    # the builtin `object` compares equal to the same dtype on every version.
    if column.dtype == object:
        blanks = column[column == ''].count()
        placeholders = column[column == '?'].count()
        missing = missing + blanks + placeholders
    return missing

In [7]:
#function to get number of valid values in a column
def get_valid_num(column):
    """Return how many entries of ``column`` are not counted as missing.

    ``column`` is the whole column; "missing" is whatever ``get_na_num``
    reports for it.
    """
    total_entries = len(column)
    missing_entries = get_na_num(column)
    return total_entries - missing_entries

In [8]:
#function to get minimum value in a column
def get_min(column): #input the whole column
    """Return the smallest non-missing value in a column.

    Parameters
    ----------
    column : pandas.Series
        The whole column to inspect.

    Returns
    -------
    scalar
        The minimum of the non-missing values; NaN when the column is empty
        or contains only missing values.
    """
    # The builtin min() compares element by element, so its result depends on
    # where NaN sits (a leading NaN is returned as the "minimum").
    # Series.min() skips missing values instead.
    return column.min()

In [9]:
#function to get maximum value in a column
def get_max(column): #input the whole column
    """Return the largest non-missing value in a column.

    Parameters
    ----------
    column : pandas.Series
        The whole column to inspect.

    Returns
    -------
    scalar
        The maximum of the non-missing values; NaN when the column is empty
        or contains only missing values.
    """
    # The builtin max() compares element by element, so its result depends on
    # where NaN sits (a leading NaN is returned as the "maximum").
    # Series.max() skips missing values instead.
    return column.max()

In [10]:
#function to get mean in a column
def get_mean(column):
    """Return the mean of ``column`` (the whole column), as computed by its ``mean()`` method."""
    average = column.mean()
    return average

In [11]:
#function to get std in a column
def get_std(column):
    """Return the standard deviation of ``column`` (the whole column), as computed by its ``std()`` method."""
    spread = column.std()
    return spread

In [12]:
#function to get skewness in a column
def get_skew(column):
    """Return the skewness of ``column`` (the whole column), as computed by its ``skew()`` method."""
    skewness = column.skew()
    return skewness

In [13]:
#function to get number of distinct values in a column
def get_distinct_num(column):
    """Return the number of distinct values found in ``column`` (the whole column)."""
    distinct_values = column.unique().tolist()
    return len(distinct_values)

In [14]:
#function to get count of each distinct value in a column
def get_distinct_count(column):
    """Return the per-value counts of ``column`` when it has at most 5 distinct values.

    With more than 5 distinct values a notice is printed and nothing is
    returned (the result is ``None``).
    """
    if get_distinct_num(column) <= 5:
        return column.value_counts()
    print('Number of distict values is larger than 5. We stop updating the number of distinct values')

In [15]:
#function to get median in a column
import statistics
def get_median(column):
    """Return the median of a column that has at most 5 distinct values.

    Parameters
    ----------
    column : pandas.Series
        The whole column to inspect.

    Returns
    -------
    scalar or None
        The median of the non-missing values; NaN when no non-missing value
        is left; ``None`` (after printing a notice) when the column has more
        than 5 distinct values.
    """
    if get_distinct_num(column) > 5:
        print('Number of distict values is larger than 5. We do not calculate median')
        return None
    # statistics.median sorts the raw values, and NaN has no defined sort
    # position, so missing entries must be removed before calling it.
    valid = column.dropna()
    if valid.empty:
        return float('nan')
    return statistics.median(valid)

In [16]:
#function to get mode and count for the mode in a column
def get_mode(column):
    """Return ``(mode, count)`` for ``column``.

    ``mode`` is the first entry of ``column.mode()`` and ``count`` is the
    number of entries equal to it.
    """
    top_value = column.mode()[0]
    occurrences = column[column == top_value].count()
    return (top_value, occurrences)

In [17]:
#Function to get target variable
def get_target(df,df_type):
    """Return the name of the first column of ``df`` whose type is a target flag.

    A column is a target when ``column_type`` reports 'Flag_Continuous' or
    'Flag_Categorical' for it in ``df_type``. Returns ``None`` when no column
    qualifies.
    """
    target_kinds = ('Flag_Continuous', 'Flag_Categorical')
    for name in df:
        if column_type(name, df_type) in target_kinds:
            return name
    return None

In [18]:
# function to get column type
def column_type(column_name,df_type):
    """Look up the declared type of a variable.

    Returns the 'Type' value of the first row of ``df_type`` whose 'Variable'
    equals ``column_name``.
    """
    is_requested = df_type['Variable'] == column_name
    declared_types = df_type.loc[is_requested, 'Type']
    return declared_types.iloc[0]

In [32]:
#function to do basic variable screening and create basic statistical report
def Stats_Collection(df,df_type):
    """Screen every non-target column and print a basic statistics report.

    For each column of ``df`` whose type (looked up in ``df_type``) is not a
    target flag, the screening rules are checked in order; the first rule that
    matches prints its message and removes the column from ``df`` and its row
    from ``df_type``. Columns that survive get a printed summary.

    Returns the tuple ``(df, df_type)`` with the dropped columns/rows removed.
    """
    target_kinds = ('Flag_Continuous', 'Flag_Categorical')
    for c in df:
        kind = column_type(c, df_type)
        #exclude Target
        if kind not in target_kinds:
            print('Variable name: ',c)
            values = df[c]

            #Basic variable screening: the first matching rule picks the message
            drop_message = None
            if get_na_num(values)/len(values) > 0.5:
                drop_message = 'More 50% missing values, drop this column\n'
            elif kind == 'Delete':
                drop_message = 'Column type is Delete, drop this column\n'
            elif kind == 'Continuous' and get_min(values) == get_max(values):
                drop_message = 'All same value, drop this column\n'
            elif kind in ('Ordinal', 'Nominal') and get_mode(values)[1]/get_valid_num(values) > 0.95:
                drop_message = 'Mode contains more than 95% cases, drop this column\n'
            elif kind == 'Nominal' and get_distinct_num(values) > 100:
                drop_message = 'More than 100 categories, drop this column\n'

            if drop_message is not None:
                print(drop_message)
                # Remove the column and its metadata row, then move on to the next column
                df = df.drop(columns=c)
                df_type = df_type.drop(index=int(df_type[df_type.Variable == c].index[0]))
                continue

            #Basic statistic report
            print('Variable type: ', kind)
            print ('Number of missing values: ',get_na_num(values))
            print ('Number of valid values: ',get_valid_num(values))
            if kind in ('Continuous', 'Ordinal'):
                print('Minimum value: ', get_min(values))
                print('Maximum value: ', get_max(values))
            if kind == 'Continuous':
                print('Mean: ',get_mean(values))
                print('Standard Deviation: ',get_std(values))
                print('Skewness: ',get_skew(values))
                print('Number of distinct values: ',get_distinct_num(values))
                print('Number of cases for each distinct value: ')
                print(get_distinct_count(values))
            else:
                print('Number of categories: ', get_distinct_num(values))
                print('The counts of each category: ')
                print(get_distinct_count(values))
                mode_value, mode_count = get_mode(values)
                print('Mode: ', mode_value,'Count: ',mode_count)
        print()
    return(df,df_type)

In [19]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [22]:
# split data into X and Y
# NOTE(review): the positions are hard-coded — columns 0..12 become the features
# and column 13 the label, which matches the 14-column shape shown above.
# Confirm that the last column really is the target.
X = discrete.iloc[:,0:13]
Y = discrete.iloc[:,13]

In [24]:
# split data into train and test sets
# `seed` fixes the shuffling so the split is reproducible;
# `test_size` holds out 33% of the rows for testing.
seed = 7
test_size = 0.33
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=seed)


In [26]:
# fit model on training data
# No hyperparameters are passed, so the classifier uses its defaults.
model = XGBClassifier()
model.fit(x_train, y_train)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='binary:logistic', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

In [28]:
# make predictions for test data (the held-out rows from the split above)
y_pred = model.predict(x_test)

In [31]:
# evaluate predictions
# accuracy_score is scaled by 100 to print it as a percentage.
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: %.2f%%" % (accuracy * 100.0))

Accuracy: 83.62%
