In [1]:
# packages that may be needed
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")


ModuleNotFoundError: No module named 'pandas'

In [None]:
# Load the HELOC dataset; the path is relative to the current working directory.
df = pd.read_csv("./heloc_dataset_v1.csv")
# Single seed for the notebook so that random steps can be repeated.
seed = 668
# Seeded preview: the same five rows are displayed on every run.
df.sample(5, random_state=seed)

In [None]:
# Rows in which ExternalRiskEstimate holds the special value -9.
df.loc[df["ExternalRiskEstimate"] == -9]

In [None]:
# Size and labels of the raw frame.
n_rows, n_cols = df.shape
col_names = df.columns.tolist()
row_indexes = df.index.tolist()

In [None]:
# Column dtypes and non-null counts.
df.info()
# Seems like there are no nulls.
# NOTE(review): the data dictionary quoted further down says missing values are
# encoded as -7/-8/-9, so "no null" here does not mean "nothing is missing".

In [None]:
# Summary statistics of the columns.
df.describe()
# Open question: is there any problem with the units?

In [None]:
# (number of columns, number of rows) that contain at least one null.
null_mask = df.isnull()
(null_mask.any(axis=0).sum(), null_mask.any(axis=1).sum())

In [None]:
# Split the column names by dtype.
# is_integer_dtype matches every integer width, whereas comparing a dtype with
# the builtin `int` only matches the platform-default integer dtype, so the
# previous test could misclassify integer columns depending on the platform.
is_integer_column = df.dtypes.map(pd.api.types.is_integer_dtype).to_numpy(dtype=bool)
cols_numeric = list(df.columns[is_integer_column])
cols_string = list(df.columns[~is_integer_column])

In [None]:
# One histogram per column; the trailing ';' hides the text repr of the returned axes.
df.hist(figsize=(15,15));

#### Many columns share the same few negative outlier values

In [None]:
# Histogram of a single column: Series.hist with 50 bins on a wide figure.
df['ExternalRiskEstimate'].hist(figsize=(15,5), bins=50); # trailing ';' suppresses the text output of the call

It appears that values mostly vary between 30 and 90, and there is a cluster of negative values.

According to the documentation of the data (available in the Excel file `heloc_data_dictionary-2.xlsx`), the column ExternalRiskEstimate refers to the "Consolidated version of risk markers." 

The file also indicates that three special values are used to encode missing values: -7, -8, and -9.

Let's look at some of the rows that contain missing values.

#### Let's deal with missing values

In [None]:
# Rows whose ExternalRiskEstimate carries the special "missing" code -9.
df_missing_ExternalRiskEstimate = df[df.ExternalRiskEstimate == -9]
n_rows_with_missing_ExternalRiskEstimate = len(df_missing_ExternalRiskEstimate)

# True for rows in which every column after the first equals -9
# (assumes the first column is the non-numeric label - TODO confirm).
# Testing each cell, rather than comparing the row sum with (n_cols-1)*(-9),
# cannot be fooled by a different mix of values that adds up to the same total.
all_numeric_missing = (df.iloc[:, 1:] == -9).all(axis=1)
n_rows_with_all_numeric_missing = int(all_numeric_missing.sum())

# Keep only the rows that still carry at least one value other than -9.
df_without_missing_rows = df.loc[~all_numeric_missing]

In [None]:
# (rows with ExternalRiskEstimate == -9, rows in which every numeric column is -9)
(n_rows_with_missing_ExternalRiskEstimate, n_rows_with_all_numeric_missing)

#### So 588 of the 599 observations whose ExternalRiskEstimate is missing (-9) have all numeric columns missing

In [None]:
# Box plot of the cleaned frame's columns with vertical x tick labels.
boxplot_ax = df_without_missing_rows.boxplot(figsize=(15, 5))
plt.setp(boxplot_ax.get_xticklabels(), rotation=90);

In [None]:
# Mean of every feature within each RiskPerformance class, with features as rows.
grouped_by_outcome = df_without_missing_rows.groupby("RiskPerformance")
df_avg_feature_value_per_group = grouped_by_outcome.mean().transpose()
df_avg_feature_value_per_group

In [None]:
# Move the label column to the end, keeping the other columns in their current order.
# Selecting by name, instead of dropping whatever sits at position 0, avoids
# silently losing a feature if the label is not the first column.
feature_columns = [c for c in df_without_missing_rows.columns if c != "RiskPerformance"]
df_without_missing_rows = df_without_missing_rows[feature_columns + ["RiskPerformance"]]

In [None]:
# Preview the first rows after the label column was moved to the end.
df_without_missing_rows.head()

In [None]:
# (rows, columns) left after dropping the rows in which every numeric column is -9.
df_without_missing_rows.shape

In [None]:
# generate features and labels
# NOTE: `df` is rebound to the cleaned frame from this cell onwards.
df = df_without_missing_rows
# Select by column name rather than by a hard-coded column count (23), so the
# split stays correct if columns are added, removed or reordered.
X = df.drop(columns=["RiskPerformance"])
Y = df["RiskPerformance"]
# Encode the target as 1 for "Bad" and 0 for every other value.
Y = (Y == "Bad").astype(int)



In [None]:
# Per-column count of cells that still equal -9 in the cleaned frame.
df.eq(-9).sum()

In [None]:
# Rows of the cleaned frame whose ExternalRiskEstimate is still -9.
df.query("ExternalRiskEstimate == -9")

In [None]:
# Share of the cleaned rows whose ExternalRiskEstimate is still -9.
n_still_missing = len(df.loc[df["ExternalRiskEstimate"] == -9])
n_still_missing / len(df)

In [None]:
# Hold out 20% of the rows as a test set.
from sklearn.model_selection import train_test_split

# `seed` is the notebook-wide seed defined when the data was loaded (668).
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=seed
)

In [None]:
# Training features and encoded label side by side, matched on the row index.
df_train = X_train.join(Y_train)

In [None]:
# Mean of the 0/1 label (i.e. the share of "Bad") for each distinct
# ExternalRiskEstimate value in the training data.
means = df_train.groupby("ExternalRiskEstimate")["RiskPerformance"].mean()
means

In [None]:
# Number of training rows for each distinct ExternalRiskEstimate value.
counts = df_train.groupby("ExternalRiskEstimate")["RiskPerformance"].count()

In [None]:
# Display the per-value row counts computed in the previous cell.
counts

In [None]:
# Side by side: label mean per score value (left) and row count per score value (right).
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(20, 7))
ax_means, ax_counts = axes
means.plot.bar(ax=ax_means)
counts.plot.bar(ax=ax_counts)
fig.tight_layout()

In [None]:
# Drop the rows whose ExternalRiskEstimate is -9 from both splits, then keep
# only the labels that belong to the surviving rows.
keep_train = X_train["ExternalRiskEstimate"].ne(-9)
keep_test = X_test["ExternalRiskEstimate"].ne(-9)
X_train = X_train.loc[keep_train]
X_test = X_test.loc[keep_test]
Y_train = Y_train.loc[X_train.index]
Y_test = Y_test.loc[X_test.index]

In [None]:
# Preprocessing pipeline: keep the original columns, append indicator columns
# for the special values -7 and -8, then replace those values by column means.
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.compose import ColumnTransformer
from sklearn.impute import MissingIndicator, SimpleImputer

# The only transformer is given an empty column list; with
# remainder='passthrough' every input column is forwarded unchanged.
do_nothing_imputer = ColumnTransformer(
    transformers=[("Imputer -7 to mean", SimpleImputer(missing_values=-7, strategy='mean'), [])],
    remainder='passthrough',
)

# One indicator column per feature in which the special value occurs during fit.
flag_minus_7 = MissingIndicator(missing_values=-7, features="missing-only")
flag_minus_8 = MissingIndicator(missing_values=-8, features="missing-only")
feature_expansion = FeatureUnion(transformer_list=[
    ("do nothing", do_nothing_imputer),
    ("missing_minus_7", flag_minus_7),
    ("missing_minus_8", flag_minus_8),
])

# Two-stage imputation: recode -7 as -8, then replace every -8 by the column mean.
recode_minus_7 = SimpleImputer(missing_values=-7, strategy="constant", fill_value=-8)
mean_for_minus_8 = SimpleImputer(missing_values=-8, strategy="mean")
pipeline = Pipeline(steps=[
    ("expand features", feature_expansion),
    ("replace -7 with -8", recode_minus_7),
    ("replace -8 with average", mean_for_minus_8),
])
arr_X_train_t = pipeline.fit_transform(X_train)

In [None]:
# (rows, columns) of the transformed training array: original columns plus the indicator columns.
arr_X_train_t.shape

In [None]:
# Rebuild the column names of the pipeline output: the original columns followed
# by one "<column>=-7" / "<column>=-8" name per indicator column. The indicators
# are refitted here only to read which features they flag.
minus_7_indicator_transformer = MissingIndicator(missing_values=-7, features='missing-only')
minus_7_indicator_transformer.fit(X_train)
minus_8_indicator_transformer = MissingIndicator(missing_values=-8, features='missing-only')
minus_8_indicator_transformer.fit(X_train)

original_names = X_train.columns.values
col_names_minus_7 = [f"{name}=-7" for name in original_names[minus_7_indicator_transformer.features_]]
col_names_minus_8 = [f"{name}=-8" for name in original_names[minus_8_indicator_transformer.features_]]
columns_all = original_names.tolist() + col_names_minus_7 + col_names_minus_8
column_names = columns_all



In [None]:
# Wrap the transformed arrays in DataFrames. Reusing the original row labels
# keeps X_train_t / X_test_t aligned with Y_train / Y_test for label-based
# pandas operations; a default RangeIndex would not match the labels carried by Y.
X_train_t = pd.DataFrame(arr_X_train_t, columns=column_names, index=X_train.index)
X_test_t = pd.DataFrame(pipeline.transform(X_test), columns=column_names, index=X_test.index)

In [None]:
# Display the transformed training frame (pandas truncates the rendering of long frames).
X_train_t

In [None]:
from sklearn.model_selection import cross_validate
from sklearn import tree, linear_model, neighbors

# Baseline 5-fold cross-validation of three untuned classifiers on the
# transformed training data. The tree gets a fixed random_state so that its
# score is repeatable from run to run.
cv_results_tree = cross_validate(tree.DecisionTreeClassifier(random_state=seed), X_train_t, Y_train, cv=5, return_estimator=True)
cv_results_log_reg = cross_validate(linear_model.LogisticRegression(max_iter=10000), X_train_t, Y_train, cv=5, return_estimator=True)
cv_results_knn = cross_validate(neighbors.KNeighborsClassifier(), X_train_t, Y_train, cv=5, return_estimator=True)

In [None]:
# Mean test-fold accuracy of each baseline model (spelling of "regression" fixed in the output).
print('Classification tree - CV accuracy score %.3f' % cv_results_tree['test_score'].mean())
print('Logistic regression - CV accuracy score %.3f' % cv_results_log_reg['test_score'].mean())
print('KNN - CV accuracy score %.3f' % cv_results_knn['test_score'].mean())

In [None]:
from sklearn.model_selection import GridSearchCV

# Hyper-parameter grid for the decision tree: 10 * 2 * 3 * 3 = 180 candidates.
# NOTE(review): every min_samples_leaf here is >= 10, so a node needs at least
# 20 samples to be split; min_samples_split values up to 10 therefore look
# redundant - confirm before trimming the grid.
param_grid = [{'max_depth': list(range(1, 11)),
               'criterion': ["gini", "entropy"],
               'min_samples_split': [2, 5, 10],
               'min_samples_leaf': [10, 20, 30]}]
# Fixed random_state so the selected parameters do not change between runs.
clf_tree = tree.DecisionTreeClassifier(random_state=seed)
grid_search = GridSearchCV(clf_tree, param_grid, cv=3, scoring='accuracy')
grid_search.fit(X_train_t, Y_train)

In [None]:
# Parameter combination with the best mean cross-validated accuracy.
grid_search.best_params_

In [None]:
# Best classifier found by the search; with GridSearchCV's default refit=True it is
# refitted on all the data passed to fit() (X_train_t here), not on the full dataset.
grid_search.best_estimator_

In [None]:
# Re-score the tuned tree with 5-fold CV. The hyper-parameters are read from the
# grid search instead of being typed in by hand, so this cell cannot drift out
# of sync with the search result; random_state keeps the score repeatable.
best_tree = tree.DecisionTreeClassifier(random_state=seed, **grid_search.best_params_)
cv_best = cross_validate(best_tree, X_train_t, Y_train, cv=5, return_estimator=True)

In [None]:
# Mean accuracy over the five test folds for the tuned tree.
cv_best["test_score"].mean()

In [None]:
from sklearn.model_selection import GridSearchCV, RepeatedStratifiedKFold
from sklearn.linear_model import LogisticRegression

# Search only solver/penalty pairs that LogisticRegression accepts. A full
# cross-product of the three solvers with four penalties mostly yields invalid
# pairs (newton-cg/lbfgs with l1, liblinear without a penalty, and elasticnet
# with any of these solvers), whose fits fail and score NaN.
c_values = [1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1, 10, 100]
space = [
    {'solver': ['newton-cg', 'lbfgs'], 'penalty': ['l2'], 'C': c_values},
    {'solver': ['liblinear'], 'penalty': ['l1', 'l2'], 'C': c_values},
    # Unpenalised fit: C has no effect, so it is not varied.
    # NOTE(review): releases that removed the string 'none' expect None here;
    # on older releases that only know 'none', adjust this entry.
    {'solver': ['newton-cg', 'lbfgs'], 'penalty': [None]},
]
# Same iteration budget as the baseline logistic regression, so that
# convergence does not depend on the default limit.
lr = LogisticRegression(max_iter=10000)
cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=seed)
search = GridSearchCV(lr, space, scoring="accuracy", n_jobs=-1, cv=cv)
search.fit(X_train_t, Y_train)

In [None]:
# Logistic regression configuration with the best mean cross-validated accuracy.
search.best_estimator_

In [None]:
# 5-fold CV on the training data for logistic regression with C=0.1 and the
# newton-cg solver; the result overwrites the earlier `cv_best`.
tuned_log_reg = LogisticRegression(solver='newton-cg', C=0.1)
cv_best = cross_validate(tuned_log_reg, X_train_t, Y_train, cv=5, return_estimator=True)

In [None]:
# Mean accuracy over the five test folds (training data).
cv_best['test_score'].mean()

In [None]:
# 5-fold CV of the same configuration, run on the test split only.
# NOTE(review): this fits new models on folds of the test data, so it does not
# measure how a model trained on X_train_t generalises; the held-out evaluation
# is the fit-on-train / score-on-test cell further down. Confirm this cell is wanted.
cv_best = cross_validate(LogisticRegression(C=0.1, solver = 'newton-cg'), X_test_t,Y_test, cv=5,return_estimator = True)

In [None]:
# Mean accuracy over the five folds of the test-split cross-validation above.
cv_best['test_score'].mean()

In [None]:
## Test accuracy score using best param on whole test set
# Fit on the whole transformed training set, then score once on the test set.
lr = LogisticRegression(solver='newton-cg', C=0.1).fit(X_train_t, Y_train)
lr.score(X_test_t, Y_test)


In [None]:
# All in one pipeline: the same preprocessing steps as before, followed by the
# logistic regression, fitted on the untransformed training features.
do_nothing_imputer = ColumnTransformer(
    transformers=[("Imputer -7 to mean", SimpleImputer(missing_values=-7, strategy='mean'), [])],
    remainder='passthrough',
)

feature_expansion = FeatureUnion(transformer_list=[
    ("do nothing", do_nothing_imputer),
    ("missing_minus_7", MissingIndicator(missing_values=-7, features="missing-only")),
    ("missing_minus_8", MissingIndicator(missing_values=-8, features="missing-only")),
])

# Preprocessing first, classifier last.
model_steps = [
    ("expand features", feature_expansion),
    ("replace -7 with -8", SimpleImputer(missing_values=-7, strategy="constant", fill_value=-8)),
    ("replace -8 with average", SimpleImputer(missing_values=-8, strategy="mean")),
    ('classifier', LogisticRegression(C=0.1, solver='newton-cg')),
]
final_model = Pipeline(steps=model_steps)

final_model.fit(X_train, Y_train)
# Accuracy of the whole pipeline on the raw test features.
final_model.score(X_test, Y_test)

In [None]:
# Odds ratios of the fitted logistic regression.
# Numerical feature: increasing the feature by one unit multiplies the
# estimated odds by exp(coefficient).
coefficients = final_model.named_steps['classifier'].coef_

# One row per feature; coef_ holds a single row for this two-class problem.
res = pd.DataFrame({
    'feature': X_train_t.columns,
    'Odds_ratios': np.exp(coefficients)[0],
})

In [None]:
# Bar chart of the odds ratios, features ordered by increasing odds ratio, saved to disk.
import matplotlib.pyplot as plt
import seaborn as sns
plt.figure(figsize=(30, 25))
# Rotate the x tick labels by 90 degrees.
# NOTE(review): this runs before the bars are drawn - confirm the rotation is
# still applied in the rendered and saved figure.
plt.xticks(rotation=90)
sns.barplot(data = res,x= 'feature', y = 'Odds_ratios',palette = 'magma',order=res.sort_values('Odds_ratios').feature)
# No file extension is given, so the output format falls back to matplotlib's savefig default.
plt.savefig('Odds_ratios_visualization')

In [None]:
# Table of features and their odds ratios.
res

In [None]:
# Raw (log-odds) coefficients of the classifier inside the final pipeline.
final_model.named_steps['classifier'].coef_

In [None]:
# Odds ratio for a single coefficient.
# NOTE(review): 0.362968655 looks hand-copied from the coef_ output above -
# confirm which feature it belongs to and that it is still current.
np.exp(0.362968655)