In [1]:
import pandas as pd
from zipfile import ZipFile
from sklearn.preprocessing import OneHotEncoder
from sklearn.model_selection import train_test_split
# from imblearn.over_sampling import SMOTE
from collections import Counter
import numpy as np

# Data Preparation (done from past weeks)

In [2]:
zip_file = ZipFile('../../data/raw/forStudents.pkl.zip')
df = pd.read_pickle(zip_file.open('forStudents.pkl'))
df

Unnamed: 0,acquisition_type,snapshot_type,channel,evaluation_dt,vintage,is_app_approved,vantage3_score,bad,bad_v2,bad_balance,...,rtr5930,rtr7110,rtr7140,rtr7160,rtr7216,rtr7620,rtr8120,rtr8220,rtr8320,rowNumber
0,ORGANIC,FLOW_MODULARITY,OTHER,2021-08-04,202108,1,698.0,0.0,0.0,0.0,...,2.0,4.0,4.0,9.0,5.0,100.0,5.0,5.0,5.0,59486
1,ORGANIC,FLOW_MODULARITY,OTHER,2021-03-01,202103,1,731.0,0.0,0.0,0.0,...,30.0,2.0,2.0,130.0,12.0,100.0,4.0,4.0,4.0,2357
2,ORGANIC,FLOW_MODULARITY,OTHER,2021-02-26,202102,1,,0.0,0.0,0.0,...,,,,,,,,,,90398
3,ORGANIC,FLOW_MODULARITY,OTHER,2021-06-04,202106,1,715.0,0.0,0.0,0.0,...,249.0,18.0,18.0,56.0,7.0,100.0,5.0,151.0,45.0,104654
4,ORGANIC,FLOW_MODULARITY,OTHER,2021-07-14,202107,1,,0.0,0.0,0.0,...,,,,,,,,,,89292
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48707,ORGANIC,FLOW_MODULARITY,OTHER,2021-05-01,202105,1,732.0,0.0,0.0,0.0,...,,,,,0.0,,,,,131881
48708,ORGANIC,FLOW_MODULARITY,OTHER,2020-11-29,202011,1,705.0,0.0,0.0,0.0,...,,,,,0.0,,,,,47197
48709,ORGANIC,FLOW_MODULARITY,OTHER,2021-04-25,202104,1,722.0,0.0,0.0,0.0,...,0.0,0.0,0.0,,0.0,,2.0,167.0,86.0,29739
48710,ORGANIC,FLOW_MODULARITY,OTHER,2021-04-10,202104,1,,0.0,0.0,0.0,...,,,,,,,,,,89207


In [3]:
# Clean and imputation

df_clean = df[df['all0000'].notna()]
df_clean = df_clean[df_clean['bad'] != -1.0]
df_clean = df_clean[df_clean['is_app_approved'] == 1]
df_clean.drop(columns=['bad_v2','evaluation_dt','vintage', "vantage3_score", "bad_balance", "current_balance", "net_spend"], inplace=True)
described = df_clean["annual_income"].describe()
quartiles = list(described.iloc[4:7])

In [4]:
def assign_quartile(row):
    income = row["annual_income"]
    if income < quartiles[0]:
        return 0
    elif ((quartiles[0] < income) & (quartiles[1] >= income)):
        return 1
    elif ((quartiles[1] < income) & (quartiles[2] >= income)):
        return 2
    else:
        return 3

In [5]:
df_clean.apply(assign_quartile, axis=1).value_counts().sort_index()

0    10625
1    10546
2    10709
3    10667
dtype: int64

In [6]:
df_clean['income_quartile'] = df_clean.apply(assign_quartile, axis=1)

In [7]:
dfs = []
for i in range(4):
    quartile = df_clean.loc[df_clean["income_quartile"] == i]
    dfs.append(quartile.fillna(quartile.mean()))
    
df_clean = pd.concat(dfs)

temp = df_clean.isna().sum().to_frame()
temp[temp[0] != 0]

df_clean.drop(columns=['all9230', 'all9240', 'all9249', 'all9280'], inplace=True)
df_clean = df_clean.fillna(df_clean.mean())

  dfs.append(quartile.fillna(quartile.mean()))
  df_clean = df_clean.fillna(df_clean.mean())


In [8]:
# Categorical

def process_categorical_data(dataset):
    """ One hot encodes all of the categorial columns of the dataset.  Removes the original columns """
    # select categorical data
    categorical_data = dataset.select_dtypes('object')
    
    enc = OneHotEncoder(handle_unknown='ignore')
    
    ohe_data = enc.fit_transform(categorical_data).toarray()
    column_names = enc.get_feature_names(categorical_data.columns)
    
    add = dataset.select_dtypes(exclude='object')
    added = pd.DataFrame(ohe_data,columns=column_names).astype(int)
    
    add.reset_index(drop = True, inplace= True)
    added.reset_index(drop = True, inplace= True)
    X = pd.concat([add, 
               added], axis=1)
    print(X.shape)
    return X

df_clean = process_categorical_data(df_clean)
df_clean

(42547, 887)




Unnamed: 0,is_app_approved,bad,annual_income,all0000,all0060,all0061,all0101,all0200,all0201,all0206,...,state_code_SD,state_code_TN,state_code_TX,state_code_UT,state_code_VA,state_code_VT,state_code_WA,state_code_WI,state_code_WV,state_code_WY
0,1,0.0,15171,10.0,8.000000,8.000000,0.0,9.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
1,1,0.0,20000,3.0,4.165954,5.237061,0.0,3.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,1,0.0,7000,6.0,4.165954,5.237061,0.0,4.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,1,1.0,18000,3.0,1.000000,1.000000,0.0,2.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,1,0.0,30000,6.0,4.165954,5.237061,0.0,2.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42542,1,0.0,125000,12.0,9.305257,11.139763,0.0,11.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
42543,1,0.0,101000,60.0,21.000000,11.139763,0.0,43.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
42544,1,0.0,145000,53.0,9.305257,11.139763,0.0,52.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
42545,1,0.0,72000,7.0,9.305257,11.139763,0.0,7.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0


# Week 4-6 starts here:

### Brainstorm with your team some ideas of new features you can create.

We are going to attempt to reduce the dimensionality of our features. PCA can remove correlated features, reduce overfitting, and thus improve the accuracy of our ML model.

In [9]:
df_clean

Unnamed: 0,is_app_approved,bad,annual_income,all0000,all0060,all0061,all0101,all0200,all0201,all0206,...,state_code_SD,state_code_TN,state_code_TX,state_code_UT,state_code_VA,state_code_VT,state_code_WA,state_code_WI,state_code_WV,state_code_WY
0,1,0.0,15171,10.0,8.000000,8.000000,0.0,9.0,0.0,1.0,...,0,0,0,0,0,0,0,0,0,0
1,1,0.0,20000,3.0,4.165954,5.237061,0.0,3.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
2,1,0.0,7000,6.0,4.165954,5.237061,0.0,4.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
3,1,1.0,18000,3.0,1.000000,1.000000,0.0,2.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
4,1,0.0,30000,6.0,4.165954,5.237061,0.0,2.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42542,1,0.0,125000,12.0,9.305257,11.139763,0.0,11.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
42543,1,0.0,101000,60.0,21.000000,11.139763,0.0,43.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
42544,1,0.0,145000,53.0,9.305257,11.139763,0.0,52.0,0.0,0.0,...,0,0,0,0,0,0,0,0,0,0
42545,1,0.0,72000,7.0,9.305257,11.139763,0.0,7.0,0.0,0.0,...,0,0,0,0,0,0,1,0,0,0


# Sampling need fix!!

In [10]:
# balancing df clean using undersampling method
df_clean_0 = df_clean[df_clean['bad'] == 0.0]
df_clean_1 = df_clean[df_clean['bad'] == 1.0]

df_clean_0_sample = df_clean_0.sample(n=len(df_clean_1), replace=False, random_state = 42)
df_clean_balanced = pd.concat([df_clean_0_sample, df_clean_1])

len(df_clean_balanced) # should be 3693 * 2 = 7386

7386

In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

In [12]:
def model_evaluator(data):
    X = data.drop('bad', axis=1)
    y = data['bad'].astype(int)
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

    clf = LogisticRegression(penalty='none').fit(X_train,y_train)
    y_pred =clf.predict(X_test)
    
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    
    X_test['predicted_bad'] = y_pred
    return clf
#     print("Dollar Loss: " + str(X_test[(y_test == 1) & (y_test == y_pred)]['bad_balance'].sum().round(2)))

In [13]:
clf = model_evaluator(df_clean_balanced)

[[510 604]
 [320 782]]
              precision    recall  f1-score   support

           0       0.61      0.46      0.52      1114
           1       0.56      0.71      0.63      1102

    accuracy                           0.58      2216
   macro avg       0.59      0.58      0.58      2216
weighted avg       0.59      0.58      0.58      2216



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [14]:
X = df_clean_balanced.drop('bad', axis=1)
y = df_clean_balanced['bad'].astype(int)

In [15]:
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
scaler.fit(X)
scaled_data = pd.DataFrame(scaler.transform(X))

In [16]:
scaled_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,876,877,878,879,880,881,882,883,884,885
0,0.0,-0.063132,-0.271052,-0.450030,-0.565297,-0.094785,-0.392699,-0.02716,-0.22507,-0.203263,...,-0.04034,-0.1328,4.098486,-0.072858,-0.158946,-0.053398,-0.152097,-0.130672,-0.088965,-0.04034
1,0.0,-0.057225,-0.723091,-0.450030,-0.565297,-0.094785,-0.682158,-0.02716,-0.22507,-0.607925,...,-0.04034,-0.1328,-0.243993,-0.072858,-0.158946,-0.053398,-0.152097,-0.130672,-0.088965,-0.04034
2,0.0,-0.026283,-0.271052,-1.298225,0.233685,-0.094785,-0.199727,-0.02716,-0.22507,0.066512,...,-0.04034,-0.1328,-0.243993,-0.072858,-0.158946,-0.053398,-0.152097,-0.130672,-0.088965,-0.04034
3,0.0,-0.026521,0.271395,-0.483818,0.233685,-0.094785,0.282705,-0.02716,-0.22507,0.740950,...,-0.04034,-0.1328,-0.243993,-0.072858,-0.158946,-0.053398,-0.152097,-0.130672,-0.088965,-0.04034
4,0.0,-0.034613,-0.451868,-0.280217,-0.616574,-0.094785,-0.392699,-0.02716,-0.22507,-0.338150,...,-0.04034,-0.1328,-0.243993,-0.072858,-0.158946,-0.053398,-0.152097,-0.130672,-0.088965,-0.04034
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7381,0.0,0.027272,0.361803,1.144995,0.897553,-0.094785,0.379192,-0.02716,-0.22507,0.201400,...,-0.04034,-0.1328,-0.243993,-0.072858,-0.158946,-0.053398,-0.152097,-0.130672,-0.088965,-0.04034
7382,0.0,-0.002480,0.542618,0.126987,-0.183967,-0.094785,-0.585672,-0.02716,-0.22507,-0.607925,...,-0.04034,-0.1328,-0.243993,-0.072858,-0.158946,-0.053398,-0.152097,-0.130672,-0.088965,-0.04034
7383,0.0,0.033223,1.265881,0.534190,0.248641,-0.094785,1.440542,-0.02716,-0.22507,0.606062,...,-0.04034,-0.1328,-0.243993,-0.072858,-0.158946,-0.053398,-0.152097,-0.130672,-0.088965,-0.04034
7384,0.0,-0.009621,0.180987,-0.076615,-0.400271,-0.094785,0.282705,-0.02716,-0.22507,0.336287,...,-0.04034,-0.1328,-0.243993,-0.072858,-0.158946,-0.053398,-0.152097,-0.130672,-0.088965,-0.04034


In [17]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
pca = PCA(n_components=50)
pca.fit(scaled_data)
transformed_data = pca.fit_transform(scaled_data)
pca_data = pd.DataFrame(transformed_data)

In [18]:
pca_data

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,40,41,42,43,44,45,46,47,48,49
0,-7.177256,-4.512930,4.265165,2.081165,-2.684958,-1.721772,-2.413138,4.802520,-0.447681,-1.828036,...,0.584184,-0.647407,0.734363,0.561655,-1.604577,0.729487,1.269672,-0.031936,0.390205,-0.828926
1,-8.489437,-0.076108,4.600491,1.496669,1.943937,-1.350314,-0.997756,3.757003,-1.257618,-3.831429,...,0.161229,1.107679,-0.032998,-0.645292,1.840760,0.237730,0.090207,-1.008393,-0.174749,0.280057
2,-4.528945,2.689593,-1.525833,3.402563,-10.939202,32.022132,1.318531,4.352373,-2.980230,-0.087705,...,2.652145,-2.559193,5.476243,-0.239666,0.375098,-7.615288,-9.577796,-3.668627,3.270515,0.874253
3,8.412768,13.358951,-0.496734,0.207961,2.105098,0.205439,3.957848,-4.745022,-5.984092,-5.472748,...,-1.819269,2.725941,-0.067058,1.111605,-0.678923,-3.826699,-1.158665,-3.198363,-1.007723,-0.963541
4,-3.332479,6.545506,1.212955,-3.897411,-0.315443,0.101068,-5.376314,-5.889094,-0.887446,-4.343347,...,1.482794,-2.041133,0.552212,-0.708587,-0.391758,0.189662,-0.343504,-0.131552,1.229187,-0.067426
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7381,8.550572,-4.179188,0.556905,-4.050597,13.125279,0.769349,-1.940271,3.237761,-2.062023,1.792650,...,-0.270615,2.504851,-4.960976,-1.506585,-2.198473,-2.373632,-0.227402,1.813594,2.183643,-0.183170
7382,-0.361224,-1.344797,-5.767633,-4.212040,-3.160669,-1.235665,-1.471944,0.741604,0.056704,1.521034,...,-1.576923,-2.869537,-1.696536,-1.745816,0.611635,-4.716768,0.829881,-2.606704,2.565391,0.585993
7383,8.440756,12.255782,-7.983140,-3.144338,-4.322581,-4.804491,0.112822,3.336193,-5.525677,-1.816095,...,-1.041651,1.397407,1.381004,0.427156,-0.267897,1.329487,2.803829,0.957656,0.747145,0.680013
7384,8.333123,-1.642463,-5.931626,-3.075135,6.921081,2.412136,-1.667403,-0.803675,0.836682,4.359406,...,0.141114,0.572902,-0.436723,0.653512,-0.562294,4.213039,-0.320758,-0.953271,3.096633,0.230437


In [19]:
def pca_model(new_x, new_y):
    X_train, X_test, y_train, y_test = train_test_split(new_x, new_y, test_size=0.3, random_state=42)

    clf = LogisticRegression(penalty='none').fit(X_train,y_train)
#     clf = RandomForestClassifier(max_depth = 15, random_state=42).fit(X_train, y_train)
    y_pred =clf.predict(X_test)
    
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    
    X_test['predicted_bad'] = y_pred
#     print("Dollar Loss: " + str(X_test[(y_test == 1) & (y_test == y_pred)]['bad_balance'].sum().round(2)))

In [20]:
pca_model(pca_data, y)

[[680 434]
 [394 708]]
              precision    recall  f1-score   support

           0       0.63      0.61      0.62      1114
           1       0.62      0.64      0.63      1102

    accuracy                           0.63      2216
   macro avg       0.63      0.63      0.63      2216
weighted avg       0.63      0.63      0.63      2216



In [21]:
# accuracy went from 0.58 to 0.62 after pca! nice! also, f1-score for the 0 class improved.

In [22]:
# TODO: balance df_clean
df_clean_0 = df_clean[df_clean['bad'] == 0.0]
df_clean_1 = df_clean[df_clean['bad'] == 1.0]

df_clean_0_sample = df_clean_0.sample(n=len(df_clean_1), replace=False, random_state = 42)
df_clean_balanced = pd.concat([df_clean_0_sample, df_clean_1])

len(df_clean_balanced) # should be 3693 * 2 = 7386



7386

In [23]:
X = df_clean_balanced.drop('bad', axis=1)
y = df_clean_balanced['bad'].astype(int)

In [24]:
# X = X.drop('vantage3_score', axis=1).drop('bad_balance', axis=1).drop('is_app_approved',axis=1)

In [25]:

top_col_idx = abs(clf.coef_[0]).argsort()[-50:]
top_col_idx

array([554, 646, 303, 465, 498, 508, 144, 166, 165, 716, 145, 304, 521,
       805, 642, 509, 202, 468, 441, 718, 466, 810, 809, 438, 160, 693,
       520, 548, 274, 161, 311, 644, 690, 328, 640, 691, 649, 162, 676,
       717, 697, 715, 714, 272, 167, 692, 757, 648, 271, 694])

In [26]:
def model_evaluator(data):
    X = data.drop('bad', axis=1)
    y = data['bad'].astype(int)
        
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    clf = LogisticRegression(penalty='none').fit(X_train,y_train)
    y_pred =clf.predict(X_test)
    
    print(confusion_matrix(y_test, y_pred))
    print(classification_report(y_test, y_pred))
    
    X_test['predicted_bad'] = y_pred
#     print("Dollar Loss: " + str(X_test[(y_test == 1) & (y_test == y_pred)]['bad_balance'].sum().round(2)))

In [27]:
top50 = X.iloc[:,top_col_idx]
top50['bad'] = y

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top50['bad'] = y


In [28]:
# top50 = X.iloc[:,scores.argsort()[:1]]
# top50['bad'] = y


In [29]:
model_evaluator(top50)

[[550 588]
 [350 728]]
              precision    recall  f1-score   support

           0       0.61      0.48      0.54      1138
           1       0.55      0.68      0.61      1078

    accuracy                           0.58      2216
   macro avg       0.58      0.58      0.57      2216
weighted avg       0.58      0.58      0.57      2216



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## chi-sq select k best

In [30]:
from sklearn.feature_selection import SelectKBest, chi2

In [31]:
from sklearn.preprocessing import MinMaxScaler

scaler = MinMaxScaler()

X_normalized = scaler.fit_transform(X)

In [32]:
X_new = pd.DataFrame(SelectKBest(chi2, k=50).fit_transform(X_normalized, y))
X_new['bad'] = y

model_evaluator(top50)

[[598 543]
 [360 715]]
              precision    recall  f1-score   support

           0       0.62      0.52      0.57      1141
           1       0.57      0.67      0.61      1075

    accuracy                           0.59      2216
   macro avg       0.60      0.59      0.59      2216
weighted avg       0.60      0.59      0.59      2216



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


What are your top 50 features?  Top 10 features?

In [33]:
# Top 50 features with feature importance

X.iloc[:, top_col_idx]

Unnamed: 0,iln5220,bcc5422,mta5020,stu5123,rec5120,fip5020,all5020,all5830,all5825,reh5120,...,brc5930,reh5030,reh5020,use5320,all5935,brc5038,rev5036,bcc5520,use5030,brc5620
6754,325.000000,1700.000000,76625.021127,7926.483675,10429.937500,2568.024286,15294.0,385.0,385.0,669.0,...,249.0,778.0,778.0,10793.749338,574.0,109.000000,451.636409,2383.00000,2108.783641,13522.0
1983,176.017635,3800.000000,76625.021127,7926.483675,10429.937500,2568.024286,809.0,108.0,108.0,374.0,...,28.0,789.0,789.0,10793.749338,48.0,51.000000,451.636409,1789.00000,2108.783641,4577.0
25807,1.000000,700.000000,141414.140673,14800.060567,13823.402985,10.000000,474.0,61.0,61.0,236.0,...,231.0,464.0,464.0,8201.291358,232.0,0.000000,464.000000,500.00000,1992.075506,6236.0
25211,745.000000,500.000000,141414.140673,14800.060567,13823.402985,4620.372275,41179.0,912.0,912.0,722.0,...,430.0,2090.0,2090.0,8201.291358,1175.0,1017.000000,884.985469,550.00000,1992.075506,1210.0
23604,25.000000,500.000000,141414.140673,14800.060567,13823.402985,4620.372275,630.0,169.0,169.0,344.0,...,0.0,344.0,344.0,8201.291358,144.0,1988.472697,884.985469,400.00000,1992.075506,856.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
42477,469.928300,2500.000000,210748.620051,22617.714663,18401.614815,6415.659231,16164.0,363.0,363.0,9746.0,...,200.0,16164.0,16164.0,10550.587071,200.0,9746.000000,642.000000,4175.00000,2480.986044,1024.0
42490,384.000000,2328.240767,210748.620051,9564.000000,18401.614815,380.000000,20201.0,427.0,427.0,0.0,...,0.0,0.0,0.0,10550.587071,434.0,2793.698409,0.000000,2147.05802,2480.986044,669.0
42494,600.000000,500.000000,210748.620051,22617.714663,18401.614815,13166.000000,34576.0,1398.0,1398.0,3.0,...,602.0,3.0,3.0,10550.587071,3078.0,3.000000,0.000000,375.00000,2480.986044,2027.0
42520,570.000000,500.000000,210748.620051,22617.714663,18401.614815,6415.659231,46134.0,1323.0,1323.0,6788.0,...,2588.0,8480.0,8480.0,10550.587071,3628.0,7675.000000,1219.406468,2613.00000,2480.986044,1970.0


In [34]:
# Top 10 features
X.iloc[:, top_col_idx[-10:]]

Unnamed: 0,brc5930,reh5030,reh5020,use5320,all5935,brc5038,rev5036,bcc5520,use5030,brc5620
6754,249.0,778.0,778.0,10793.749338,574.0,109.000000,451.636409,2383.00000,2108.783641,13522.0
1983,28.0,789.0,789.0,10793.749338,48.0,51.000000,451.636409,1789.00000,2108.783641,4577.0
25807,231.0,464.0,464.0,8201.291358,232.0,0.000000,464.000000,500.00000,1992.075506,6236.0
25211,430.0,2090.0,2090.0,8201.291358,1175.0,1017.000000,884.985469,550.00000,1992.075506,1210.0
23604,0.0,344.0,344.0,8201.291358,144.0,1988.472697,884.985469,400.00000,1992.075506,856.0
...,...,...,...,...,...,...,...,...,...,...
42477,200.0,16164.0,16164.0,10550.587071,200.0,9746.000000,642.000000,4175.00000,2480.986044,1024.0
42490,0.0,0.0,0.0,10550.587071,434.0,2793.698409,0.000000,2147.05802,2480.986044,669.0
42494,602.0,3.0,3.0,10550.587071,3078.0,3.000000,0.000000,375.00000,2480.986044,2027.0
42520,2588.0,8480.0,8480.0,10550.587071,3628.0,7675.000000,1219.406468,2613.00000,2480.986044,1970.0


In [35]:
model_evaluator(top50)

[[581 564]
 [321 750]]
              precision    recall  f1-score   support

           0       0.64      0.51      0.57      1145
           1       0.57      0.70      0.63      1071

    accuracy                           0.60      2216
   macro avg       0.61      0.60      0.60      2216
weighted avg       0.61      0.60      0.60      2216



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [36]:
top10 = X.iloc[:,top_col_idx[-10:]]
top10['bad'] = y

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  top10['bad'] = y


In [37]:
top10

Unnamed: 0,brc5930,reh5030,reh5020,use5320,all5935,brc5038,rev5036,bcc5520,use5030,brc5620,bad
6754,249.0,778.0,778.0,10793.749338,574.0,109.000000,451.636409,2383.00000,2108.783641,13522.0,0
1983,28.0,789.0,789.0,10793.749338,48.0,51.000000,451.636409,1789.00000,2108.783641,4577.0,0
25807,231.0,464.0,464.0,8201.291358,232.0,0.000000,464.000000,500.00000,1992.075506,6236.0,0
25211,430.0,2090.0,2090.0,8201.291358,1175.0,1017.000000,884.985469,550.00000,1992.075506,1210.0,0
23604,0.0,344.0,344.0,8201.291358,144.0,1988.472697,884.985469,400.00000,1992.075506,856.0,0
...,...,...,...,...,...,...,...,...,...,...,...
42477,200.0,16164.0,16164.0,10550.587071,200.0,9746.000000,642.000000,4175.00000,2480.986044,1024.0,1
42490,0.0,0.0,0.0,10550.587071,434.0,2793.698409,0.000000,2147.05802,2480.986044,669.0,1
42494,602.0,3.0,3.0,10550.587071,3078.0,3.000000,0.000000,375.00000,2480.986044,2027.0,1
42520,2588.0,8480.0,8480.0,10550.587071,3628.0,7675.000000,1219.406468,2613.00000,2480.986044,1970.0,1


In [38]:
model_evaluator(top10)

[[536 568]
 [330 782]]
              precision    recall  f1-score   support

           0       0.62      0.49      0.54      1104
           1       0.58      0.70      0.64      1112

    accuracy                           0.59      2216
   macro avg       0.60      0.59      0.59      2216
weighted avg       0.60      0.59      0.59      2216



Features we dropped include dates, and features like bad_balance and vantage3_score which have direct relation to our outcome variable bad