In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

pd.set_option('max_columns', None)

In [2]:
train = pd.read_csv('train_fNxu4vz.csv')
test = pd.read_csv('test_fjtUOL8.csv')
sample_sub = pd.read_csv('sample_submission_HSqiq1Q.csv')

train.head()

Unnamed: 0,Loan_ID,Loan_Amount_Requested,Length_Employed,Home_Owner,Annual_Income,Income_Verified,Purpose_Of_Loan,Debt_To_Income,Inquiries_Last_6Mo,Months_Since_Deliquency,Number_Open_Accounts,Total_Accounts,Gender,Interest_Rate
0,10000001,7000,< 1 year,Rent,68000.0,not verified,car,18.37,0,,9,14,Female,1
1,10000002,30000,4 years,Mortgage,,VERIFIED - income,debt_consolidation,14.93,0,17.0,12,24,Female,3
2,10000003,24725,7 years,Mortgage,75566.4,VERIFIED - income source,debt_consolidation,15.88,0,,12,16,Male,3
3,10000004,16000,< 1 year,,56160.0,VERIFIED - income source,debt_consolidation,14.34,3,,16,22,Male,3
4,10000005,17000,8 years,Own,96000.0,VERIFIED - income source,debt_consolidation,22.17,1,,19,30,Female,1


In [3]:
train.shape

(164309, 14)

In [4]:
df = pd.concat([train, test], ignore_index=True)
df.shape

(273850, 14)

In [5]:
df['Loan_Amount_Requested'] = df['Loan_Amount_Requested'].str.replace(',', '')

In [6]:
pattern = r'[year | years | + | <]'
df['Length_Employed'] = df['Length_Employed'].str.replace(pattern, '')   

In [7]:
df['Months_Since_Deliquency'] = df['Months_Since_Deliquency'].fillna(0)

In [8]:
df['Length_Employed'] = df['Length_Employed'].fillna(df['Length_Employed'].median())
df['Home_Owner'] = df['Home_Owner'].fillna('None')
df['Annual_Income'] = df['Annual_Income'].fillna(df['Annual_Income'].median())

In [9]:
cols_to_dum = ['Home_Owner', 'Income_Verified', 'Purpose_Of_Loan', 'Gender']

In [10]:
# Feature Engineering
df['Loan_Amount_Requested'] = df['Loan_Amount_Requested'].astype('int32')
df['Length_Employed'] = df['Length_Employed'].astype('int32')
df['Debt_per_income'] = df['Annual_Income'] / df['Debt_To_Income']
df['Number_Closed_Account'] = df['Total_Accounts'] - df['Number_Open_Accounts']
df['per_month_deliquency'] = df['Inquiries_Last_6Mo'] / 6
df['Income_per_Length_Employed'] = df['Annual_Income'] / df['Length_Employed']
df['Income_per_Amount_Requested'] = df['Annual_Income'] / df['Loan_Amount_Requested']

In [11]:
aggs = {}
aggs['Loan_Amount_Requested'] = ['std', 'mean', 'median']
aggs['Annual_Income'] = ['std', 'mean', 'median']
aggs['Debt_To_Income'] = ['std', 'mean', 'median']
agg_trans = df.groupby(['Purpose_Of_Loan']).agg(aggs)
agg_trans.columns = ['_'.join(col).strip() for col in agg_trans.columns.values]
agg_trans.reset_index(inplace=True)

df = pd.merge(df, agg_trans, how='left', on='Purpose_Of_Loan')
df.head()

Unnamed: 0,Loan_ID,Loan_Amount_Requested,Length_Employed,Home_Owner,Annual_Income,Income_Verified,Purpose_Of_Loan,Debt_To_Income,Inquiries_Last_6Mo,Months_Since_Deliquency,Number_Open_Accounts,Total_Accounts,Gender,Interest_Rate,Debt_per_income,Number_Closed_Account,per_month_deliquency,Income_per_Length_Employed,Income_per_Amount_Requested,Loan_Amount_Requested_std,Loan_Amount_Requested_mean,Loan_Amount_Requested_median,Annual_Income_std,Annual_Income_mean,Annual_Income_median,Debt_To_Income_std,Debt_To_Income_mean,Debt_To_Income_median
0,10000001,7000,1,Rent,68000.0,not verified,car,18.37,0,0.0,9,14,Female,1.0,3701.687534,5,0.0,68000.0,9.714286,5637.956226,8172.483328,6400.0,46740.763244,65268.604424,63000.0,8.130674,13.257323,12.3
1,10000002,30000,4,Mortgage,63000.0,VERIFIED - income,debt_consolidation,14.93,0,17.0,12,24,Female,3.0,4219.691896,12,0.0,15750.0,2.1,8163.834063,15153.691536,14000.0,52214.754799,71157.145773,63000.0,7.676983,17.90247,17.56
2,10000003,24725,7,Mortgage,75566.4,VERIFIED - income source,debt_consolidation,15.88,0,0.0,12,16,Male,3.0,4758.589421,4,0.0,10795.2,3.056275,8163.834063,15153.691536,14000.0,52214.754799,71157.145773,63000.0,7.676983,17.90247,17.56
3,10000004,16000,1,,56160.0,VERIFIED - income source,debt_consolidation,14.34,3,0.0,16,22,Male,3.0,3916.317992,6,0.5,56160.0,3.51,8163.834063,15153.691536,14000.0,52214.754799,71157.145773,63000.0,7.676983,17.90247,17.56
4,10000005,17000,8,Own,96000.0,VERIFIED - income source,debt_consolidation,22.17,1,0.0,19,30,Female,1.0,4330.175913,11,0.166667,12000.0,5.647059,8163.834063,15153.691536,14000.0,52214.754799,71157.145773,63000.0,7.676983,17.90247,17.56


In [12]:
df = df.replace({np.inf: 0})

In [13]:
df.columns

Index(['Loan_ID', 'Loan_Amount_Requested', 'Length_Employed', 'Home_Owner',
       'Annual_Income', 'Income_Verified', 'Purpose_Of_Loan', 'Debt_To_Income',
       'Inquiries_Last_6Mo', 'Months_Since_Deliquency', 'Number_Open_Accounts',
       'Total_Accounts', 'Gender', 'Interest_Rate', 'Debt_per_income',
       'Number_Closed_Account', 'per_month_deliquency',
       'Income_per_Length_Employed', 'Income_per_Amount_Requested',
       'Loan_Amount_Requested_std', 'Loan_Amount_Requested_mean',
       'Loan_Amount_Requested_median', 'Annual_Income_std',
       'Annual_Income_mean', 'Annual_Income_median', 'Debt_To_Income_std',
       'Debt_To_Income_mean', 'Debt_To_Income_median'],
      dtype='object')

In [14]:
from sklearn.preprocessing import LabelEncoder
for col in df.columns.values:
    if df[col].dtype == 'object':
        le = LabelEncoder()
        df[col] = le.fit_transform(df[col])

In [56]:
df_dum = pd.get_dummies(data=df, columns=cols_to_dum, drop_first=True, sparse=True).drop(['Loan_ID'], axis=1)
df_dum.shape

(273850, 43)

In [57]:
train_df = df_dum.iloc[:train.shape[0] ]
test_df = df_dum.iloc[train.shape[0]: ]
print(train_df.shape)
print(test_df.shape)

(164309, 43)
(109541, 43)


In [58]:
test_df.drop('Interest_Rate', axis=1, inplace=True)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  errors=errors,


In [59]:
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
mm = StandardScaler()
train_df['Interest_Rate'] = train_df['Interest_Rate'].astype('int32')
target = train_df.loc[:, 'Interest_Rate'].copy()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """


In [60]:
X = train_df.drop('Interest_Rate', axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, target, test_size=.2, random_state=20201)

X_train = mm.fit_transform(X_train)
X_test = mm.transform(X_test)
test_v = mm.transform(test_df)

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier

In [61]:
lr = LogisticRegression(max_iter=1000, random_state=21)
lr.fit(X_train, y_train)
lr_pred = lr.predict(X_test)
print('F1_Score: ', f1_score(y_test, lr_pred, labels=[1,2,3], average='weighted'))

F1_Score:  0.4936523430806539


In [22]:
rf = RandomForestClassifier()
rf.fit(X_train, y_train)
rf_pred = rf.predict(X_test)
print('F1_Score: ', f1_score(y_test, rf_pred, labels=[1,2,3], average='weighted'))

F1_Score:  0.5072983432240106


In [23]:
grb = GradientBoostingClassifier()
grb.fit(X_train, y_train)
grb_pred = grb.predict(X_test)
print('F1_Score: ', f1_score(y_test, grb_pred, labels=[1,2,3], average='weighted'))

F1_Score:  0.5213973685625576


In [78]:
from catboost import CatBoostClassifier, Pool

eval_dataset = Pool(X_test, y_test)

cats_model = CatBoostClassifier(learning_rate=0.0322, iterations=5000,
                                loss_function='MultiClass')

cats_model.fit(X_train, y_train, eval_set=eval_dataset, verbose=500, early_stopping_rounds=200)

0:	learn: 1.0918605	test: 1.0919344	best: 1.0919344 (0)	total: 133ms	remaining: 11m 5s
500:	learn: 0.9142956	test: 0.9257812	best: 0.9257812 (500)	total: 57.2s	remaining: 8m 33s
1000:	learn: 0.9010013	test: 0.9214956	best: 0.9214941 (999)	total: 1m 52s	remaining: 7m 29s
1500:	learn: 0.8911913	test: 0.9199591	best: 0.9199548 (1497)	total: 2m 48s	remaining: 6m 31s
2000:	learn: 0.8823872	test: 0.9194023	best: 0.9193347 (1948)	total: 3m 43s	remaining: 5m 35s
Stopped by overfitting detector  (200 iterations wait)

bestTest = 0.9191749459
bestIteration = 2213

Shrink model to first 2214 iterations.


<catboost.core.CatBoostClassifier at 0x2ba067f4e88>

In [79]:
cats_pred = cats_model.predict(X_test)
print('F1_Score: ', f1_score(y_test, cats_pred, labels=[1,2,3], average='weighted'))

F1_Score:  0.5310627628107445


In [26]:
# testing new models
from xgboost import XGBClassifier
clf = XGBClassifier(base_score=0.7,n_estimators=400,learning_rate=0.01,subsample=0.8)
eval_set  = [(X_train,y_train), (X_test,y_test)]
#.900057
clf.fit(X_train, y_train, eval_set=eval_set,eval_metric="merror", early_stopping_rounds=200,
                    verbose=500)

[0]	validation_0-merror:0.504819	validation_1-merror:0.507668
Multiple eval metrics have been passed: 'validation_1-merror' will be used for early stopping.

Will train until validation_1-merror hasn't improved in 200 rounds.


KeyboardInterrupt: 

In [None]:
lgbm_pred = clf.predict(X_test)
print('F1_Score: ', f1_score(y_test, lgbm_pred, labels=[1,2,3], average='weighted'))

In [69]:
from lightgbm import LGBMClassifier
lgbm = LGBMClassifier(learning_rate=0.01, n_estimators=3000, num_leaves=71)

lgbm.fit(X_train, y_train,eval_set=[(X_train,y_train), (X_test, y_test)],
         early_stopping_rounds=100, verbose=500)

Training until validation scores don't improve for 100 rounds
[500]	training's multi_logloss: 0.921342	valid_1's multi_logloss: 0.933998
[1000]	training's multi_logloss: 0.901231	valid_1's multi_logloss: 0.9237
[1500]	training's multi_logloss: 0.888365	valid_1's multi_logloss: 0.920915
[2000]	training's multi_logloss: 0.877493	valid_1's multi_logloss: 0.919398
[2500]	training's multi_logloss: 0.868024	valid_1's multi_logloss: 0.91892
Early stopping, best iteration is:
[2761]	training's multi_logloss: 0.863276	valid_1's multi_logloss: 0.918779


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.01, max_depth=-1,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=3000, n_jobs=-1, num_leaves=31, objective=None,
               random_state=None, reg_alpha=0.0, reg_lambda=0.0, silent=True,
               subsample=1.0, subsample_for_bin=200000, subsample_freq=0)

In [70]:
lgbm_pred = lgbm.predict(X_test)
print('F1_Score: ', f1_score(y_test, lgbm_pred, labels=[1,2,3], average='weighted'))

F1_Score:  0.5329617318770021


In [54]:
from sklearn.tree import DecisionTreeClassifier
tree = DecisionTreeClassifier(max_depth=11)
tree.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=11, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')

In [55]:
lgbm_pred = tree.predict(X_test)
print('F1_Score: ', f1_score(y_test, lgbm_pred, labels=[1,2,3], average='weighted'))

F1_Score:  0.49579795918483965


In [None]:
fea_imp = pd.Series(lgbm.feature_importances_, index=X.columns, name='feature_importance')
fea_imp = fea_imp.sort_values()
fea_imp.plot(kind='barh', figsize=(20,20), title='LGBM Feature Importance')

In [None]:
sample_df = df.iloc[:100]
plt.scatter(x='Debt_To_Income', y='Annual_Income', s='Total_Accounts', data=sample_df, alpha=.5)


In [None]:
np.random.seed(44)
N = 1000
x = np.random.normal(170, 20, N)
y = x + np.random.normal(5, 25, N)
colors = np.random.rand(N)
area = (25 * np.random.rand(N)) ** 2

In [None]:
df1 = pd.DataFrame({'X': x, 'Y': y, 'Colors': colors, 'bubble_size': area})

df1.head(2)

In [None]:
plt.scatter('X', 'Y', data=df1, s='bubble_size', alpha=.5, c='Colors')
plt.grid()

In [None]:
mu, sigma = 0, 0.1 # mean and standard deviation
s = np.random.normal(mu, sigma, 1000)
count, bins, ignored = plt.hist(s, 30, density=True)
plt.plot(bins, 1/(sigma * np.sqrt(2 * np.pi)) * np.exp( - (bins - mu)**2 / (2 * sigma**2) ),  linewidth=2, color='r')
plt.show()

In [71]:

sample_sub['Interest_Rate'] = lgbm.predict(test_v)

In [72]:
sample_sub.to_csv('lgbm.csv', index=False)
print('Ready for submission')

Ready for submission
