In [5]:
import pandas as pd

In [6]:
# Read the already-encoded dataset from disk; the path is relative to the notebook's folder.
ENCODED_CSV_PATH = '../data/df_encoded.csv'
df_encoded = pd.read_csv(ENCODED_CSV_PATH)

In [7]:
# Preview the first five rows as a quick sanity check of the loaded columns.
df_encoded.head(n=5)

Unnamed: 0,Term,isNewBusiness,isFranchise,RevLineCr,LowDoc,GrAppv,SBARatio,NAICS_i_0,NAICS_i_11,NAICS_i_21,...,NAICS_i_61,NAICS_i_62,NAICS_i_71,NAICS_i_72,NAICS_i_81,NAICS_i_92,UrbanRural_0,UrbanRural_1,UrbanRural_2,Default
0,84,1.0,0,0,1,60000.0,0.8,False,False,False,...,False,False,False,False,False,False,True,False,False,0
1,60,1.0,0,0,1,40000.0,0.8,False,False,False,...,False,False,False,True,False,False,True,False,False,0
2,180,0.0,0,0,0,287000.0,0.75,False,False,False,...,False,True,False,False,False,False,True,False,False,0
3,60,0.0,0,0,1,35000.0,0.8,True,False,False,...,False,False,False,False,False,False,True,False,False,0
4,240,0.0,0,0,0,229000.0,1.0,True,False,False,...,False,False,False,False,False,False,True,False,False,0


In [8]:
# Name of the label column; every other column is used as a model input.
target = 'Default'

# Compare with `!=` rather than `not in`: `feature not in target` is a
# substring test against the string 'Default', so it would also drop any
# column whose name happens to be a substring of it (e.g. 'Def' or 'fault').
features_encoded = [feature for feature in df_encoded.columns if feature != target]

print(f"Target: {target}")
print(f"Features: {features_encoded}")

Target: Default
Features: ['Term', 'isNewBusiness', 'isFranchise', 'RevLineCr', 'LowDoc', 'GrAppv', 'SBARatio', 'NAICS_i_0', 'NAICS_i_11', 'NAICS_i_21', 'NAICS_i_22', 'NAICS_i_23', 'NAICS_i_31', 'NAICS_i_32', 'NAICS_i_33', 'NAICS_i_42', 'NAICS_i_44', 'NAICS_i_45', 'NAICS_i_48', 'NAICS_i_49', 'NAICS_i_51', 'NAICS_i_52', 'NAICS_i_53', 'NAICS_i_54', 'NAICS_i_55', 'NAICS_i_56', 'NAICS_i_61', 'NAICS_i_62', 'NAICS_i_71', 'NAICS_i_72', 'NAICS_i_81', 'NAICS_i_92', 'UrbanRural_0', 'UrbanRural_1', 'UrbanRural_2']


In [9]:
# Grid search
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_halving_search_cv # noqa
# now you can import normally from model_selection
from sklearn.model_selection import (
    GridSearchCV,
    HalvingGridSearchCV,
    RepeatedStratifiedKFold,
    train_test_split,
)

# One seed shared by the split, the CV splitter, the forest and the search so
# that re-running the cell gives the same candidates and scores.
SEED = 42

# Random-forest hyper-parameters explored by the search (2 * 3 * 2 = 12 candidates).
param_grid = [
    {
        "min_samples_split": [50, 100],
        "max_depth": [20, 30, 40],
        "n_estimators": [100, 200]
    }
]

# Hold out 10% of the rows for a final check. The split is stratified on the
# target so both parts keep the same class balance, matching the stratified CV.
X_train, X_test, y_train, y_test = train_test_split(
    df_encoded[features_encoded],
    df_encoded[target],
    train_size=.9,
    stratify=df_encoded[target],
    random_state=SEED,
)

# Let's also try some CV
# 3 folds repeated 3 times -> 9 fits per candidate per halving round.
cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=3, random_state=SEED)

# Successive halving: all candidates start on a small sample budget and only
# the best ones are re-evaluated on more data. random_state is also passed to
# the search so that any subsampling it performs is repeatable.
search = HalvingGridSearchCV(
    estimator=RandomForestClassifier(random_state=SEED),
    param_grid=param_grid,
    scoring='precision',
    cv=cv,
    random_state=SEED,
    n_jobs=-1,
    verbose=1
)

search.fit(X_train, y_train)

# X_train, X_test, y_train, y_test = train_test_split(df_encoded[features], df_encoded[target], train_size = .9)
# from sklearn.model_selection import RepeatedStratifiedKFold
# cv = RepeatedStratifiedKFold(n_splits=3, n_repeats=3)

# search = GridSearchCV(estimator=RandomForestClassifier(), 
#                       param_grid=param_grid, 
#                       scoring='accuracy', 
#                       cv=cv,
#                       n_jobs=-1)
# search.fit(X_train, y_train)

n_iterations: 3
n_required_iterations: 3
n_possible_iterations: 3
min_resources_: 87051
max_resources_: 783462
aggressive_elimination: False
factor: 3
----------
iter: 0
n_candidates: 12
n_resources: 87051
Fitting 9 folds for each of 12 candidates, totalling 108 fits
----------
iter: 1
n_candidates: 4
n_resources: 261153
Fitting 9 folds for each of 4 candidates, totalling 36 fits
----------
iter: 2
n_candidates: 2
n_resources: 783459
Fitting 9 folds for each of 2 candidates, totalling 18 fits


In [None]:
# Tabulate the search results, best-ranked candidates first.
results_df = pd.DataFrame(search.cv_results_).sort_values(by="rank_test_score")

# Label each row with its parameter values joined by "_". The index axis is
# named after the parameter names in the same order (it was previously called
# "kernel", which does not describe a random-forest grid).
param_values = results_df["params"].apply(
    lambda params: "_".join(str(val) for val in params.values())
)
param_names = "_".join(results_df["params"].iloc[0])
results_df = results_df.set_index(param_values).rename_axis(param_names)

# Successive halving evaluates each round on a different amount of data, so
# show the round and its sample budget next to the scores: rows from different
# rounds are not directly comparable.
results_df[
    ["iter", "n_resources", "params", "rank_test_score", "mean_test_score", "std_test_score"]
]

In [None]:
# accuracy_score was used without ever being imported, which raises NameError
# on a fresh kernel; import it next to its only use.
from sklearn.metrics import accuracy_score

# Score the best model found by the search on the held-out split.
yhat_test = search.best_estimator_.predict(X_test)
print(y_test.shape)
# scikit-learn metrics take (y_true, y_pred) in that order.
print(accuracy_score(y_test, yhat_test))

In [None]:
# Share of rows with Default == 1 (the baseline rate to judge the scores against).
# The original referenced `df`, which is not defined in this notebook; the
# loaded frame is `df_encoded`.
(df_encoded[target] == 1).mean()

In [None]:
from sklearn.metrics import confusion_matrix

# Confusion matrix of the test-split predictions (rows: actual, columns: predicted).
cm = confusion_matrix(y_true=y_test, y_pred=yhat_test)

# Show the header and the raw counts on separate lines.
print("Confusion Matrix:", cm, sep="\n")

In [None]:
# Predict on the training split with the best model found by the search.
best_rf = search.best_estimator_
yhat_train = best_rf.predict(X_train)

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Calculate the confusion matrix on the training split.
# The class order is pinned explicitly: row/column 0 is Default == 0 and
# row/column 1 is Default == 1, so the tick labels below cannot drift from
# the matrix layout.
class_order = [0, 1]
cm = confusion_matrix(y_train, yhat_train, labels=class_order)

# Create a heatmap using seaborn.
# Tick labels follow class_order: 'no' = did not default (0), 'yes' = defaulted (1).
# They were previously listed as ['yes', 'no'], i.e. attached to the wrong classes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=['no', 'yes'], yticklabels=['no', 'yes'], ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()


In [None]:
# More estimators appear to give better results, so fit a larger forest.
# This model is trained on the full dataset (no held-out split).
# The original indexed with `features`, which is not defined in this notebook;
# the feature list built earlier is `features_encoded`.
rfmodel = RandomForestClassifier(
    n_estimators=500,
    max_depth=50,
    min_samples_split=50,
    random_state=42,  # fixed seed so the fitted forest is reproducible
    n_jobs=-1,
    verbose=1
).fit(df_encoded[features_encoded], df_encoded[target])

In [None]:
# Calculate the confusion matrix.
# NOTE: rfmodel was fitted on this same data, so these are in-sample counts
# and will look better than performance on unseen rows.
# `features` is not defined in this notebook; use `features_encoded`.
yhat_rf = rfmodel.predict(df_encoded[features_encoded])
# Pin the class order so row/column 0 is Default == 0 and row/column 1 is Default == 1.
cm = confusion_matrix(df_encoded[target], yhat_rf, labels=[0, 1])

# Create a heatmap using seaborn.
# Tick labels follow the pinned order: 'no' = did not default (0), 'yes' = defaulted (1).
# They were previously listed as ['yes', 'no'], i.e. attached to the wrong classes.
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues",
            xticklabels=['no', 'yes'], yticklabels=['no', 'yes'], ax=ax)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')
plt.show()


In [None]:
from xgboost import XGBClassifier

# Gradient-boosted baseline with library-default hyper-parameters.
xgb_model = XGBClassifier()

# Fit on the full dataset. `features` is not defined in this notebook;
# the feature list built earlier is `features_encoded`.
xgb_model.fit(df_encoded[features_encoded], df_encoded[target])