In [1]:
import pandas as pd
import matplotlib.pyplot as plt

df = pd.read_parquet('./data/catB_train.parquet')

In [2]:
# Add new feature Age
df = df[df['cltdob_fix']!='None']
df['cltdob_fix'] = pd.to_datetime(df.iloc[:, 6], format ='mixed')
df['age'] = 2024-df['cltdob_fix'].dt.year

In [3]:
# Target Column
df["f_purchase_lh"] = df["f_purchase_lh"].fillna(0)
y = df["f_purchase_lh"]

# All features 
X = df.drop(columns=['f_purchase_lh'])

In [4]:
# Split numerical and non-numerical columns
numeric_cols = X.select_dtypes(include=["int32", "int64", "float64"]).columns
X_numeric = X[numeric_cols]

In [5]:
# Remove Low-Variance Numerical Variables
from sklearn.feature_selection import VarianceThreshold
sel = VarianceThreshold(threshold=(0.05))
sel.fit(X_numeric)
X_numeric = X_numeric[numeric_cols[sel.get_support()]]

In [6]:
# Fill null values in numeric columns with the median value
X_numeric = X_numeric.apply(lambda x: x.fillna(x.median()))

In [7]:
# Merge with selected non_categorical values
temp = pd.get_dummies(X[['cltsex_fix', 'stat_flag']], dtype=float)
X = pd.concat([X_numeric, temp, df['age']], axis=1)

In [8]:
from sklearn.model_selection import train_test_split
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

In [9]:
# Mapping for replacement for categorical data (not hot encoding)
mapping = {
    None: -1,
    'E.BELOW30K': 0,
    'D.30K-60K': 1,
    'C.60K-100K': 2,
    'B.100K-200K': 3,
    'A.ABOVE200K': 4,
}

# Replace values based on the mapping
df['annual_income_est'] = df['annual_income_est'].replace(mapping)

In [13]:
print(y.describe())

count    17970.000000
mean         0.039399
std          0.194548
min          0.000000
25%          0.000000
50%          0.000000
75%          0.000000
max          1.000000
Name: f_purchase_lh, dtype: float64


Select related features

In [136]:
df_new = pd.concat([X,y],axis=1)
#print(df_new.columns)

Select based on mutual info

In [137]:
from sklearn.feature_selection import mutual_info_classif

mutual_info_values = mutual_info_classif(X, y, discrete_features=[False]*X.shape[1])

threshold = 0.005
selected_features = X.columns[mutual_info_values > threshold]

df_new_selected = pd.concat([X[selected_features],y],axis=1)
print(len(selected_features))
#print(df_new_selected.columns)




13


Select based on feature importance

In [97]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score
rf_model = RandomForestRegressor(n_estimators=100, random_state=42)

X_new = df_new_selected.drop(columns=['f_purchase_lh'])
y_new = df_new_selected['f_purchase_lh']

X_new_train, X_new_val, y_new_train, y_new_val = train_test_split(X_new, y_new, test_size=0.2, random_state=0)

rf_model.fit(X_new_train, y_new_train)

feature_importances = rf_model.feature_importances_

importance_df = pd.DataFrame({'Feature': X_new_train.columns, 'Importance': feature_importances})

importance_df = importance_df.sort_values(by='Importance', ascending=False)

print("Feature Importances:")
print(importance_df)

y_pred_test = rf_model.predict(X_new_val)
mse_test_rf = mean_squared_error(y_new_val, y_pred_test)
r2_test_rf = r2_score(y_new_val, y_pred_test)

print(f'Test Mean Squared Error (Random Forest): {mse_test_rf}')
print(f'Test R-squared (Random Forest): {r2_test_rf}')

selected_features2 = importance_df.head(8)['Feature'].tolist()
print(selected_features2)
df_new_selected2 = df_new_selected[selected_features2+['f_purchase_lh']]
print(df_new_selected2.columns)

Feature Importances:
                          Feature  Importance
4                   recency_lapse    0.247752
3   n_months_last_bought_products    0.238608
6                tot_inforce_pols    0.154182
14                cltsex_fix_Male    0.052967
13                recency_giclaim    0.051537
5                  recency_cancel    0.050193
8                   f_hold_507c37    0.041749
10                       f_retail    0.037333
11    n_months_since_visit_affcon    0.036229
12    recency_hlthclaim_unsuccess    0.034832
0                     is_valid_dm    0.020149
9                f_ever_bought_gi    0.013077
1                  is_valid_email    0.009628
2                        is_sg_pr    0.006082
7                 tot_cancel_pols    0.005682
Test Mean Squared Error (Random Forest): 0.03789882993546852
Test R-squared (Random Forest): 0.02756219434665852
['recency_lapse', 'n_months_last_bought_products', 'tot_inforce_pols', 'cltsex_fix_Male', 'recency_giclaim', 'recency_cancel', 'f_

In [98]:
X_new = df_new_selected2.drop(columns=['f_purchase_lh'])
y_new = df_new_selected2['f_purchase_lh']
print(X_new.columns)

Index(['recency_lapse', 'n_months_last_bought_products', 'tot_inforce_pols',
       'cltsex_fix_Male', 'recency_giclaim', 'recency_cancel', 'f_hold_507c37',
       'f_retail'],
      dtype='object')


resampling - original data is imbalanced (too many 0 very 1)

In [105]:
from imblearn.over_sampling import RandomOverSampler, SMOTE
ros = RandomOverSampler(random_state=0)
X_resampled, y_resampled = ros.fit_resample(X, y)
smote = SMOTE(random_state=0)
X_resampled, y_resampled = smote.fit_resample(X, y)


## Logistic regression

normal logistic

In [28]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)
logreg_model = LogisticRegression()

logreg_model.fit(X_train, y_train)

y_val_pred = logreg_model.predict(X_val)

accuracy = accuracy_score(y_val, y_val_pred)
conf_matrix = confusion_matrix(y_val, y_val_pred)
f1 = f1_score(y_val, y_val_pred)
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print(f"F1 Score: {f1}")

Accuracy: 0.9579855314412911
Confusion Matrix:
[[3442    6]
 [ 145    1]]
F1 Score: 0.013071895424836602


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


resampled logistic

In [128]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

smote = SMOTE(random_state=0)
X_resampled_smote, y_resampled_smote = smote.fit_resample(X, y)

adasyn = ADASYN(random_state=0)
X_resampled_adasyn, y_resampled_adasyn = adasyn.fit_resample(X, y)

logreg_model = LogisticRegression(max_iter=1000)

# SMOTE
X_train_smote, X_val_smote, y_train_smote, y_val_smote = train_test_split(X_resampled_smote, y_resampled_smote, test_size=0.2, random_state=0)
logreg_model.fit(X_train_smote, y_train_smote)

# ADASYN
X_train_adasyn, X_val_adasyn, y_train_adasyn, y_val_adasyn = train_test_split(X_resampled_adasyn, y_resampled_adasyn, test_size=0.2, random_state=0)
logreg_model.fit(X_train_adasyn, y_train_adasyn)

# Assuming X and y are your original features and target
X_train_original, X_val_original, y_train_original, y_val_original = train_test_split(X, y, test_size=0.2, random_state=0)
y_val_pred_original = logreg_model.predict(X_val_original)

accuracy_original = accuracy_score(y_val_original, y_val_pred_original)
conf_matrix_original = confusion_matrix(y_val_original, y_val_pred_original)
f1_original = f1_score(y_val_original, y_val_pred_original)

print("Results on Original Data:")
print(f"Accuracy: {accuracy_original}")
print("Confusion Matrix:")
print(conf_matrix_original)
print(f"F1 Score: {f1_original}")



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Results on Original Data:
Accuracy: 0.8010573177518086
Confusion Matrix:
[[2814  634]
 [  81   65]]
F1 Score: 0.15384615384615385


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


resampled logistic with cross-validate

In [116]:
from sklearn.model_selection import cross_val_score, StratifiedKFold

# SMOTE
cv_smote = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
cross_val_results_smote = cross_val_score(logreg_model, X_resampled_smote, y_resampled_smote, cv=cv_smote, scoring='f1')

# ADASYN
cv_adasyn = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
cross_val_results_adasyn = cross_val_score(logreg_model, X_resampled_adasyn, y_resampled_adasyn, cv=cv_adasyn, scoring='f1')

print("Cross-Validation Results with SMOTE:")
print(f"F1 Scores: {cross_val_results_smote}")
print(f"Mean F1 Score: {cross_val_results_smote.mean()}")

print("\nCross-Validation Results with ADASYN:")
print(f"F1 Scores: {cross_val_results_adasyn}")
print(f"Mean F1 Score: {cross_val_results_adasyn.mean()}")

y_val_pred_original = logreg_model.predict(X_val_original)

accuracy_original = accuracy_score(y_val_original, y_val_pred_original)
conf_matrix_original = confusion_matrix(y_val_original, y_val_pred_original)
f1_original = f1_score(y_val_original, y_val_pred_original)

print("\nResults on Original Data:")
print(f"Accuracy: {accuracy_original}")
print("Confusion Matrix:")
print(conf_matrix_original)
print(f"F1 Score: {f1_original}")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Cross-Validation Results with SMOTE:
F1 Scores: [0.78892342 0.78308824 0.78727634 0.79426381 0.79495179]
Mean F1 Score: 0.7897007193404375

Cross-Validation Results with ADASYN:
F1 Scores: [0.77697228 0.7727662  0.77772957 0.78109523 0.78398163]
Mean F1 Score: 0.7785089828542116

Results on Original Data:
Accuracy: 0.7337228714524207
Confusion Matrix:
[[2557  891]
 [  66   80]]
F1 Score: 0.14324082363473592


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


logsitic with selected features

In [99]:
logreg_model = LogisticRegression()
X_new = df_new_selected2.drop(columns=['f_purchase_lh'])
y_new = df_new_selected2['f_purchase_lh']
X_new_train, X_new_val, y_new_train, y_new_val = train_test_split(X_new, y_new, test_size=0.2, random_state=0)
logreg_model.fit(X_new_train, y_new_train)

y_val_pred = logreg_model.predict(X_new_val)

accuracy = accuracy_score(y_new_val, y_val_pred)
conf_matrix = confusion_matrix(y_new_val, y_val_pred)
f1 = f1_score(y_new_val, y_val_pred)
print(f"Accuracy: {accuracy}")
print("Confusion Matrix:")
print(conf_matrix)
print(f"F1 Score: {f1}")
#accuracy_score(y_val, [0]*3594)

Accuracy: 0.9590984974958264
Confusion Matrix:
[[3447    1]
 [ 146    0]]
F1 Score: 0.0


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


adjusted logistic

In [132]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

logreg_model = LogisticRegression()
logreg_model.fit(X_train, y_train)

y_val_prob = logreg_model.predict_proba(X_val)[:, 1]

# custom threshold
custom_threshold = 0.15
y_val_pred_custom = (y_val_prob > custom_threshold).astype(int)

accuracy_custom = accuracy_score(y_val, y_val_pred_custom)
conf_matrix_custom = confusion_matrix(y_val, y_val_pred_custom)
f1_custom = f1_score(y_val, y_val_pred_custom)

# Print the results
print(f"Accuracy with Custom Threshold: {accuracy_custom}")
print("Confusion Matrix with Custom Threshold:")
print(conf_matrix_custom)
print(f"F1 Score with Custom Threshold: {f1_custom}")


Accuracy with Custom Threshold: 0.9426822481914302
Confusion Matrix with Custom Threshold:
[[3350   98]
 [ 108   38]]
F1 Score with Custom Threshold: 0.2695035460992908


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


adjusted logistic with cross-validation

In [134]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

logreg_model = LogisticRegression()

# custom threshold
custom_threshold = 0.15

# cross-validation
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

accuracy_scores = []
conf_matrices = []
f1_scores = []

# cross-validation
for train_idx, val_idx in cv.split(X_train, y_train):
    X_train_fold, X_val_fold = X_train.iloc[train_idx], X_train.iloc[val_idx]
    y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]

    logreg_model.fit(X_train_fold, y_train_fold)
    y_val_prob = logreg_model.predict_proba(X_val_fold)[:, 1]

    y_val_pred_custom = (y_val_prob > custom_threshold).astype(int)

    accuracy_fold = accuracy_score(y_val_fold, y_val_pred_custom)
    conf_matrix_fold = confusion_matrix(y_val_fold, y_val_pred_custom)
    f1_fold = f1_score(y_val_fold, y_val_pred_custom)

    accuracy_scores.append(accuracy_fold)
    conf_matrices.append(conf_matrix_fold)
    f1_scores.append(f1_fold)

print(f"Average Accuracy with Custom Threshold: {sum(accuracy_scores) / len(accuracy_scores)}")
print("Average Confusion Matrix with Custom Threshold:")
print(sum(conf_matrices) / len(conf_matrices))
print(f"Average F1 Score with Custom Threshold: {sum(f1_scores) / len(f1_scores)}")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

Average Accuracy with Custom Threshold: 0.9380218903065852
Average Confusion Matrix with Custom Threshold:
[[2669.    93.8]
 [  84.4   28. ]]
Average F1 Score with Custom Threshold: 0.23861971868864423


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


adjusted logistic with selected features

In [135]:
X_train, X_val, y_train, y_val = train_test_split(X_new, y_new, test_size=0.2, random_state=0)

logreg_model = LogisticRegression()
logreg_model.fit(X_train, y_train)
y_val_prob = logreg_model.predict_proba(X_val)[:, 1]

custom_threshold = 0.15
y_val_pred_custom = (y_val_prob > custom_threshold).astype(int)

accuracy_custom = accuracy_score(y_val, y_val_pred_custom)
conf_matrix_custom = confusion_matrix(y_val, y_val_pred_custom)
f1_custom = f1_score(y_val, y_val_pred_custom)

print(f"Accuracy with Custom Threshold: {accuracy_custom}")
print("Confusion Matrix with Custom Threshold:")
print(conf_matrix_custom)
print(f"F1 Score with Custom Threshold: {f1_custom}")


Accuracy with Custom Threshold: 0.9501947690595437
Confusion Matrix with Custom Threshold:
[[3393   55]
 [ 124   22]]
F1 Score with Custom Threshold: 0.19730941704035873


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


## Random Forest

normal random forest with decision tree

In [83]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

# Decision Tree
dt_model = DecisionTreeClassifier(random_state=0,class_weight='balanced')
dt_model.fit(X_train, y_train)

y_val_pred_dt = dt_model.predict(X_val)

accuracy_dt = accuracy_score(y_val, y_val_pred_dt)
conf_matrix_dt = confusion_matrix(y_val, y_val_pred_dt)
f1_dt = f1_score(y_val, y_val_pred_dt)

print("Decision Tree Results:")
print(f"Accuracy: {accuracy_dt}")
print("Confusion Matrix:")
print(conf_matrix_dt)
print(f"F1 Score: {f1_dt}")

# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=0,class_weight='balanced')
rf_model.fit(X_train, y_train)

y_val_pred_rf = rf_model.predict(X_val)

accuracy_rf = accuracy_score(y_val, y_val_pred_rf)
conf_matrix_rf = confusion_matrix(y_val, y_val_pred_rf)
f1_rf = f1_score(y_val, y_val_pred_rf)

print("\nRandom Forest Results:")
print(f"Accuracy: {accuracy_rf}")
print("Confusion Matrix:")
print(conf_matrix_rf)
print(f"F1 Score: {f1_rf}")


Decision Tree Results:
Accuracy: 0.9301613800779076
Confusion Matrix:
[[3318  130]
 [ 121   25]]
F1 Score: 0.16611295681063123

Random Forest Results:
Accuracy: 0.9596549805230941
Confusion Matrix:
[[3445    3]
 [ 142    4]]
F1 Score: 0.05228758169934641


random forest & decision tree with custom threshold

In [84]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

# Decision Tree
dt_model = DecisionTreeClassifier(random_state=0,class_weight='balanced')
dt_model.fit(X_train, y_train)

y_val_prob_dt = dt_model.predict_proba(X_val)[:, 1]
custom_threshold = 0.15  # Adjust as needed
y_val_pred_custom_dt = (y_val_prob_dt > custom_threshold).astype(int)

accuracy_dt = accuracy_score(y_val, y_val_pred_custom_dt)
conf_matrix_dt = confusion_matrix(y_val, y_val_pred_custom_dt)
f1_dt = f1_score(y_val, y_val_pred_custom_dt)

print("Decision Tree Results:")
print(f"Accuracy: {accuracy_dt}")
print("Confusion Matrix:")
print(conf_matrix_dt)
print(f"F1 Score: {f1_dt}")

# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=0,class_weight='balanced')
rf_model.fit(X_train, y_train)

y_val_prob_rf = rf_model.predict_proba(X_val)[:, 1]
y_val_pred_custom_rf = (y_val_prob_rf > custom_threshold).astype(int)

accuracy_rf = accuracy_score(y_val, y_val_pred_custom_rf)
conf_matrix_rf = confusion_matrix(y_val, y_val_pred_custom_rf)
f1_rf = f1_score(y_val, y_val_pred_custom_rf)

print("\nRandom Forest Results:")
print(f"Accuracy: {accuracy_rf}")
print("Confusion Matrix:")
print(conf_matrix_rf)
print(f"F1 Score: {f1_rf}")


Decision Tree Results:
Accuracy: 0.9301613800779076
Confusion Matrix:
[[3318  130]
 [ 121   25]]
F1 Score: 0.16611295681063123

Random Forest Results:
Accuracy: 0.9354479688369505
Confusion Matrix:
[[3316  132]
 [ 100   46]]
F1 Score: 0.2839506172839506


custom rf & dt with cross-validation 

In [121]:
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, make_scorer
import numpy as np

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

# Decision Tree
dt_model = DecisionTreeClassifier(random_state=0, class_weight='balanced')
dt_model.fit(X_train, y_train)

y_val_prob_dt = dt_model.predict_proba(X_val)[:, 1]
custom_threshold = 0.15 
y_val_pred_custom_dt = (y_val_prob_dt > custom_threshold).astype(int)

accuracy_dt = accuracy_score(y_val, y_val_pred_custom_dt)
conf_matrix_dt = confusion_matrix(y_val, y_val_pred_custom_dt)
f1_dt = f1_score(y_val, y_val_pred_custom_dt)

print("Decision Tree Results:")
print(f"Accuracy: {accuracy_dt}")
print("Confusion Matrix:")
print(conf_matrix_dt)
print(f"F1 Score: {f1_dt}")

# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=0, class_weight='balanced')
rf_model.fit(X_train, y_train)

y_val_prob_rf = rf_model.predict_proba(X_val)[:, 1]
y_val_pred_custom_rf = (y_val_prob_rf > custom_threshold).astype(int)

accuracy_rf = accuracy_score(y_val, y_val_pred_custom_rf)
conf_matrix_rf = confusion_matrix(y_val, y_val_pred_custom_rf)
f1_rf = f1_score(y_val, y_val_pred_custom_rf)

print("\nRandom Forest Results:")
print(f"Accuracy: {accuracy_rf}")
print("Confusion Matrix:")
print(conf_matrix_rf)
print(f"F1 Score: {f1_rf}")


scorer = make_scorer(f1_score)

# Decision Tree Cross-Validation Results
cv_scores_dt = cross_val_score(dt_model, X, y, cv=5, scoring=scorer)
print("\nDecision Tree Cross-Validation Results:")
print(f"Average F1 Score: {np.mean(cv_scores_dt)}")

# Random Forest Cross-Validation Results
cv_scores_rf = cross_val_score(rf_model, X, y, cv=5, scoring=scorer)
print("\nRandom Forest Cross-Validation Results:")
print(f"Average F1 Score: {np.mean(cv_scores_rf)}")


Decision Tree Results:
Accuracy: 0.9301613800779076
Confusion Matrix:
[[3318  130]
 [ 121   25]]
F1 Score: 0.16611295681063123

Random Forest Results:
Accuracy: 0.9354479688369505
Confusion Matrix:
[[3316  132]
 [ 100   46]]
F1 Score: 0.2839506172839506

Decision Tree Cross-Validation Results:
Average F1 Score: 0.18909699544002626

Random Forest Cross-Validation Results:
Average F1 Score: 0.0510024022911057


random forest and decision tree with selected features

In [101]:
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

X_train, X_val, y_train, y_val = train_test_split(X_new, y_new, test_size=0.2, random_state=0)

# Decision Tree
dt_model = DecisionTreeClassifier(random_state=0,class_weight='balanced')
dt_model.fit(X_train, y_train)

y_val_pred_dt = dt_model.predict(X_val)

accuracy_dt = accuracy_score(y_val, y_val_pred_dt)
conf_matrix_dt = confusion_matrix(y_val, y_val_pred_dt)
f1_dt = f1_score(y_val, y_val_pred_dt)

print("Decision Tree Results:")
print(f"Accuracy: {accuracy_dt}")
print("Confusion Matrix:")
print(conf_matrix_dt)
print(f"F1 Score: {f1_dt}")

# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=0,class_weight='balanced')
rf_model.fit(X_train, y_train)

y_val_pred_rf = rf_model.predict(X_val)

accuracy_rf = accuracy_score(y_val, y_val_pred_rf)
conf_matrix_rf = confusion_matrix(y_val, y_val_pred_rf)
f1_rf = f1_score(y_val, y_val_pred_rf)

print("\nRandom Forest Results:")
print(f"Accuracy: {accuracy_rf}")
print("Confusion Matrix:")
print(conf_matrix_rf)
print(f"F1 Score: {f1_rf}")


Decision Tree Results:
Accuracy: 0.8667223149693934
Confusion Matrix:
[[3083  365]
 [ 114   32]]
F1 Score: 0.11786372007366484

Random Forest Results:
Accuracy: 0.9151363383416806
Confusion Matrix:
[[3268  180]
 [ 125   21]]
F1 Score: 0.12103746397694524


dt rf with grid search

In [102]:
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

dt_model = DecisionTreeClassifier(random_state=0, class_weight='balanced')
rf_model = RandomForestClassifier(n_estimators=100, random_state=0, class_weight='balanced')

param_grid = {
    'max_depth': [None, 10, 20],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4]
}

# Grid Search for Decision Tree
grid_search_dt = GridSearchCV(estimator=dt_model, param_grid=param_grid, scoring='f1', cv=3)
grid_search_dt.fit(X_train, y_train)

# Grid Search for Random Forest
grid_search_rf = GridSearchCV(estimator=rf_model, param_grid=param_grid, scoring='f1', cv=3)
grid_search_rf.fit(X_train, y_train)

# hyperparameters
best_params_dt = grid_search_dt.best_params_
best_params_rf = grid_search_rf.best_params_

y_val_pred_dt = grid_search_dt.best_estimator_.predict(X_val)
y_val_pred_rf = grid_search_rf.best_estimator_.predict(X_val)

# Decision Tree
accuracy_dt = accuracy_score(y_val, y_val_pred_dt)
conf_matrix_dt = confusion_matrix(y_val, y_val_pred_dt)
f1_dt = f1_score(y_val, y_val_pred_dt)

# Random Forest
accuracy_rf = accuracy_score(y_val, y_val_pred_rf)
conf_matrix_rf = confusion_matrix(y_val, y_val_pred_rf)
f1_rf = f1_score(y_val, y_val_pred_rf)

print("Decision Tree Results:")
print(f"Best Hyperparameters: {best_params_dt}")
print(f"Accuracy: {accuracy_dt}")
print("Confusion Matrix:")
print(conf_matrix_dt)
print(f"F1 Score: {f1_dt}")

print("\nRandom Forest Results:")
print(f"Best Hyperparameters: {best_params_rf}")
print(f"Accuracy: {accuracy_rf}")
print("Confusion Matrix:")
print(conf_matrix_rf)
print(f"F1 Score: {f1_rf}")


Decision Tree Results:
Best Hyperparameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 5}
Accuracy: 0.8825820812465219
Confusion Matrix:
[[3134  314]
 [ 108   38]]
F1 Score: 0.15261044176706826

Random Forest Results:
Best Hyperparameters: {'max_depth': 20, 'min_samples_leaf': 4, 'min_samples_split': 10}
Accuracy: 0.9471341124095715
Confusion Matrix:
[[3367   81]
 [ 109   37]]
F1 Score: 0.28030303030303033


dt & rf with random search

In [103]:
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from scipy.stats import randint

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

dt_model = DecisionTreeClassifier(random_state=0, class_weight='balanced')
rf_model = RandomForestClassifier(n_estimators=100, random_state=0, class_weight='balanced')

param_dist = {
    'max_depth': [None, 10, 20],
    'min_samples_split': randint(2, 11),
    'min_samples_leaf': randint(1, 5)
}

# Random Search for Decision Tree
random_search_dt = RandomizedSearchCV(estimator=dt_model, param_distributions=param_dist, scoring='f1', cv=3, n_iter=10, random_state=0)
random_search_dt.fit(X_train, y_train)

# Random Search for Random Forest
random_search_rf = RandomizedSearchCV(estimator=rf_model, param_distributions=param_dist, scoring='f1', cv=3, n_iter=10, random_state=0)
random_search_rf.fit(X_train, y_train)

# hyperparameters
best_params_dt = random_search_dt.best_params_
best_params_rf = random_search_rf.best_params_
y_val_pred_dt = random_search_dt.best_estimator_.predict(X_val)
y_val_pred_rf = random_search_rf.best_estimator_.predict(X_val)

# Decision Tree
accuracy_dt = accuracy_score(y_val, y_val_pred_dt)
conf_matrix_dt = confusion_matrix(y_val, y_val_pred_dt)
f1_dt = f1_score(y_val, y_val_pred_dt)

# Random Forest
accuracy_rf = accuracy_score(y_val, y_val_pred_rf)
conf_matrix_rf = confusion_matrix(y_val, y_val_pred_rf)
f1_rf = f1_score(y_val, y_val_pred_rf)

print("Decision Tree Results:")
print(f"Best Hyperparameters: {best_params_dt}")
print(f"Accuracy: {accuracy_dt}")
print("Confusion Matrix:")
print(conf_matrix_dt)
print(f"F1 Score: {f1_dt}")

print("\nRandom Forest Results:")
print(f"Best Hyperparameters: {best_params_rf}")
print(f"Accuracy: {accuracy_rf}")
print("Confusion Matrix:")
print(conf_matrix_rf)
print(f"F1 Score: {f1_rf}")


Decision Tree Results:
Best Hyperparameters: {'max_depth': 20, 'min_samples_leaf': 1, 'min_samples_split': 3}
Accuracy: 0.889259877573734
Confusion Matrix:
[[3158  290]
 [ 108   38]]
F1 Score: 0.16033755274261605

Random Forest Results:
Best Hyperparameters: {'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 10}
Accuracy: 0.9020589872008904
Confusion Matrix:
[[3181  267]
 [  85   61]]
F1 Score: 0.25738396624472576


## SVM

In [125]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

svm_model = SVC(probability=True)  
svm_model.fit(X_train, y_train)

y_val_prob_svm = svm_model.predict_proba(X_val)[:, 1]
y_val_pred_svm = svm_model.predict(X_val)

accuracy_svm = accuracy_score(y_val, y_val_pred_svm)
conf_matrix_svm = confusion_matrix(y_val, y_val_pred_svm)
f1_svm = f1_score(y_val, y_val_pred_svm)

print(f"Accuracy without Custom Threshold (SVM): {accuracy_svm}")
print("Confusion Matrix without Custom Threshold (SVM):")
print(conf_matrix_svm)
print(f"F1 Score without Custom Threshold (SVM): {f1_svm}")


Accuracy without Custom Threshold (SVM): 0.9593767390094602
Confusion Matrix without Custom Threshold (SVM):
[[3448    0]
 [ 146    0]]
F1 Score without Custom Threshold (SVM): 0.0


svm with resampling

In [127]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score
from imblearn.over_sampling import SMOTE

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

smote = SMOTE(random_state=0)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

svm_model = SVC(kernel='linear', C=0.1, probability=True)
svm_model.fit(X_resampled, y_resampled)

y_val_prob_svm = svm_model.predict_proba(X_val)[:, 1]
y_val_pred_svm = svm_model.predict(X_val)

accuracy_svm = accuracy_score(y_val, y_val_pred_svm)
conf_matrix_svm = confusion_matrix(y_val, y_val_pred_svm)
f1_svm = f1_score(y_val, y_val_pred_svm)

print(f"Accuracy without Custom Threshold (SVM): {accuracy_svm}")
print("Confusion Matrix without Custom Threshold (SVM):")
print(conf_matrix_svm)
print(f"F1 Score without Custom Threshold (SVM): {f1_svm}")


Accuracy without Custom Threshold (SVM): 0.8196994991652755
Confusion Matrix without Custom Threshold (SVM):
[[2889  559]
 [  89   57]]
F1 Score without Custom Threshold (SVM): 0.14960629921259844
