In [20]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, cross_val_score, KFold
import lightgbm as lgb
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, average_precision_score, classification_report
import matplotlib.pyplot as plt
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import RandomOverSampler
from time import time
import xgboost as xgb
from hyperopt import Trials, hp, fmin, tpe, STATUS_OK
from tqdm.auto import tqdm
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.decomposition import PCA
from sklearn.svm import SVC
from sklearn.svm import OneClassSVM
import optuna


In [21]:
df_train = pd.read_parquet('./parquets/P03_train.pq')
df_test = pd.read_parquet('./parquets/P03_test.pq')

In [22]:
import pandas as pd
from scipy.stats import chi2_contingency

# Assuming you have your dataset loaded into 'df_train' (a pandas DataFrame)
# and the target variable is 'target_column_name'

# Create a contingency table for each feature with the target variable
contingency_tables = []
for feature_column in df_train.columns:
    contingency_table = pd.crosstab(df_train[feature_column], df_train['flag'])
    contingency_tables.append(contingency_table)

# Calculate chi-squared statistic, p-value, degrees of freedom, and expected frequencies
feature_importance_results = []
for contingency_table in contingency_tables:
    chi2, p_value, dof, expected = chi2_contingency(contingency_table)
    feature_importance_results.append({'Feature': contingency_table.index.name, 'Chi-squared': chi2, 'p-value': p_value})

# Convert the results to a DataFrame for easy visualization
results_df = pd.DataFrame(feature_importance_results)

# Sort the results based on chi-squared values in descending order
results_df = results_df.sort_values(by='Chi-squared', ascending=False)

print(results_df)


                       Feature    Chi-squared        p-value
0                           id  175000.000000   4.988761e-01
1                         flag  174966.717465   0.000000e+00
13  pre_loans_credit_cost_rate    1022.184686  3.184142e-211
31                  enc_paym_1     836.606620  4.976220e-181
32                  enc_paym_2     816.479351  1.153998e-176
..                         ...            ...            ...
18                 pre_loans90       6.484555   9.027360e-02
58       enc_loans_account_cur       4.419185   1.097453e-01
60                 fclose_flag       4.116843   4.245823e-02
11     pre_loans_total_overdue       0.000000   1.000000e+00
17               pre_loans6090       0.000000   1.000000e+00

[61 rows x 3 columns]


In [23]:
significant_features = results_df[results_df['p-value'] < 0.05]['Feature'].tolist()

df = df_train[significant_features]

In [24]:
X = df.drop('flag', axis=1)
y = df['flag']

In [25]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [26]:
scaler = StandardScaler()
X_train_std = scaler.fit_transform(X_train)
X_val_std = scaler.transform(X_val)

In [27]:
n_components = 15 # Choose the number of principal components (you can experiment with different values)
pca = PCA(n_components=n_components)
X_train_pca = pca.fit_transform(X_train_std)
X_val_pca = pca.transform(X_val_std)

In [28]:
def objective(trial):
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'verbosity': -1,
        'n_jobs': -1,
        'num_leaves': trial.suggest_int('num_leaves', 10, 100),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log = True),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
        'scale_pos_weight': len(y_train[y_train == 0]) / len(y_train[y_train == 1])        
    }

    model = lgb.LGBMClassifier(**params)

    # Use cross-validation to evaluate the model's performance
    score = cross_val_score(model, X_train_pca, y_train, n_jobs=-1, cv=3, scoring='precision').mean()

    return score


In [29]:
sampler = optuna.samplers.TPESampler(seed=42)  # Optuna sampler (Tree-structured Parzen Estimator)
study = optuna.create_study(direction='maximize', sampler=sampler)
study.optimize(objective, n_trials=100)  # You can adjust the number of trials as needed

[I 2023-08-01 22:17:26,217] A new study created in memory with name: no-name-7d73b8e2-a4a1-44e5-8da5-6d7b819aeea9
[I 2023-08-01 22:17:34,764] Trial 0 finished with value: 0.0683966347232075 and parameters: {'num_leaves': 44, 'learning_rate': 0.07969454818643935, 'n_estimators': 746, 'min_child_samples': 60}. Best is trial 0 with value: 0.0683966347232075.
[I 2023-08-01 22:17:36,715] Trial 1 finished with value: 0.0 and parameters: {'num_leaves': 24, 'learning_rate': 0.002051110418843397, 'n_estimators': 105, 'min_child_samples': 87}. Best is trial 0 with value: 0.0683966347232075.
[I 2023-08-01 22:17:38,242] Trial 2 finished with value: 0.06498488189474785 and parameters: {'num_leaves': 64, 'learning_rate': 0.02607024758370768, 'n_estimators': 69, 'min_child_samples': 97}. Best is trial 0 with value: 0.0683966347232075.
[I 2023-08-01 22:17:43,190] Trial 3 finished with value: 0.0 and parameters: {'num_leaves': 85, 'learning_rate': 0.0026587543983272706, 'n_estimators': 222, 'min_child_

In [30]:
params = study.best_params
params['scale_pos_weight'] = len(y_train[y_train == 0]) / len(y_train[y_train == 1])
model_new = lgb.LGBMClassifier(**params)
model_new.fit(X_train_pca, y_train)
y_pred_new = model_new.predict(X_val_pca)

[LightGBM] [Info] Number of positive: 4354, number of negative: 135646
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3825
[LightGBM] [Info] Number of data points in the train set: 140000, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.031100 -> initscore=-3.438954
[LightGBM] [Info] Start training from score -3.438954


In [31]:
print("Accuracy:", accuracy_score(y_val, y_pred_new))
print("Precision:", precision_score(y_val, y_pred_new))
print("Recall:", recall_score(y_val, y_pred_new))
print("F1 Score:", f1_score(y_val, y_pred_new))
print("ROC AUC Score:", roc_auc_score(y_val, y_pred_new))

Accuracy: 0.9657142857142857
Precision: 0.0949367088607595
Recall: 0.013992537313432836
F1 Score: 0.024390243902439025
ROC AUC Score: 0.5048888647425452


In [32]:
def objective2(trial):
    params = {
        'objective': 'binary',
        'metric': 'binary_logloss',
        'boosting_type': 'gbdt',
        'verbosity': -1,
        'n_jobs': -1,
        'num_leaves': trial.suggest_int('num_leaves', 10, 100),
        'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1, log = True),
        'n_estimators': trial.suggest_int('n_estimators', 50, 1000),
        'min_child_samples': trial.suggest_int('min_child_samples', 1, 100),
        'scale_pos_weight': len(y_train[y_train == 0]) / len(y_train[y_train == 1])        
    }

    model = lgb.LGBMClassifier(**params)

    # Use cross-validation to evaluate the model's performance
    score = cross_val_score(model, X_train_pca, y_train, n_jobs=-1, cv=3, scoring='recall').mean()

    return score

In [33]:
sampler = optuna.samplers.TPESampler(seed=42)  # Optuna sampler (Tree-structured Parzen Estimator)
study2 = optuna.create_study(direction='maximize', sampler=sampler)
study2.optimize(objective2, n_trials=100)  # You can adjust the number of trials as needed

[I 2023-08-01 22:33:49,088] A new study created in memory with name: no-name-31055935-aa8e-433b-b2a8-ff8a76f7f5e9
[I 2023-08-01 22:33:56,618] Trial 0 finished with value: 0.13550785722015593 and parameters: {'num_leaves': 44, 'learning_rate': 0.07969454818643935, 'n_estimators': 746, 'min_child_samples': 60}. Best is trial 0 with value: 0.13550785722015593.
[I 2023-08-01 22:33:58,405] Trial 1 finished with value: 0.0 and parameters: {'num_leaves': 24, 'learning_rate': 0.002051110418843397, 'n_estimators': 105, 'min_child_samples': 87}. Best is trial 0 with value: 0.13550785722015593.
[I 2023-08-01 22:34:00,270] Trial 2 finished with value: 0.37344847510250684 and parameters: {'num_leaves': 64, 'learning_rate': 0.02607024758370768, 'n_estimators': 69, 'min_child_samples': 97}. Best is trial 2 with value: 0.37344847510250684.
[I 2023-08-01 22:34:04,970] Trial 3 finished with value: 0.0 and parameters: {'num_leaves': 85, 'learning_rate': 0.0026587543983272706, 'n_estimators': 222, 'min_ch

In [34]:
params2 = study2.best_params
params2['scale_pos_weight'] = len(y_train[y_train == 0]) / len(y_train[y_train == 1])
model_new2 = lgb.LGBMClassifier(**params2)
model_new2.fit(X_train_pca, y_train)
y_pred_new2 = model_new2.predict(X_val_pca)

[LightGBM] [Info] Number of positive: 4354, number of negative: 135646
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3825
[LightGBM] [Info] Number of data points in the train set: 140000, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.031100 -> initscore=-3.438954
[LightGBM] [Info] Start training from score -3.438954


In [35]:
print("Accuracy:", accuracy_score(y_val, y_pred_new2))
print("Precision:", precision_score(y_val, y_pred_new2))
print("Recall:", recall_score(y_val, y_pred_new2))
print("F1 Score:", f1_score(y_val, y_pred_new2))
print("ROC AUC Score:", roc_auc_score(y_val, y_pred_new2))

Accuracy: 0.6215428571428572
Precision: 0.04971149578339991
Recall: 0.6268656716417911
F1 Score: 0.09211788896504454
ROC AUC Score: 0.6241201737129022


In [38]:
from sklearn.linear_model import LogisticRegression

logreg_model = LogisticRegression(class_weight='balanced')
logreg_model.fit(X_train_pca, y_train)

logreg_predictions = logreg_model.predict(X_val_pca)

print("Accuracy:", accuracy_score(y_val, logreg_predictions))
print("Precision:", precision_score(y_val, logreg_predictions))
print("Recall:", recall_score(y_val, logreg_predictions))
print("F1 Score:", f1_score(y_val, logreg_predictions))
print("ROC AUC Score:", roc_auc_score(y_val, logreg_predictions))

Accuracy: 0.609
Precision: 0.04665372726619222
Recall: 0.605410447761194
F1 Score: 0.08663151571781351
ROC AUC Score: 0.6072619322041056


In [65]:
def objective3(trial):
    params = {
        'objective': 'binary:hinge',  
        'eta': trial.suggest_float('eta', 0.001, 0.1, log=True),  
        'max_depth': trial.suggest_int('max_depth', 3, 9),
        'min_child_weight': trial.suggest_float('min_child_weight', 0.1, 10, log=True),
        'gamma': trial.suggest_float('gamma', 0.001, 1, log=True),  
        'subsample': trial.suggest_float('subsample', 0.1, 1),  
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.1, 1),
        'eval_metric': 'logloss'
    }

    dtrain = xgb.DMatrix(X_train_pca, label=y_train)

    # Train the model with the suggested hyperparameters
    num_rounds = 100
    model = xgb.train(params, dtrain, num_rounds)

    # Make predictions on the test set
    dtest = xgb.DMatrix(X_val_pca)
    y_pred = model.predict(dtest)

    # Convert predicted labels from float to int
    y_pred = y_pred.astype(int)

    # Calculate precision
    recall = recall_score(y_val, y_pred)

    # Optuna aims to minimize the objective function, so return the negative precision
    return -recall

In [66]:
study3 = optuna.create_study(direction='minimize')  # Optuna aims to minimize the objective function (negative accuracy)
study3.optimize(objective3, n_trials=10, show_progress_bar=True)  # You can adjust the number of trials as needed


[I 2023-08-01 23:34:58,590] A new study created in memory with name: no-name-6e52a299-93e1-4e6d-bf84-38d16e1e6c74


  0%|          | 0/10 [00:00<?, ?it/s]

[I 2023-08-01 23:35:03,379] Trial 0 finished with value: -0.0 and parameters: {'eta': 0.09206953955933338, 'max_depth': 4, 'min_child_weight': 5.680080942740105, 'gamma': 0.24073129453894002, 'subsample': 0.4631828466537756, 'colsample_bytree': 0.5070734326362988}. Best is trial 0 with value: -0.0.
[I 2023-08-01 23:35:06,656] Trial 1 finished with value: -1.0 and parameters: {'eta': 0.0019769292421800712, 'max_depth': 3, 'min_child_weight': 7.396540934305601, 'gamma': 0.10517333701276899, 'subsample': 0.6002967643105485, 'colsample_bytree': 0.35240435898152556}. Best is trial 1 with value: -1.0.
[I 2023-08-01 23:35:10,952] Trial 2 finished with value: -1.0 and parameters: {'eta': 0.001616637677407731, 'max_depth': 3, 'min_child_weight': 8.393711738725088, 'gamma': 0.4895593272062752, 'subsample': 0.36263473416873415, 'colsample_bytree': 0.8771562559258164}. Best is trial 1 with value: -1.0.
[I 2023-08-01 23:35:24,714] Trial 3 finished with value: -1.0 and parameters: {'eta': 0.00117781

In [67]:
params3 = study3.best_params
params3['scale_pos_weight'] = len(y_train[y_train == 0]) / len(y_train[y_train == 1])
model_new3 = xgb.XGBClassifier(**params3)
model_new3.fit(X_train_pca, y_train)
y_pred_new3 = model_new3.predict(X_val_pca)

In [120]:
print("Accuracy:", accuracy_score(y_val, y_pred_new3))
print("Precision:", precision_score(y_val, y_pred_new3))
print("Recall:", recall_score(y_val, y_pred_new3))
print("F1 Score:", f1_score(y_val, y_pred_new3))
print("ROC AUC Score:", roc_auc_score(y_val, y_pred_new3))

Accuracy: 0.6569714285714285
Precision: 0.0512967826657912
Recall: 0.5830223880597015
F1 Score: 0.09429692214846107
ROC AUC Score: 0.6211651671493981


In [121]:
model_new4 = xgb.XGBClassifier(scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1]))
model_new4.fit(X_train_pca, y_train)
y_pred_new4 = model_new4.predict(X_val_pca)

print("Accuracy:", accuracy_score(y_val, y_pred_new4))
print("Precision:", precision_score(y_val, y_pred_new4))
print("Recall:", recall_score(y_val, y_pred_new4))
print("F1 Score:", f1_score(y_val, y_pred_new4))
print("ROC AUC Score:", roc_auc_score(y_val, y_pred_new4))

Accuracy: 0.7722571428571429
Precision: 0.05241987803295705
Recall: 0.376865671641791
F1 Score: 0.0920378175190796
ROC AUC Score: 0.5808078652950762


In [125]:
model_new5 = lgb.LGBMClassifier(scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1]))
model_new5.fit(X_train_pca, y_train)
y_pred_new5 = model_new5.predict(X_val_pca)

print("Accuracy:", accuracy_score(y_val, y_pred_new5))
print("Precision:", precision_score(y_val, y_pred_new5))
print("Recall:", recall_score(y_val, y_pred_new5))
print("F1 Score:", f1_score(y_val, y_pred_new5))
print("ROC AUC Score:", roc_auc_score(y_val, y_pred_new5))

[LightGBM] [Info] Number of positive: 4354, number of negative: 135646
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3825
[LightGBM] [Info] Number of data points in the train set: 140000, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.031100 -> initscore=-3.438954
[LightGBM] [Info] Start training from score -3.438954
Accuracy: 0.6821714285714285
Precision: 0.051970048136922804
Recall: 0.5438432835820896
F1 Score: 0.09487388120423107
ROC AUC Score: 0.6151926863560059


In [132]:
model_svc = SVC(class_weight='balanced')
model_svc.fit(X_train_pca, y_train)
svc_pred = model_svc.predict(X_val_pca)

print("Accuracy:", accuracy_score(y_val, svc_pred))
print("Precision:", precision_score(y_val, svc_pred))
print("Recall:", recall_score(y_val, svc_pred))
print("F1 Score:", f1_score(y_val, svc_pred))
print("ROC AUC Score:", roc_auc_score(y_val, svc_pred))

Accuracy: 0.6283428571428571
Precision: 0.048084204149628954
Recall: 0.5923507462686567
F1 Score: 0.0889480319372461
ROC AUC Score: 0.6109154108612795


In [136]:
ensemble_model = VotingClassifier(
    estimators=[('model1', model_new), ('model2', model_new2), ('rf', rf), ('logreg_model', logreg_model), ('model3', model_new3), ('model4', model_new4), ('model5', model_new5)],
    voting='soft'  # 'soft' for probabilities-based voting, 'hard' for majority voting
)
ensemble_model.fit(X_train_pca, y_train)
ensemble_predictions = ensemble_model.predict(X_val_pca)

[LightGBM] [Info] Number of positive: 4354, number of negative: 135646
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3825
[LightGBM] [Info] Number of data points in the train set: 140000, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.031100 -> initscore=-3.438954
[LightGBM] [Info] Start training from score -3.438954
[LightGBM] [Info] Number of positive: 4354, number of negative: 135646
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3825
[LightGBM] [Info] Number of data points in the train set: 140000, number of used features: 15
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.031100 -> initscore=-3.438954
[LightGBM] [Info] Start training from score -3.438954
[LightGBM] [Info] Number of positive: 4354, number of negative: 135646
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 3825
[LightGBM] [Info] Number of data points in the train se

In [137]:
print("Accuracy:", accuracy_score(y_val, ensemble_predictions))
print("Precision:", precision_score(y_val, ensemble_predictions))
print("Recall:", recall_score(y_val, ensemble_predictions))
print("F1 Score:", f1_score(y_val, ensemble_predictions))
print("ROC AUC Score:", roc_auc_score(y_val, ensemble_predictions))

Accuracy: 0.8942571428571429
Precision: 0.07719523962688968
Recall: 0.22388059701492538
F1 Score: 0.11480507055728296
ROC AUC Score: 0.569659586411259


In [84]:
significant_features.remove('flag')

In [86]:
df1 = df_test[significant_features]

In [87]:
test_scaled = scaler.transform(df1)
test_final = pca.transform(test_scaled)

In [138]:
Final_ans = ensemble_model.predict(test_final)

In [140]:
np.savetxt("Final_ans.csv", Final_ans, delimiter=",")