<a href="https://colab.research.google.com/github/Fikaaw/amazing-feat-eng/blob/main/datathon_final.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, MinMaxScaler, FunctionTransformer, PowerTransformer
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import confusion_matrix, classification_report, average_precision_score
import lightgbm as lgb

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.preprocessing import MinMaxScaler, PolynomialFeatures, OneHotEncoder, LabelEncoder, TargetEncoder
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer, ConfusionMatrixDisplay
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.utils.class_weight import compute_class_weight

from scipy.stats import mode

import optuna

In [None]:
# Read the five competition CSVs from the parent directory in one pass.
# The order of the paths must match the order of the names on the left.
train, test, loan, non_borrower, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../train.csv',
        '../test.csv',
        '../loan_activities.csv',
        '../non_borrower_user.csv',
        '../sample_submission.csv',
    )
)

In [None]:
# Inspect the target column of the training set.
train['label']

0         0
1         0
2         0
3         0
4         0
         ..
857894    0
857895    0
857896    0
857897    0
857898    0
Name: label, Length: 857899, dtype: int64

In [None]:
# Work on a copy so the raw `train` frame stays untouched.
train_df = train.copy()

# Target vector and feature matrix (everything except the target column).
train_label = train_df['label']
train_feature = train_df.drop(columns='label')

In [None]:
# Preview the training features (every column of train except 'label').
train_feature

Unnamed: 0,user_id,pc0,pc1,pc2,pc3,pc4,pc5,pc6,pc7,pc8,pc9,pc10,pc11,pc12,pc13,pc14,pc15,pc16
0,3,1.0,1.0,0.275,0.255,0.927273,0.4,0.260,0.040,0.254,0.976923,1.0,0.072727,0.023077,0.078431,0.750000,0.018182,0.250000
1,5,0.0,0.0,0.430,0.365,0.848837,0.4,1.253,0.210,1.235,0.985634,1.0,0.151163,0.014366,0.054795,0.500000,0.011628,0.250000
2,9,1.0,3.0,1.315,0.825,0.627376,0.9,2.385,0.128,2.270,0.951782,1.0,0.372624,0.048218,0.054545,0.777778,0.003802,0.111111
3,10,0.0,5.0,-1.000,-1.000,-1.000000,-1.0,-1.000,-1.000,-1.000,-1.000000,0.0,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000,-1.000000
4,17,1.0,1.0,0.235,0.160,0.680851,0.1,0.120,0.002,0.104,0.866667,0.0,0.319149,0.133333,0.031250,-1.000000,-1.000000,-1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
857894,3700529,1.0,4.0,0.190,0.160,0.842105,0.1,0.281,0.005,0.274,0.975089,0.0,0.157895,0.024911,0.031250,1.000000,-1.000000,-1.000000
857895,3700532,1.0,3.0,1.710,1.535,0.897661,2.1,3.750,0.687,3.712,0.989867,4.0,0.102339,0.010133,0.068404,0.857143,0.011696,0.190476
857896,3700537,1.0,0.0,2.165,2.060,0.951501,2.4,2.213,0.065,2.186,0.987799,2.0,0.048499,0.012201,0.058252,0.458333,0.004619,0.083333
857897,3700543,1.0,5.0,2.075,1.565,0.754217,1.9,3.292,0.342,3.188,0.968408,4.0,0.245783,0.031592,0.060703,0.736842,0.009639,0.210526


In [None]:
def combine_data(train_data, test_data, non_borrower_data, reference_data, base='train'):
    """Attach the features of each user's reference contact to a base table.

    Parameters
    ----------
    train_data, test_data, non_borrower_data : pandas.DataFrame
        User tables sharing a ``user_id`` column. All three are stacked to
        form the lookup used to resolve reference contacts.
    reference_data : pandas.DataFrame
        Table with ``user_id`` and ``reference_contact`` columns (plus any
        other per-record fields, which are carried through unchanged).
    base : {'train', 'test', 'non_borrower'}, default 'train'
        Which of the three user tables the result is built from.

    Returns
    -------
    pandas.DataFrame
        The base table left-joined with ``reference_data`` on ``user_id``,
        then left-joined with the stacked lookup on ``reference_contact``.
        Lookup columns that clash with base columns get a ``_ref`` suffix.
        A user with several rows in ``reference_data`` appears once per row;
        users without a match keep a single row with missing values.

    Raises
    ------
    ValueError
        If ``base`` is not one of the three accepted names.
    """
    candidates = (
        ('train', train_data),
        ('test', test_data),
        ('non_borrower', non_borrower_data),
    )

    # Every known user, regardless of source, can be someone's reference contact.
    lookup = pd.concat([frame for _, frame in candidates], ignore_index=True)

    base_frame = next((frame for key, frame in candidates if base == key), None)
    if base_frame is None:
        raise ValueError("base must be 'train', 'test', or 'non_borrower'")

    # Step 1: bring in the reference records for each base user.
    with_reference = base_frame.merge(reference_data, how='left', on='user_id')

    # Step 2: resolve each reference_contact to that contact's own features.
    enriched = with_reference.merge(
        lookup,
        how='left',
        left_on='reference_contact',
        right_on='user_id',
        suffixes=('', '_ref'),
    )

    # The contact's id is already available as reference_contact.
    return enriched.drop(columns='user_id_ref')

In [None]:
# Build the merged frame for each population; only `base` differs between calls.
train_merge, test_merge, non_borrower_merge = (
    combine_data(
        train_data=train_feature,
        test_data=test,
        non_borrower_data=non_borrower,
        reference_data=loan,
        base=base_name,
    )
    for base_name in ('train', 'test', 'non_borrower')
)

In [None]:
# Sanity check: the number of distinct users after merging should equal the
# number of rows in train, i.e. the left joins dropped nobody.
train_merge['user_id'].nunique()

857899

In [None]:
# Preview the merged frame. A user appears once per matching row in `loan`,
# and the *_ref columns hold the referenced contact's features (NaN when the
# contact could not be resolved).
train_merge

Unnamed: 0,user_id,pc0,pc1,pc2,pc3,pc4,pc5,pc6,pc7,pc8,...,pc7_ref,pc8_ref,pc9_ref,pc10_ref,pc11_ref,pc12_ref,pc13_ref,pc14_ref,pc15_ref,pc16_ref
0,3,1.0,1.0,0.275,0.255,0.927273,0.4,0.260,0.040,0.254,...,,,,,,,,,,
1,5,0.0,0.0,0.430,0.365,0.848837,0.4,1.253,0.210,1.235,...,-1.000,-1.000,-1.000000,0.0,-1.000000,-1.000000,-1.000000,-1.00,-1.000000,-1.00
2,5,0.0,0.0,0.430,0.365,0.848837,0.4,1.253,0.210,1.235,...,-1.000,-1.000,-1.000000,0.0,-1.000000,-1.000000,-1.000000,-1.00,-1.000000,-1.00
3,9,1.0,3.0,1.315,0.825,0.627376,0.9,2.385,0.128,2.270,...,-1.000,-1.000,-1.000000,0.0,-1.000000,-1.000000,-1.000000,-1.00,-1.000000,-1.00
4,9,1.0,3.0,1.315,0.825,0.627376,0.9,2.385,0.128,2.270,...,-1.000,-1.000,-1.000000,0.0,-1.000000,-1.000000,-1.000000,-1.00,-1.000000,-1.00
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1653552,3700532,1.0,3.0,1.710,1.535,0.897661,2.1,3.750,0.687,3.712,...,-1.000,-1.000,-1.000000,0.0,-1.000000,-1.000000,-1.000000,-1.00,-1.000000,-1.00
1653553,3700532,1.0,3.0,1.710,1.535,0.897661,2.1,3.750,0.687,3.712,...,-1.000,-1.000,-1.000000,0.0,-1.000000,-1.000000,-1.000000,-1.00,-1.000000,-1.00
1653554,3700537,1.0,0.0,2.165,2.060,0.951501,2.4,2.213,0.065,2.186,...,0.072,0.374,0.973958,0.0,0.128571,0.026042,0.065574,0.75,-1.000000,-1.00
1653555,3700543,1.0,5.0,2.075,1.565,0.754217,1.9,3.292,0.342,3.188,...,0.080,0.668,0.995529,1.0,0.037037,0.004471,0.051282,0.75,0.012346,0.25


In [None]:
# Column layout after merging: base features, the fields carried over from
# `loan` (reference_contact, loan_type, ts), then the *_ref features.
train_merge.columns

Index(['user_id', 'pc0', 'pc1', 'pc2', 'pc3', 'pc4', 'pc5', 'pc6', 'pc7',
       'pc8', 'pc9', 'pc10', 'pc11', 'pc12', 'pc13', 'pc14', 'pc15', 'pc16',
       'reference_contact', 'loan_type', 'ts', 'pc0_ref', 'pc1_ref', 'pc2_ref',
       'pc3_ref', 'pc4_ref', 'pc5_ref', 'pc6_ref', 'pc7_ref', 'pc8_ref',
       'pc9_ref', 'pc10_ref', 'pc11_ref', 'pc12_ref', 'pc13_ref', 'pc14_ref',
       'pc15_ref', 'pc16_ref'],
      dtype='object')

In [None]:
def agg_data(data):
    """Collapse a merged frame to one row per ``user_id``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``user_id``, ``reference_contact``, ``ts``,
        ``loan_type``, and the feature columns ``pc0``..``pc16`` together
        with their ``_ref`` counterparts.

    Returns
    -------
    pandas.DataFrame
        One row per ``user_id`` with, in order: ``user_id``, the first
        ``reference_contact``, the median ``ts``, the median ``loan_type``,
        then the mean of every ``pc*`` and ``pc*_ref`` column.
    """
    own_features = [f'pc{idx}' for idx in range(17)]
    contact_features = [f'{name}_ref' for name in own_features]

    # NOTE(review): loan_type is handled as a categorical column later in the
    # notebook; a median can fall between two codes - confirm this is intended.
    aggregations = {
        'reference_contact': 'first',
        'ts': 'median',
        'loan_type': 'median',
    }
    # Every feature column is averaged unless it already has an explicit rule.
    for feature in own_features + contact_features:
        aggregations.setdefault(feature, 'mean')

    per_user = data.groupby('user_id').agg(aggregations)
    return per_user.reset_index()

In [None]:
# Reduce each merged frame to one row per user.
final_train, final_test, final_nonb = (
    agg_data(merged_frame)
    for merged_frame in (train_merge, test_merge, non_borrower_merge)
)

In [None]:
# Column layout after aggregating to one row per user_id.
final_train.columns

Index(['user_id', 'reference_contact', 'ts', 'loan_type', 'pc0', 'pc1', 'pc2',
       'pc3', 'pc4', 'pc5', 'pc6', 'pc7', 'pc8', 'pc9', 'pc10', 'pc11', 'pc12',
       'pc13', 'pc14', 'pc15', 'pc16', 'pc0_ref', 'pc1_ref', 'pc2_ref',
       'pc3_ref', 'pc4_ref', 'pc5_ref', 'pc6_ref', 'pc7_ref', 'pc8_ref',
       'pc9_ref', 'pc10_ref', 'pc11_ref', 'pc12_ref', 'pc13_ref', 'pc14_ref',
       'pc15_ref', 'pc16_ref'],
      dtype='object')

In [None]:
# Attach the target by joining on user_id instead of relying on row position.
# final_train comes out of groupby('user_id'), so its rows are ordered by
# user_id under a fresh 0..N-1 index, whereas the labels carry the original
# row index of train.csv. Assigning the label Series directly would pair the
# two by index and silently mislabel users whenever train.csv is not already
# sorted by user_id.
final_train_ = final_train.merge(
    train_df[['user_id', 'label']],
    on='user_id',
    how='left',
    validate='one_to_one',  # fail loudly if a user_id is duplicated on either side
)

# Every aggregated user originates from train, so each must have a label.
assert final_train_['label'].notna().all(), "some aggregated users have no label"

In [None]:
# Confirm that 'label' is now present as the last column.
final_train_.columns

Index(['user_id', 'reference_contact', 'ts', 'loan_type', 'pc0', 'pc1', 'pc2',
       'pc3', 'pc4', 'pc5', 'pc6', 'pc7', 'pc8', 'pc9', 'pc10', 'pc11', 'pc12',
       'pc13', 'pc14', 'pc15', 'pc16', 'pc0_ref', 'pc1_ref', 'pc2_ref',
       'pc3_ref', 'pc4_ref', 'pc5_ref', 'pc6_ref', 'pc7_ref', 'pc8_ref',
       'pc9_ref', 'pc10_ref', 'pc11_ref', 'pc12_ref', 'pc13_ref', 'pc14_ref',
       'pc15_ref', 'pc16_ref', 'label'],
      dtype='object')

In [None]:
# Target first, then the feature matrix without identifiers or the target.
y = final_train_['label']
X = final_train_.drop(columns=['user_id', 'reference_contact', 'label'])

# 80/20 hold-out split, stratified so both parts keep the same label ratio.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)
tuple(part.shape for part in (X_train, X_test, y_train, y_test))

((686319, 36), (171580, 36), (686319,), (171580,))

In [None]:
# Feature columns that go into the model (identifiers and label removed).
X_train.columns

Index(['ts', 'loan_type', 'pc0', 'pc1', 'pc2', 'pc3', 'pc4', 'pc5', 'pc6',
       'pc7', 'pc8', 'pc9', 'pc10', 'pc11', 'pc12', 'pc13', 'pc14', 'pc15',
       'pc16', 'pc0_ref', 'pc1_ref', 'pc2_ref', 'pc3_ref', 'pc4_ref',
       'pc5_ref', 'pc6_ref', 'pc7_ref', 'pc8_ref', 'pc9_ref', 'pc10_ref',
       'pc11_ref', 'pc12_ref', 'pc13_ref', 'pc14_ref', 'pc15_ref', 'pc16_ref'],
      dtype='object')

In [None]:
# Column roles: pc0..pc16 and their *_ref counterparts are numeric;
# loan_type and ts go through the categorical branch.
num_cols = [f'pc{i}' for i in range(17)] + [f'pc{i}_ref' for i in range(17)]
cat_cols = ['loan_type', 'ts']

# Numeric branch: fill gaps with the column mean, then rescale to [0, 1].
num_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', MinMaxScaler()),
])

# Categorical branch: fill gaps with the most frequent value, then target-encode.
cat_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', TargetEncoder()),
])

# Route each column group to its branch; anything unlisted is dropped.
preprocessor = ColumnTransformer(
    transformers=[
        ('numeric', num_pipe, num_cols),
        ('categoric', cat_pipe, cat_cols),
    ],
    remainder="drop",
    verbose_feature_names_out=False,
)
# Emit DataFrames so feature names survive into the estimator.
preprocessor.set_output(transform="pandas")

In [None]:
# Preprocessing + LightGBM in one pipeline; the positive class is weighted 70x.
# Pipeline.fit returns the fitted pipeline itself, so it can be chained.
model = Pipeline(steps=[
    ('pre', preprocessor),
    ('algo', lgb.LGBMClassifier(class_weight={0: 1, 1: 70}, random_state=42)),
]).fit(X_train, y_train)

# Hard class predictions on the hold-out split.
y_pred = model.predict(X_test)

In [None]:
# Average precision on the hold-out split, computed from the second column
# of predict_proba.
average_precision_score(y_test, model.predict_proba(X_test)[:,1])

0.04129970907486352

In [None]:
# Raw predicted probabilities (second column of predict_proba) for the hold-out rows.
model.predict_proba(X_test)[:,1]

array([0.21770917, 0.25953574, 0.37207232, ..., 0.75791219, 0.27033632,
       0.27647507])