# This notebook provides an example of classification for this competition

Import dependencies

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.preprocessing import LabelEncoder, MinMaxScaler, StandardScaler
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import VotingClassifier, BaggingClassifier

# utilities
"""import os
os.append('/kaggle/usr/lib/utilities_plot/')
import uilities_plot"""

# models
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier


In [5]:
# Locations of the competition CSV files, relative to the working directory.
path_train = "playground-series-s4e7/train.csv"
path_test = "playground-series-s4e7/test.csv"

# Read the train split first, then the test split, and preview the former.
train, test = (pd.read_csv(csv_path) for csv_path in (path_train, path_test))
print(train.shape)
train.head()

(11504798, 12)


Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,0,Male,21,1,35.0,0,1-2 Year,Yes,65101.0,124.0,187,0
1,1,Male,43,1,28.0,0,> 2 Years,Yes,58911.0,26.0,288,1
2,2,Female,25,1,14.0,1,< 1 Year,No,38043.0,152.0,254,0
3,3,Female,35,1,1.0,0,1-2 Year,Yes,2630.0,156.0,76,0
4,4,Female,36,1,15.0,1,1-2 Year,No,31951.0,152.0,294,0


Get target, values, preprocessing...

In [13]:
# Per-column count of missing values in the training frame.
train.isnull().sum()

id                      0
Gender                  0
Age                     0
Driving_License         0
Region_Code             0
Previously_Insured      0
Vehicle_Age             0
Vehicle_Damage          0
Annual_Premium          0
Policy_Sales_Channel    0
Vintage                 0
Response                0
dtype: int64

In [6]:
# Label-encode every string-valued feature column, in place on both frames.
# A separate encoder is fitted per column and kept in `encoders`, so each
# column's mapping stays available afterwards (e.g. for `inverse_transform`).
encoders = {}

for col in train.columns:
    # `.iloc[0]` reads the first row by position, so the string check does not
    # depend on the frame still carrying an index label equal to 0.
    if col != "Response" and isinstance(train[col].iloc[0], str):
        print("encode :", col)
        encode = LabelEncoder()
        train[col] = encode.fit_transform(train[col])
        # Apply the mapping learned on train; `transform` raises ValueError if
        # test holds a category that never appeared in train.
        test[col] = encode.transform(test[col])
        encoders[col] = encode

        

encode : Gender
encode : Vehicle_Age
encode : Vehicle_Damage


# Prepare for training

In [8]:
# Keep the test identifiers aside; they are needed for the submission file.
id_test = test["id"]

# Feature matrices: the identifier and the target are not model inputs.
prepro_train = train.drop(columns=["id", "Response"])
prepro_test = test.drop(columns=["id"])

# Hold out 30% of the labelled rows for validation, with a fixed seed.
x_train, x_val, y_train, y_val = train_test_split(
    prepro_train,
    train["Response"],
    test_size=0.3,
    random_state=42,
)
print(x_train.shape)

(8053358, 10)


# Training on a standard classification model

In [10]:
# Switch for the optional 5-fold cross-validation at the end of this cell.
cross_val = False

# Baseline: XGBoost with library defaults, scored on the hold-out split.
XGB = XGBClassifier()
XGB.fit(x_train, y_train)

predictions_xgb = XGB.predict(x_val)
val_accuracy = accuracy_score(y_val, predictions_xgb)

print("Accuracy : ",  val_accuracy)

if cross_val:
    # Use cross-validation to evaluate the model's performance
    scores = cross_val_score(XGB, x_train, y_train, scoring='accuracy', cv=5)
    print("Cross-validation accuracy scores: ", scores)
    print("Mean cross-validation accuracy: ", scores.mean())

Accuracy :  0.8799770530561157


# Bagging

In [18]:
from sklearn.ensemble import BaggingClassifier

def bag_model(model,
              x_train: pd.DataFrame, y_train: pd.Series,
              x_val: pd.DataFrame, y_val: pd.Series,
              cross_val: bool,
              n_estimators: int = 10,
              random_state: int = 0) -> "BaggingClassifier":
    """
        Fit a bagging ensemble around `model` and print its validation accuracy.

        Args:
        ------
            model (estimator):
                the unfitted classification model we want to bag

            x_train (array-like, e.g. DataFrame):
                training features

            y_train (array-like, e.g. Series):
                training labels

            x_val (array-like, e.g. DataFrame):
                validation features

            y_val (array-like, e.g. Series):
                validation labels

            cross_val (bool):
                whether to additionally print 5-fold cross-validation
                accuracy on the training data (can be slow, so it is optional)

            n_estimators (int):
                number of base estimators in the ensemble (default 10)

            random_state (int):
                seed passed to BaggingClassifier (default 0)

        Returns:
        ------
            BaggingClassifier:
                the ensemble fitted on (x_train, y_train)
    """

    bag = BaggingClassifier(estimator=model,
                            n_estimators=n_estimators,
                            random_state=random_state)
    bag.fit(x_train, y_train)

    predictions_bag = bag.predict(x_val)

    print("accuracy bag: ",  accuracy_score(y_val, predictions_bag))

    if cross_val:
        # Use cross-validation to evaluate the model's performance
        scores = cross_val_score(bag, x_train, y_train, cv=5, scoring='accuracy')
        print("Cross-validation accuracy scores: ", scores)
        print("Mean cross-validation accuracy: ", scores.mean())
    return bag
# Example: bagging with XGBoost as the base estimator (no cross-validation).
bag = bag_model(
    model=XGBClassifier(random_state=42),
    x_train=x_train,
    y_train=y_train,
    x_val=x_val,
    y_val=y_val,
    cross_val=False,
)

accuracy bag:  0.8800564402104629


# Voting

In [21]:
def voting_models(models,
              x_train: pd.DataFrame, y_train: pd.Series,
              x_val: pd.DataFrame, y_val: pd.Series,
              cross_val: bool,
              voting: str = 'soft') -> "VotingClassifier":
    """
        Fit a voting ensemble over `models` and print its validation accuracy.

        Args:
        ------
            models (list):
                list of (name, estimator) pairs taking part in the vote

            x_train (array-like, e.g. DataFrame):
                training features

            y_train (array-like, e.g. Series):
                training labels

            x_val (array-like, e.g. DataFrame):
                validation features

            y_val (array-like, e.g. Series):
                validation labels

            cross_val (bool):
                whether to additionally print 5-fold cross-validation
                accuracy on the training data (can be slow, so it is optional)

            voting (str):
                voting rule passed to VotingClassifier (default 'soft')

        Returns:
        ------
            VotingClassifier:
                the ensemble fitted on (x_train, y_train)
    """

    voting_clf = VotingClassifier(estimators=models,
                                  voting=voting)

    voting_clf.fit(x_train, y_train)

    predictions_boost = voting_clf.predict(x_val)

    print("accuracy boost: ",  accuracy_score(y_val, predictions_boost))

    if cross_val:
        # Use cross-validation to evaluate the model's performance
        scores = cross_val_score(voting_clf, x_train, y_train, cv=5, scoring='accuracy')
        print("Cross-validation accuracy scores: ", scores)
        print("Mean cross-validation accuracy: ", scores.mean())

    return voting_clf
# ex :
# Build the three base estimators in this cell so it runs on a fresh kernel
# (no earlier cell defines `xgb`, `cat` or `lgb`).
# NOTE(review): the hyper-parameters originally used are not shown anywhere in
# this notebook; library defaults with a fixed seed are assumed here — confirm.
xgb = XGBClassifier(random_state=42)
cat = CatBoostClassifier(random_state=42, verbose=0)
lgb = LGBMClassifier(random_state=42)

vot = voting_models([('xgb', xgb), ('cat', cat), ('lgb', lgb)], x_train, y_train, x_val, y_val, False)

[LightGBM] [Info] Number of positive: 990701, number of negative: 7062657
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.089378 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 739
[LightGBM] [Info] Number of data points in the train set: 8053358, number of used features: 10
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.123017 -> initscore=-1.964164
[LightGBM] [Info] Start training from score -1.964164
accuracy boost:  0.8801891384465614


# Submission

In [22]:
def submission(model, test, Id, path='submission.csv'):
    """
        Write the model's predicted probabilities to a submission CSV.

        Args:
        ------
            model (fitted classifier):
                any object exposing `predict_proba`

            test (array-like, e.g. DataFrame):
                features to predict on

            Id (array-like):
                identifiers written to the `id` column, one per row of `test`

            path (str):
                destination of the CSV file (default 'submission.csv')

        Returns:
        ------
            None
    """
    pred = model.predict_proba(test)

    # Column 1 of `predict_proba` is used as the `Response` score.
    # NOTE(review): assumes the classes are ordered [0, 1], so that this
    # column is P(Response = 1) — confirm against `model.classes_`.
    submission_df = pd.DataFrame({'id': Id,
                                  'Response': pred[:, 1]})

    # Save submission to a CSV file
    submission_df.to_csv(path, index=False)
    
# Score the test features with the voting ensemble and write the CSV.
submission(model=vot, test=prepro_test, Id=id_test)