## Importing biblio 

In [1]:
import numpy as np
import pandas as pd
import warnings
import matplotlib.pyplot as plt
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.linear_model import LinearRegression
from lightgbm import LGBMRegressor
from sklearn.metrics import mean_squared_error
from scipy.stats import normaltest, norm, rankdata
from sklearn.decomposition import PCA
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import mutual_info_regression
from sklearn.preprocessing import StandardScaler
warnings.filterwarnings("ignore")

## Data importation

In [3]:
train_data=pd.read_csv('train.csv')
test_data=pd.read_csv('test.csv')

In [4]:
train_data=train_data.drop(['language','author'],axis=1)
test_data=test_data.drop(['language','author'],axis=1)

## Cleaning 

In [11]:
## data is my dataset either train validation or test
## threshold is the allowed amount of null values
def clean_data(data,threshold):
    col_to_be_removed=[]
    row_to_be_removed=[]
    n=data.shape[0]
    columns=data.columns
    for column in columns :
        if data[column].isnull().sum()/n > threshold:
            col_to_be_removed.append(column)
        else:
            row_to_be_removed.append(column)
    data_cleaned=data.drop(col_to_be_removed,axis=1)
    data_cleaned.dropna(subset=row_to_be_removed)
    return data_cleaned

In [13]:
# Transforming boolean variables to integers for interpretability
def bool_transform(data):
    data_cleaned=clean_data(data,threshold=0.2)
    boolean_columns = data_cleaned.select_dtypes(include='bool').columns
    data_cleaned[boolean_columns] = data_cleaned[boolean_columns].astype(int)
    return data_cleaned

## Feature selection using mutual information

In [10]:
def select_features_by_mi_threshold(X, y):
   
    mi_scores = mutual_info_regression(X, y, random_state=42)
    max_scores=np.max(mi_scores)
    min_scores=np.min(mi_scores)
    threshold=0.005*(max_scores-min_scores)+ min_scores
    # Create a DataFrame of MI scores
    mi_scores_df = pd.DataFrame({
        'Feature': X.columns,
        'Mutual Information': mi_scores
    }).sort_values(by='Mutual Information', ascending=False).reset_index(drop=True)
    # Filter features by threshold
    selected_features = mi_scores_df[mi_scores_df['Mutual Information'] >= threshold]
    selected_feature_names = selected_features['Feature'].tolist()
    # Filter input data
    X_selected = X[selected_feature_names]
    return selected_feature_names, X_selected

## Finetuning Xgboost

In [12]:
def finetune_xgboost(train_input, train_output):
    xgb = XGBRegressor(objective='reg:squarederror', random_state=42)
    # Définition de la grille des hyperparamètres
    param_grid = {
        'max_depth': [4, 8, 10],
        'n_estimators': [50, 100, 200],
        'colsample_bytree': [0.4,0.6],
    }
    # Configuration du GridSearchCV
    grid_search = GridSearchCV(
        estimator=xgb,
        param_grid=param_grid,
        cv=3,  # 3-fold cross-validation
        scoring='neg_mean_squared_error',
        verbose=1,
        n_jobs=-1 
    )
    # Exécution de la recherche
    grid_search.fit(train_input, train_output)
    best_params=grid_search.best_params_
    best_model=grid_search.best_estimator_
    return best_params, best_model

## Finetune LGBM

In [14]:
## Finetuning LGBM

def finetune_LGBM(train_input, train_output):
    # Initialize LightGBM Regressor
    lgbm = LGBMRegressor(random_state=42)

    # Define the parameter grid for GridSearch
    param_grid = {
    'learning_rate': [0.01, 0.1],
    'n_estimators': [100, 200,300],
    'max_depth': [10,12],
    'num_leaves': [20, 50],
    'min_data_in_leaf': [10, 20],
    'force_col_wise' : [True],
    'colsample_bytree': [0.5],
    }
    # Wrap the model with GridSearchCV
    grid_search = GridSearchCV(
        estimator=lgbm,
        param_grid=param_grid,
        cv=3,  # 3-fold cross-validation
        scoring='neg_mean_squared_error',  # Evaluation metric
        verbose=1,
        n_jobs=-1
    )
    # Perform the grid search
    grid_search.fit(train_input, train_output)
    best_params=grid_search.best_params_
    best_model=grid_search.best_estimator_
    return best_params, best_model

In [None]:
## Finetuni

## Meta model construction

In [16]:
def meta_model_and_predict(train_input, train_output, test_data):
    best_par_xgb, best_xgb=finetune_xgboost(train_input, train_output)
    opti_xgb=XGBRegressor().set_params(**best_par_xgb)
    best_par_lgbm, best_lgbm=finetune_LGBM(train_input, train_output)
    opti_lgbm= LGBMRegressor(**best_par_lgbm)
    meta_mod= StackingRegressor(estimators=[('lgbm', opti_lgbm), ('xgb',opti_xgb)], final_estimator=LinearRegression(), cv=2)
    meta_mod.fit(train_input, train_output)
    predictions_test=meta_mod.predict(test_data)
    predictions_test=np.maximum(predictions_test,0)
    predictions_train=meta_mod.predict(train_input)
    predictions_train=np.maximum(predictions_train,0)
    rmse_train=np.sqrt(mean_squared_error(predictions_train, train_output))
    return rmse_train, predictions_test

## Training and Testing 

In [15]:
## Cleaning data 
train_data_cleaned=bool_transform(train_data)
test_data_cleaned=bool_transform(test_data)

In [17]:
## Separation  of inputs and outputs 
train_output=train_data_cleaned['engagement']
train_inputs=train_data_cleaned.drop(columns=['engagement'])

In [20]:
## Feature selection
selected_feature_names, train_inputs_best=select_features_by_mi_threshold(train_inputs, train_output)
test_data_best=test_data[selected_feature_names]

In [21]:
rmse_train, predictions_test=meta_model_and_predict(train_inputs_best, train_output, test_data_best)

Fitting 3 folds for each of 18 candidates, totalling 54 fits
Fitting 3 folds for each of 48 candidates, totalling 144 fits
[LightGBM] [Info] Total Bins 201872
[LightGBM] [Info] Number of data points in the train set: 4154, number of used features: 798
[LightGBM] [Info] Start training from score 18726.554887
[LightGBM] [Info] Total Bins 201872
[LightGBM] [Info] Number of data points in the train set: 4154, number of used features: 798
[LightGBM] [Info] Start training from score 18726.554887
[LightGBM] [Info] Total Bins 201853
[LightGBM] [Info] Number of data points in the train set: 2077, number of used features: 798
[LightGBM] [Info] Start training from score 18871.163216
[LightGBM] [Info] Total Bins 201858
[LightGBM] [Info] Number of data points in the train set: 2077, number of used features: 798
[LightGBM] [Info] Start training from score 18581.946558


In [22]:
print(rmse_train)

34055.48163959116


In [23]:
print(predictions_test)

[23737.00789615  7127.10200932 18895.11114622 ...     0.
 20371.28059234 36660.02590464]


In [43]:
test_data.rename(columns={'Id':'id'}, inplace=True)
test_data['engagement']=predictions_test
submission=test_data[['id','engagement']]
submission.to_csv('Thirtenth_pred.csv', index=False)

In [19]:
import pandas as pd
submi_1=pd.read_csv('Twelvth_pred.csv')
submi_2=pd.read_csv('Sixth_pred.csv')
submi_3=pd.read_csv('Fifth_pred.csv')

In [21]:
submission=(submi_1 + submi_2 +submi_3)/3

In [25]:
print(np.mean(submission['engagement']))
print(np.var(submission['engagement']))

20223.31423596601
719829432.2029196


In [43]:
print(np.mean(train_output))
print(np.var(train_output))

18726.554886856044
4124688669.8723745


In [31]:
submission['engagement']=(submission['engagement']-1000)
submission['engagement']=np.maximum(submission['engagement'],0)

In [33]:
test_data.rename(columns={'Id':'id'}, inplace=True)
test_data['engagement']=submission['engagement']
submi=test_data[['id','engagement']]
submi.to_csv('mi_pred.csv', index=False)