# Child Mind Institute-Problematic Internet Use

In [49]:
#import necessary libraries
import pandas as pd
import numpy as np
import os
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, VotingClassifier, StackingClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.impute import KNNImputer
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import LabelEncoder


In [2]:
#Reading the data
train= pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')
test= pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3960 entries, 0 to 3959
Data columns (total 82 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   id                                      3960 non-null   object 
 1   Basic_Demos-Enroll_Season               3960 non-null   object 
 2   Basic_Demos-Age                         3960 non-null   int64  
 3   Basic_Demos-Sex                         3960 non-null   int64  
 4   CGAS-Season                             2555 non-null   object 
 5   CGAS-CGAS_Score                         2421 non-null   float64
 6   Physical-Season                         3310 non-null   object 
 7   Physical-BMI                            3022 non-null   float64
 8   Physical-Height                         3027 non-null   float64
 9   Physical-Weight                         3076 non-null   float64
 10  Physical-Waist_Circumference            898 non-null    floa

In [69]:
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 59 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   id                                      20 non-null     object 
 1   Basic_Demos-Enroll_Season               20 non-null     object 
 2   Basic_Demos-Age                         20 non-null     int64  
 3   Basic_Demos-Sex                         20 non-null     int64  
 4   CGAS-Season                             10 non-null     object 
 5   CGAS-CGAS_Score                         8 non-null      float64
 6   Physical-Season                         14 non-null     object 
 7   Physical-BMI                            13 non-null     float64
 8   Physical-Height                         13 non-null     float64
 9   Physical-Weight                         13 non-null     float64
 10  Physical-Waist_Circumference            5 non-null      float64


In [3]:
#Drop the rows without labels from the train set
train.dropna(subset=['sii'], inplace=True)

In [4]:
#Relationship between PCIAT total and sii. 
PCIAT_ranges = train.groupby('sii')['PCIAT-PCIAT_Total'].agg(['min', 'max'])
print(PCIAT_ranges)

      min   max
sii            
0.0   0.0  30.0
1.0  31.0  49.0
2.0  50.0  79.0
3.0  80.0  93.0


Since classification for sii are determined by PCIAT total score, we can use PCIAT score as a proxy for sii. PCIAT total score is a continuous variable so it can be used to find correlated features.  

In [5]:
#removing PCIAT columns because they are captured in target columns "sii" or PCIAT total
target_columns=['sii', 'PCIAT-PCIAT_Total']
common_columns = train.columns.intersection(test.columns)
for column in target_columns:
    common_columns = common_columns.append(pd.Index([column]))
train = train[common_columns]
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2736 entries, 0 to 3958
Data columns (total 61 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   id                                      2736 non-null   object 
 1   Basic_Demos-Enroll_Season               2736 non-null   object 
 2   Basic_Demos-Age                         2736 non-null   int64  
 3   Basic_Demos-Sex                         2736 non-null   int64  
 4   CGAS-Season                             2342 non-null   object 
 5   CGAS-CGAS_Score                         2342 non-null   float64
 6   Physical-Season                         2595 non-null   object 
 7   Physical-BMI                            2527 non-null   float64
 8   Physical-Height                         2530 non-null   float64
 9   Physical-Weight                         2572 non-null   float64
 10  Physical-Waist_Circumference            483 non-null    float64
 

In [6]:
#removing columns with more than 50% missing values
missing_percentage = train.isnull().mean() * 100
columns_to_drop = missing_percentage[missing_percentage > 50].index
train = train.drop(columns=columns_to_drop)
test = test.drop(columns=columns_to_drop)
train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2736 entries, 0 to 3958
Data columns (total 50 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   id                                      2736 non-null   object 
 1   Basic_Demos-Enroll_Season               2736 non-null   object 
 2   Basic_Demos-Age                         2736 non-null   int64  
 3   Basic_Demos-Sex                         2736 non-null   int64  
 4   CGAS-Season                             2342 non-null   object 
 5   CGAS-CGAS_Score                         2342 non-null   float64
 6   Physical-Season                         2595 non-null   object 
 7   Physical-BMI                            2527 non-null   float64
 8   Physical-Height                         2530 non-null   float64
 9   Physical-Weight                         2572 non-null   float64
 10  Physical-Diastolic_BP                   2478 non-null   float64
 

In [7]:
#Preparing the submission file
submissions= test[['id']]

In [8]:
#changing data types
train['sii']= train['sii'].astype(int)
train['Basic_Demos-Age']= train['Basic_Demos-Age'].astype(float)
test['Basic_Demos-Age']= test['Basic_Demos-Age'].astype(float)

#Label encoding of categorical variables 
columns_to_encode = ['Physical-Season', 'FGC-Season', 'BIA-Season']
label_encoder = LabelEncoder()
for column in columns_to_encode:
    train[column] = label_encoder.fit_transform(train[column])

for column in columns_to_encode:
    test[column] = label_encoder.fit_transform(test[column])


In [9]:
# Selecting only numerical columns to create a correlaiton matrix with the target  variable 
numerical_columns = train.select_dtypes(include=['number']).columns

corr_matrix = train[numerical_columns].corr()

target_correlation = corr_matrix[['PCIAT-PCIAT_Total']]
display(target_correlation.style.background_gradient(cmap='coolwarm'))

Unnamed: 0,PCIAT-PCIAT_Total
Basic_Demos-Age,0.409559
Basic_Demos-Sex,-0.093648
CGAS-CGAS_Score,-0.070542
Physical-BMI,0.240858
Physical-Height,0.420765
Physical-Weight,0.353048
Physical-Diastolic_BP,0.069321
Physical-HeartRate,-0.037594
Physical-Systolic_BP,0.147081
FGC-FGC_CU,0.287494


In [10]:
#Filtering the dataset to only include features having correlation of higher than 0.1 or less than -0.1 with target variable. 
filtered_columns = target_correlation[abs(target_correlation['PCIAT-PCIAT_Total']) >=0.1].index.tolist()
filtered_train = train[filtered_columns+ ['id']]
filtered_test = test[[col for col in filtered_columns if col in test.columns]+['id']]

# Selecting only numerical columns to create a correlaiton matrix with the target  variable 
# Displaying final correlation between numerical features and target variable
numerical_columns = filtered_train.select_dtypes(include=['number']).columns
corr_matrix = filtered_train[numerical_columns].corr()
target_correlation = corr_matrix[['PCIAT-PCIAT_Total']]
display(target_correlation.style.background_gradient(cmap='coolwarm'))

Unnamed: 0,PCIAT-PCIAT_Total
Basic_Demos-Age,0.409559
Physical-BMI,0.240858
Physical-Height,0.420765
Physical-Weight,0.353048
Physical-Systolic_BP,0.147081
FGC-FGC_CU,0.287494
FGC-FGC_PU,0.196006
FGC-FGC_SRL_Zone,-0.14885
FGC-FGC_SRR_Zone,-0.109682
FGC-FGC_TL,0.136696


# Autoencoder for data compression and feature extraction from Actigraphy Files¶
process_file function : Reads the Parquet file and converts it into DataFrame format.and Remove the unnecessary 'step' column and calculate statistical information for the remaining columns.

load_time_series function : Processes all files in the specified directory and extracts statistical information for each file. Improve processing speed by performing parallel processing using ThreadPoolExecutor.

AutoEncoder: Trains the model to minimize the difference between the reconstructed data and the original data through the encoder and decoder.

perform_autoencoder : Trains a model using Huber Loss as the loss function and uses the trained encoder to transform the input data into a low-dimensional latent representation.

In [12]:
!pip install /kaggle/input/tabnett/pytorch_tabnet-4.1.0-py3-none-any.whl

Processing /kaggle/input/tabnett/pytorch_tabnet-4.1.0-py3-none-any.whl
Installing collected packages: pytorch-tabnet
Successfully installed pytorch-tabnet-4.1.0


In [14]:
# Import modules for Autoencoders
from pytorch_tabnet.tab_model import TabNetRegressor
from pytorch_tabnet.callbacks import Callback

from keras.models import Model
from keras.layers import Input, Dense
from keras.optimizers import Adam

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import pytorch_tabnet

In [15]:
def process_file(filename, dirname):
    df = pd.read_parquet(os.path.join(dirname, filename, 'part-0.parquet'))
    df.drop('step', axis=1, inplace=True)
    return df.describe().values.reshape(-1), filename.split('=')[1]

def load_time_series(dirname) -> pd.DataFrame:
    ids = os.listdir(dirname)
    
    with ThreadPoolExecutor() as executor:
        results = list(tqdm(executor.map(lambda fname: process_file(fname, dirname), ids), total=len(ids)))
    
    stats, indexes = zip(*results)
    
    df = pd.DataFrame(stats, columns=[f"stat_{i}" for i in range(len(stats[0]))])
    df['id'] = indexes
    return df

In [16]:
class AutoEncoder(nn.Module):
    def __init__(self, input_dim, encoding_dim):
        super(AutoEncoder, self).__init__()
        self.encoder = nn.Sequential(
            nn.Linear(input_dim, encoding_dim*3),
            nn.LeakyReLU(0.2),
            nn.Linear(encoding_dim*3, encoding_dim*2),
            nn.LeakyReLU(0.2),
            nn.Linear(encoding_dim*2, encoding_dim),
            nn.LeakyReLU(0.2)
        )
        self.decoder = nn.Sequential(
            nn.Linear(encoding_dim, input_dim*2),
            nn.LeakyReLU(0.2),
            nn.Linear(input_dim*2, input_dim*3),
            nn.LeakyReLU(0.2),
            nn.Linear(input_dim*3, input_dim),
            nn.Sigmoid()
        )

    def forward(self, x):
        encoded = self.encoder(x)
        decoded = self.decoder(encoded)
        return decoded

In [17]:
def perform_autoencoder(df, encoding_dim=50, epochs=50, batch_size=32):
    scaler = StandardScaler()
    df_scaled = scaler.fit_transform(df)

    data_tensor = torch.FloatTensor(df_scaled)

    input_dim = data_tensor.shape[1]
    autoencoder = AutoEncoder(input_dim, encoding_dim)

    criterion = F.smooth_l1_loss
    optimizer = optim.Adam(autoencoder.parameters())

    for epoch in range(epochs):
        for i in range(0, len(data_tensor), batch_size):
            batch = data_tensor[i : i + batch_size]
            optimizer.zero_grad()
            reconstructed = autoencoder(batch)
            loss = criterion(reconstructed, batch)
            loss.backward()
            optimizer.step()

        if (epoch + 1) % 10 == 0:
            print(f'Epoch [{epoch + 1}/{epochs}], Loss: {loss.item():.4f}]')

    with torch.no_grad():
        encoded_data = autoencoder.encoder(data_tensor).numpy()

    df_encoded = pd.DataFrame(encoded_data, columns=[f'Enc_{i + 1}' for i in range(encoded_data.shape[1])])

    return df_encoded

In [18]:
#Read in the Parquet files
train_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_train.parquet")
test_ts = load_time_series("/kaggle/input/child-mind-institute-problematic-internet-use/series_test.parquet")

df_train = train_ts.drop('id', axis=1)
df_test = test_ts.drop('id', axis=1)

#Perform autoencoder to extract features from Parquet files
train_ts_encoded = perform_autoencoder(df_train, encoding_dim=60, epochs=100, batch_size=32)
test_ts_encoded = perform_autoencoder(df_test, encoding_dim=60, epochs=100, batch_size=32)

time_series_cols = train_ts_encoded.columns.tolist()

train_ts_encoded["id"] = train_ts["id"]
test_ts_encoded["id"] = test_ts["id"]



100%|██████████| 996/996 [01:26<00:00, 11.50it/s]
100%|██████████| 2/2 [00:00<00:00,  9.21it/s]


Epoch [10/100], Loss: 0.4779]
Epoch [20/100], Loss: 0.4625]
Epoch [30/100], Loss: 0.4537]
Epoch [40/100], Loss: 0.4533]
Epoch [50/100], Loss: 0.4500]
Epoch [60/100], Loss: 0.4501]
Epoch [70/100], Loss: 0.4515]
Epoch [80/100], Loss: 0.4355]
Epoch [90/100], Loss: 0.4353]
Epoch [100/100], Loss: 0.4340]
Epoch [10/100], Loss: 0.4070]
Epoch [20/100], Loss: 0.2135]
Epoch [30/100], Loss: 0.2135]
Epoch [40/100], Loss: 0.2135]
Epoch [50/100], Loss: 0.2135]
Epoch [60/100], Loss: 0.2135]
Epoch [70/100], Loss: 0.2135]
Epoch [80/100], Loss: 0.2135]
Epoch [90/100], Loss: 0.2135]
Epoch [100/100], Loss: 0.2135]


In [20]:
test_ts_encoded.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2 entries, 0 to 1
Data columns (total 61 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Enc_1   2 non-null      float32
 1   Enc_2   2 non-null      float32
 2   Enc_3   2 non-null      float32
 3   Enc_4   2 non-null      float32
 4   Enc_5   2 non-null      float32
 5   Enc_6   2 non-null      float32
 6   Enc_7   2 non-null      float32
 7   Enc_8   2 non-null      float32
 8   Enc_9   2 non-null      float32
 9   Enc_10  2 non-null      float32
 10  Enc_11  2 non-null      float32
 11  Enc_12  2 non-null      float32
 12  Enc_13  2 non-null      float32
 13  Enc_14  2 non-null      float32
 14  Enc_15  2 non-null      float32
 15  Enc_16  2 non-null      float32
 16  Enc_17  2 non-null      float32
 17  Enc_18  2 non-null      float32
 18  Enc_19  2 non-null      float32
 19  Enc_20  2 non-null      float32
 20  Enc_21  2 non-null      float32
 21  Enc_22  2 non-null      float32
 22  Enc_23

In [21]:
#Merging the tabular data with extracted features from Actigraphy files
filtered_train = pd.merge(filtered_train, train_ts_encoded, how="left", on='id')
filtered_test = pd.merge(filtered_test, test_ts_encoded, how="left", on='id')

In [39]:
filtered_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2736 entries, 0 to 2735
Data columns (total 79 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Basic_Demos-Age                         2736 non-null   float64
 1   Physical-BMI                            2527 non-null   float64
 2   Physical-Height                         2530 non-null   float64
 3   Physical-Weight                         2572 non-null   float64
 4   Physical-Systolic_BP                    2478 non-null   float64
 5   FGC-FGC_CU                              1919 non-null   float64
 6   FGC-FGC_PU                              1909 non-null   float64
 7   FGC-FGC_SRL_Zone                        1877 non-null   float64
 8   FGC-FGC_SRR_Zone                        1879 non-null   float64
 9   FGC-FGC_TL                              1919 non-null   float64
 10  BIA-BIA_BMI                             1813 non-null   floa

In [22]:
#Creating train and validation split
filtered_test=filtered_test.drop(['id'], axis=1)
X=filtered_train.drop(['sii', 'PCIAT-PCIAT_Total', 'id'], axis=1)
y=filtered_train['sii']
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

In [24]:
#Standardize the data
scaler = StandardScaler()
scaler.fit(X_train)
X_train_standardized=scaler.transform(X_train)
X_train=pd.DataFrame(X_train_standardized, columns=X_train.columns)

X_val_standardized=scaler.transform(X_val)
X_val=pd.DataFrame(X_val_standardized, columns=X_train.columns)

filtered_test_standardized=scaler.transform(filtered_test)
filtered_test=pd.DataFrame(filtered_test_standardized, columns=X_train.columns)


In [25]:
# Imputation for missing values using KNN 
imputer = KNNImputer(n_neighbors=15)
imputer.fit(X_train)
X_train_imputed = imputer.transform(X_train)
X_train_imputed = pd.DataFrame(X_train_imputed, columns=X_train.columns)

X_val_imputed = imputer.transform(X_val)
X_val_imputed = pd.DataFrame(X_val_imputed, columns=X_train.columns)

X_test_imputed = imputer.transform(filtered_test)
X_test_imputed = pd.DataFrame(X_test_imputed, columns=X_train.columns)

In [26]:
X_train_imputed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2188 entries, 0 to 2187
Data columns (total 76 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Basic_Demos-Age                         2188 non-null   float64
 1   Physical-BMI                            2188 non-null   float64
 2   Physical-Height                         2188 non-null   float64
 3   Physical-Weight                         2188 non-null   float64
 4   Physical-Systolic_BP                    2188 non-null   float64
 5   FGC-FGC_CU                              2188 non-null   float64
 6   FGC-FGC_PU                              2188 non-null   float64
 7   FGC-FGC_SRL_Zone                        2188 non-null   float64
 8   FGC-FGC_SRR_Zone                        2188 non-null   float64
 9   FGC-FGC_TL                              2188 non-null   float64
 10  BIA-BIA_BMI                             2188 non-null   floa

# PCA for dimensionality reduction of physical measures, Bio-electric Impedance Analysis, and FitnessGram Vitals 

In [27]:
class PCATransformer:
    def __init__(self, n_components):
        self.n_components = n_components
        self.scaler = StandardScaler()
        self.pca = PCA(n_components=self.n_components)

    def fit(self, df, columns):
        """Fit the PCA model to the specified columns of the DataFrame."""
        # Select the specified columns
        df_subset = df[columns].copy()

        # Standardize the data
        self.scaled_data = self.scaler.fit_transform(df_subset)

        # Fit PCA
        self.pca.fit(self.scaled_data)

    def transform(self, df, columns):
        """Transform the DataFrame using the fitted PCA model."""
        df_subset = df[columns].copy()

        # Standardize the data
        scaled_data = self.scaler.transform(df_subset)

        # Transform using PCA
        pca_result = self.pca.transform(scaled_data)

        # Create a DataFrame for the PCA results
        pca_df = pd.DataFrame(data=pca_result, columns=[f'{columns[0].split("-")[0]}-PC {i + 1}' for i in range(self.n_components)])

        # Print explained variances
        explained_variances = self.pca.explained_variance_ratio_
        print(f'Explained Variances for {columns[0].split("-")[0]}:')
        for i, variance in enumerate(explained_variances, start=1):
            print(f'Principal Component {i}: {variance:.4f}')

        # Concatenate the PCA results back to the original DataFrame
        return pd.concat([df.reset_index(drop=True), pca_df.reset_index(drop=True)], axis=1)


In [None]:
p = PCATransformer(n_components=2)

physical_columns=['Physical-BMI', 'Physical-Height', 'Physical-Weight', 'Physical-Systolic_BP']

BIA_columns = ['BIA-BIA_FFMI', 'BIA-BIA_BMI', 'BIA-BIA_Frame_num']

FGC_columns = ['FGC-FGC_CU', 'FGC-FGC_PU', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR_Zone', 'FGC-FGC_TL']

# Apply PCA for physical columns
p.fit(X_train_imputed, physical_columns)
X_train_imputed = p.transform(X_train_imputed, physical_columns)
X_val_imputed = p.transform(X_val_imputed, physical_columns)
X_test_imputed = p.transform(X_test_imputed, physical_columns)

# Apply PCA for BIA columns
p.fit(X_train_imputed, BIA_columns)
X_train_imputed = p.transform(X_train_imputed, BIA_columns)
X_val_imputed = p.transform(X_val_imputed, BIA_columns)
X_test_imputed = p.transform(X_test_imputed, BIA_columns)

# Apply PCA for FGC columns
p.fit(X_train_imputed, FGC_columns)
X_train_imputed = p.transform(X_train_imputed, FGC_columns)
X_val_imputed = p.transform(X_val_imputed, FGC_columns)
X_test_imputed = p.transform(X_test_imputed, FGC_columns)


In [29]:
X_val_imputed.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 548 entries, 0 to 547
Data columns (total 82 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Basic_Demos-Age                         548 non-null    float64
 1   Physical-BMI                            548 non-null    float64
 2   Physical-Height                         548 non-null    float64
 3   Physical-Weight                         548 non-null    float64
 4   Physical-Systolic_BP                    548 non-null    float64
 5   FGC-FGC_CU                              548 non-null    float64
 6   FGC-FGC_PU                              548 non-null    float64
 7   FGC-FGC_SRL_Zone                        548 non-null    float64
 8   FGC-FGC_SRR_Zone                        548 non-null    float64
 9   FGC-FGC_TL                              548 non-null    float64
 10  BIA-BIA_BMI                             548 non-null    float6

In [50]:
#Excluding features that went through PCA dimensionality reduction and keeping the first two PCs from each 
excluded_features=physical_columns+BIA_columns+FGC_columns
X_train_df = X_train_imputed.drop(columns=excluded_features)
X_val_df= X_val_imputed.drop(columns=excluded_features)
X_test_df= X_test_imputed.drop(columns=excluded_features)


# Emsemble learning using stacking approach 

In [47]:
# Create individual models
rf_model = RandomForestClassifier(max_depth=4, min_samples_leaf=7,min_samples_split=4, oob_score=True )
gb_model = GradientBoostingClassifier(n_estimators=70, learning_rate=0.06, max_depth=2, random_state=42)
lgbm_model = LGBMClassifier(n_estimators=100,force_col_wise=True, random_state=42)
catboost_model = CatBoostClassifier(iterations=100, random_seed=42, verbose=0)

# Define the ensemble model (stacking)
base_learners = [
    ('rf', rf_model),
    ('gb', gb_model),
    ('lgbm', lgbm_model),
    ('catboost', catboost_model)
]

model = StackingClassifier(estimators=base_learners, final_estimator=LogisticRegression(max_iter=1000))


In [48]:
#Training the model
model.fit(X_train_df,y_train )

#Evaluation of the model
y_train_pred = model.predict(X_train_df)
y_val_pred = model.predict(X_val_df)

train_score=accuracy_score(y_train, y_train_pred)
val_score=accuracy_score(y_val, y_val_pred)  
print(train_score)
print(val_score)

[LightGBM] [Info] Total Bins 17058
[LightGBM] [Info] Number of data points in the train set: 2188, number of used features: 70
[LightGBM] [Info] Start training from score -0.540042
[LightGBM] [Info] Start training from score -1.320842
[LightGBM] [Info] Start training from score -1.980316
[LightGBM] [Info] Start training from score -4.394906
[LightGBM] [Info] Total Bins 17036
[LightGBM] [Info] Number of data points in the train set: 1750, number of used features: 70
[LightGBM] [Info] Start training from score -0.539813
[LightGBM] [Info] Start training from score -1.321042
[LightGBM] [Info] Start training from score -1.982574
[LightGBM] [Info] Start training from score -4.376329
[LightGBM] [Info] Total Bins 17040
[LightGBM] [Info] Number of data points in the train set: 1750, number of used features: 70
[LightGBM] [Info] Start training from score -0.539813
[LightGBM] [Info] Start training from score -1.318903
[LightGBM] [Info] Start training from score -1.982574
[LightGBM] [Info] Start t

In [45]:
# Generating the model output
predictions= model.predict(X_test_df)
submissions['sii']= predictions
submissions.to_csv('submission.csv', index=None)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submissions['sii']= predictions


In [46]:
submissions.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20 entries, 0 to 19
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   id      20 non-null     object
 1   sii     20 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 448.0+ bytes
