In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import mutual_info_classif
import lightgbm as lgb
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import ConfusionMatrixDisplay, cohen_kappa_score

In [2]:
train = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/train.csv')
test = pd.read_csv('/kaggle/input/child-mind-institute-problematic-internet-use/test.csv')

In [3]:
train.head()

Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,...,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii
0,00008ff9,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,...,4.0,2.0,4.0,55.0,,,,Fall,3.0,2.0
1,000fd460,Summer,9,0,,,Fall,14.03559,48.0,46.0,...,0.0,0.0,0.0,0.0,Fall,46.0,64.0,Summer,0.0,0.0
2,00105258,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,...,2.0,1.0,1.0,28.0,Fall,38.0,54.0,Summer,2.0,0.0
3,00115b9f,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,...,3.0,4.0,1.0,44.0,Summer,31.0,45.0,Winter,0.0,1.0
4,0016bb22,Spring,18,1,Summer,,,,,,...,,,,,,,,,,


In [4]:
train.isna().sum()

id                                           0
Basic_Demos-Enroll_Season                    0
Basic_Demos-Age                              0
Basic_Demos-Sex                              0
CGAS-Season                               1405
                                          ... 
SDS-SDS_Total_Raw                         1351
SDS-SDS_Total_T                           1354
PreInt_EduHx-Season                        420
PreInt_EduHx-computerinternet_hoursday     659
sii                                       1224
Length: 82, dtype: int64

In [5]:
train = train[train['sii'].notna()].reset_index(drop=True)

In [6]:
train = train.drop(columns=['id'], axis=1)

In [7]:
train.shape

(2736, 81)

In [8]:
dummy_train = train.copy()
dummy_train.head()

Unnamed: 0,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,...,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii
0,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,,...,4.0,2.0,4.0,55.0,,,,Fall,3.0,2.0
1,Summer,9,0,,,Fall,14.03559,48.0,46.0,22.0,...,0.0,0.0,0.0,0.0,Fall,46.0,64.0,Summer,0.0,0.0
2,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,,...,2.0,1.0,1.0,28.0,Fall,38.0,54.0,Summer,2.0,0.0
3,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,,...,3.0,4.0,1.0,44.0,Summer,31.0,45.0,Winter,0.0,1.0
4,Spring,13,1,Winter,50.0,Summer,22.279952,59.5,112.2,,...,1.0,2.0,1.0,34.0,Summer,40.0,56.0,Spring,0.0,1.0


In [9]:
categorical_columns = [
    'Basic_Demos-Enroll_Season',
    'CGAS-Season',
    'Physical-Season',
    'Fitness_Endurance-Season',
    'FGC-Season',
    'BIA-Season',
    'PAQ_A-Season',
    'PAQ_C-Season',
    'PCIAT-Season',
    'SDS-Season',
    'PreInt_EduHx-Season'
]

In [10]:
dummy_train = dummy_train.drop(columns=dummy_train[categorical_columns] , axis = 1)

In [11]:
dummy_train.head()

Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,...,PCIAT-PCIAT_16,PCIAT-PCIAT_17,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday,sii
0,5,0,51.0,16.877316,46.0,50.8,,,,,...,4.0,4.0,4.0,2.0,4.0,55.0,,,3.0,2.0
1,9,0,,14.03559,48.0,46.0,22.0,75.0,70.0,122.0,...,0.0,0.0,0.0,0.0,0.0,0.0,46.0,64.0,0.0,0.0
2,10,1,71.0,16.648696,56.5,75.6,,65.0,94.0,117.0,...,0.0,2.0,2.0,1.0,1.0,28.0,38.0,54.0,2.0,0.0
3,9,0,71.0,18.292347,56.0,81.6,,60.0,97.0,117.0,...,3.0,4.0,3.0,4.0,1.0,44.0,31.0,45.0,0.0,1.0
4,13,1,50.0,22.279952,59.5,112.2,,60.0,73.0,102.0,...,1.0,3.0,1.0,2.0,1.0,34.0,40.0,56.0,0.0,1.0


In [12]:
columns_not_in_test = list(set(dummy_train.columns) - set(test.columns))
columns_not_in_test = sorted(columns_not_in_test)
columns_not_in_test.remove('sii')
columns_not_in_test

['PCIAT-PCIAT_01',
 'PCIAT-PCIAT_02',
 'PCIAT-PCIAT_03',
 'PCIAT-PCIAT_04',
 'PCIAT-PCIAT_05',
 'PCIAT-PCIAT_06',
 'PCIAT-PCIAT_07',
 'PCIAT-PCIAT_08',
 'PCIAT-PCIAT_09',
 'PCIAT-PCIAT_10',
 'PCIAT-PCIAT_11',
 'PCIAT-PCIAT_12',
 'PCIAT-PCIAT_13',
 'PCIAT-PCIAT_14',
 'PCIAT-PCIAT_15',
 'PCIAT-PCIAT_16',
 'PCIAT-PCIAT_17',
 'PCIAT-PCIAT_18',
 'PCIAT-PCIAT_19',
 'PCIAT-PCIAT_20',
 'PCIAT-PCIAT_Total']

In [13]:
dummy_train = dummy_train.drop(columns = dummy_train[columns_not_in_test] , axis = 1)

In [14]:
columns =(dummy_train.columns).to_list()

In [15]:
columns

['Basic_Demos-Age',
 'Basic_Demos-Sex',
 'CGAS-CGAS_Score',
 'Physical-BMI',
 'Physical-Height',
 'Physical-Weight',
 'Physical-Waist_Circumference',
 'Physical-Diastolic_BP',
 'Physical-HeartRate',
 'Physical-Systolic_BP',
 'Fitness_Endurance-Max_Stage',
 'Fitness_Endurance-Time_Mins',
 'Fitness_Endurance-Time_Sec',
 'FGC-FGC_CU',
 'FGC-FGC_CU_Zone',
 'FGC-FGC_GSND',
 'FGC-FGC_GSND_Zone',
 'FGC-FGC_GSD',
 'FGC-FGC_GSD_Zone',
 'FGC-FGC_PU',
 'FGC-FGC_PU_Zone',
 'FGC-FGC_SRL',
 'FGC-FGC_SRL_Zone',
 'FGC-FGC_SRR',
 'FGC-FGC_SRR_Zone',
 'FGC-FGC_TL',
 'FGC-FGC_TL_Zone',
 'BIA-BIA_Activity_Level_num',
 'BIA-BIA_BMC',
 'BIA-BIA_BMI',
 'BIA-BIA_BMR',
 'BIA-BIA_DEE',
 'BIA-BIA_ECW',
 'BIA-BIA_FFM',
 'BIA-BIA_FFMI',
 'BIA-BIA_FMI',
 'BIA-BIA_Fat',
 'BIA-BIA_Frame_num',
 'BIA-BIA_ICW',
 'BIA-BIA_LDM',
 'BIA-BIA_LST',
 'BIA-BIA_SMM',
 'BIA-BIA_TBW',
 'PAQ_A-PAQ_A_Total',
 'PAQ_C-PAQ_C_Total',
 'SDS-SDS_Total_Raw',
 'SDS-SDS_Total_T',
 'PreInt_EduHx-computerinternet_hoursday',
 'sii']

In [16]:
x = dummy_train.drop('sii' ,axis = 1)
y = pd.DataFrame(dummy_train['sii'])

In [17]:
x.columns

Index(['Basic_Demos-Age', 'Basic_Demos-Sex', 'CGAS-CGAS_Score', 'Physical-BMI',
       'Physical-Height', 'Physical-Weight', 'Physical-Waist_Circumference',
       'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP',
       'Fitness_Endurance-Max_Stage', 'Fitness_Endurance-Time_Mins',
       'Fitness_Endurance-Time_Sec', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone',
       'FGC-FGC_GSND', 'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone',
       'FGC-FGC_PU', 'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone',
       'FGC-FGC_SRR', 'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone',
       'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI',
       'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM',
       'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num',
       'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM',
       'BIA-BIA_TBW', 'PAQ_A-PAQ_A_Total', 'PAQ_C-PAQ_C_Total',
       'SDS-SDS_Total_Raw', 'SDS-SDS_Total_T',
       'Pre

In [18]:
from sklearn.model_selection import train_test_split

In [19]:
x_train , x_val ,y_train , y_val = train_test_split(x , y ,test_size = 0.2 , random_state = 42)

In [20]:
from sklearn.experimental import enable_iterative_imputer 
from sklearn.impute import IterativeImputer
imputer = IterativeImputer(max_iter = 10 , random_state = 42 )
imp_x_train = pd.DataFrame(imputer.fit_transform(x_train))
imp_x_train.columns = x_train.columns

In [21]:
imp_x_train.isna().sum()

Basic_Demos-Age                           0
Basic_Demos-Sex                           0
CGAS-CGAS_Score                           0
Physical-BMI                              0
Physical-Height                           0
Physical-Weight                           0
Physical-Waist_Circumference              0
Physical-Diastolic_BP                     0
Physical-HeartRate                        0
Physical-Systolic_BP                      0
Fitness_Endurance-Max_Stage               0
Fitness_Endurance-Time_Mins               0
Fitness_Endurance-Time_Sec                0
FGC-FGC_CU                                0
FGC-FGC_CU_Zone                           0
FGC-FGC_GSND                              0
FGC-FGC_GSND_Zone                         0
FGC-FGC_GSD                               0
FGC-FGC_GSD_Zone                          0
FGC-FGC_PU                                0
FGC-FGC_PU_Zone                           0
FGC-FGC_SRL                               0
FGC-FGC_SRL_Zone                

In [22]:
imp_x_val = pd.DataFrame(imputer.transform(x_val))
imp_x_val.columns = x_val.columns

In [23]:
imp_x_val.isna().sum()

Basic_Demos-Age                           0
Basic_Demos-Sex                           0
CGAS-CGAS_Score                           0
Physical-BMI                              0
Physical-Height                           0
Physical-Weight                           0
Physical-Waist_Circumference              0
Physical-Diastolic_BP                     0
Physical-HeartRate                        0
Physical-Systolic_BP                      0
Fitness_Endurance-Max_Stage               0
Fitness_Endurance-Time_Mins               0
Fitness_Endurance-Time_Sec                0
FGC-FGC_CU                                0
FGC-FGC_CU_Zone                           0
FGC-FGC_GSND                              0
FGC-FGC_GSND_Zone                         0
FGC-FGC_GSD                               0
FGC-FGC_GSD_Zone                          0
FGC-FGC_PU                                0
FGC-FGC_PU_Zone                           0
FGC-FGC_SRL                               0
FGC-FGC_SRL_Zone                

In [24]:
from sklearn.model_selection import StratifiedKFold
rf_params = {
    'n_estimators': 2048,
    'max_depth': 12,
    'min_samples_leaf': 4,
    'max_features': 'sqrt',
    'class_weight': 'balanced',
    'random_state': 42,
    'verbose': 0
}

In [25]:
imp_x_train.isna().sum()

Basic_Demos-Age                           0
Basic_Demos-Sex                           0
CGAS-CGAS_Score                           0
Physical-BMI                              0
Physical-Height                           0
Physical-Weight                           0
Physical-Waist_Circumference              0
Physical-Diastolic_BP                     0
Physical-HeartRate                        0
Physical-Systolic_BP                      0
Fitness_Endurance-Max_Stage               0
Fitness_Endurance-Time_Mins               0
Fitness_Endurance-Time_Sec                0
FGC-FGC_CU                                0
FGC-FGC_CU_Zone                           0
FGC-FGC_GSND                              0
FGC-FGC_GSND_Zone                         0
FGC-FGC_GSD                               0
FGC-FGC_GSD_Zone                          0
FGC-FGC_PU                                0
FGC-FGC_PU_Zone                           0
FGC-FGC_SRL                               0
FGC-FGC_SRL_Zone                

In [26]:
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import cohen_kappa_score
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
import pandas as pd

# Step 1: Split the data into training and validation sets (80-20 split)
X_train, X_val, y_train, y_val = train_test_split(x, y, test_size=0.2, stratify=y, random_state=42)

# Step 2: Perform imputation
imputer = IterativeImputer(max_iter=10, random_state=42)

# Fit and transform the training set
imp_x_train = pd.DataFrame(imputer.fit_transform(X_train), columns=X_train.columns)

# Only transform the validation set
imp_x_val = pd.DataFrame(imputer.transform(X_val), columns=X_val.columns)

# Step 3: Initialize Stratified K-Fold for cross-validation on the training data
stratify_k_fold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

train_scores, valid_scores = [], []  # For cross-validation on training set
final_val_scores = []  # For evaluation on the validation set

# Step 4: Perform 5-fold cross-validation on the imputed training data
for fold, (train_idx, valid_idx) in enumerate(stratify_k_fold.split(imp_x_train, y_train)):
    X_fold_train, y_fold_train = imp_x_train.iloc[train_idx], y_train.iloc[train_idx]
    X_fold_val, y_fold_val = imp_x_train.iloc[valid_idx], y_train.iloc[valid_idx]

    # Initialize and train the classifier for this fold
    clf = RandomForestClassifier(**rf_params)
    clf.fit(X_fold_train, y_fold_train)

    # Training score for this fold
    train_predictions = clf.predict(X_fold_train)
    train_score = cohen_kappa_score(train_predictions, y_fold_train, weights='quadratic')
    train_scores.append(train_score)

    # Validation score for this fold (on the validation fold from the training set)
    fold_valid_predictions = clf.predict(X_fold_val)
    fold_valid_score = cohen_kappa_score(fold_valid_predictions, y_fold_val, weights='quadratic')
    valid_scores.append(fold_valid_score)

    # Evaluate on the held-out validation set (imp_x_val, y_val)
    final_val_predictions = clf.predict(imp_x_val)
    final_val_score = cohen_kappa_score(final_val_predictions, y_val, weights='quadratic')
    final_val_scores.append(final_val_score)

    # Print results for the current fold
    print("="*20 + f" Fold {fold+1} " + "="*20)
    print(f"Train score: {train_score:.2f}  |  Fold Validation score: {fold_valid_score:.2f}  |  Final Validation score (imp_x_val): {final_val_score:.2f}")
    print()

# Step 5: Print average scores across all folds
print(f"Average Train Score (cross-validation): {sum(train_scores)/len(train_scores):.2f}")
print(f"Average Fold Validation Score: {sum(valid_scores)/len(valid_scores):.2f}")
print(f"Average Final Validation Score (imp_x_val): {sum(final_val_scores)/len(final_val_scores):.2f}")


  clf.fit(X_fold_train, y_fold_train)


Train score: 0.89  |  Fold Validation score: 0.32  |  Final Validation score (imp_x_val): 0.43



  clf.fit(X_fold_train, y_fold_train)


Train score: 0.92  |  Fold Validation score: 0.44  |  Final Validation score (imp_x_val): 0.39



  clf.fit(X_fold_train, y_fold_train)


Train score: 0.92  |  Fold Validation score: 0.45  |  Final Validation score (imp_x_val): 0.42



  clf.fit(X_fold_train, y_fold_train)


Train score: 0.90  |  Fold Validation score: 0.34  |  Final Validation score (imp_x_val): 0.42



  clf.fit(X_fold_train, y_fold_train)


Train score: 0.92  |  Fold Validation score: 0.39  |  Final Validation score (imp_x_val): 0.42

Average Train Score (cross-validation): 0.91
Average Fold Validation Score: 0.39
Average Final Validation Score (imp_x_val): 0.41


In [27]:
submission_df = test[['id']]
submission_df.head()

Unnamed: 0,id
0,00008ff9
1,000fd460
2,00105258
3,00115b9f
4,0016bb22


In [28]:
categorical_columns = [
    'Basic_Demos-Enroll_Season',
    'CGAS-Season',
    'Physical-Season',
    'Fitness_Endurance-Season',
    'FGC-Season',
    'BIA-Season',
    'PAQ_A-Season',
    'PAQ_C-Season',
    'SDS-Season',
    'PreInt_EduHx-Season',
    'id',
]


In [29]:
test_df = test.drop(test[categorical_columns] , axis = 1)

In [30]:
test_df

Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,...,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_A-PAQ_A_Total,PAQ_C-PAQ_C_Total,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-computerinternet_hoursday
0,5,0,51.0,16.877316,46.0,50.8,,,,,...,24.4349,8.89536,38.9177,19.5413,32.6909,,,,,3.0
1,9,0,,14.03559,48.0,46.0,22.0,75.0,70.0,122.0,...,21.0352,14.974,39.4497,15.4107,27.0552,,2.34,46.0,64.0,0.0
2,10,1,71.0,16.648696,56.5,75.6,,65.0,94.0,117.0,...,,,,,,,2.17,38.0,54.0,2.0
3,9,0,71.0,18.292347,56.0,81.6,,60.0,97.0,117.0,...,30.4041,16.779,58.9338,26.4798,45.9966,,2.451,31.0,45.0,0.0
4,18,1,,,,,,,,,...,,,,,,1.04,,,,
5,13,1,50.0,22.279952,59.5,112.2,,60.0,73.0,102.0,...,32.9141,20.902,79.6982,35.3804,63.1265,,4.11,40.0,56.0,0.0
6,10,0,,19.66076,55.0,84.6,,123.0,83.0,163.0,...,30.8936,16.0259,59.4643,26.1957,47.2211,,3.67,27.0,40.0,3.0
7,10,1,,16.861286,59.25,84.2,27.0,71.0,90.0,116.0,...,28.5367,17.476,63.8954,28.768,50.4767,,1.27,,,2.0
8,15,0,,,,,,,,,...,,,,,,,,,,2.0
9,19,1,,,,,,,,,...,,,,,,,,,,


In [31]:
columns


['Basic_Demos-Age',
 'Basic_Demos-Sex',
 'CGAS-CGAS_Score',
 'Physical-BMI',
 'Physical-Height',
 'Physical-Weight',
 'Physical-Waist_Circumference',
 'Physical-Diastolic_BP',
 'Physical-HeartRate',
 'Physical-Systolic_BP',
 'Fitness_Endurance-Max_Stage',
 'Fitness_Endurance-Time_Mins',
 'Fitness_Endurance-Time_Sec',
 'FGC-FGC_CU',
 'FGC-FGC_CU_Zone',
 'FGC-FGC_GSND',
 'FGC-FGC_GSND_Zone',
 'FGC-FGC_GSD',
 'FGC-FGC_GSD_Zone',
 'FGC-FGC_PU',
 'FGC-FGC_PU_Zone',
 'FGC-FGC_SRL',
 'FGC-FGC_SRL_Zone',
 'FGC-FGC_SRR',
 'FGC-FGC_SRR_Zone',
 'FGC-FGC_TL',
 'FGC-FGC_TL_Zone',
 'BIA-BIA_Activity_Level_num',
 'BIA-BIA_BMC',
 'BIA-BIA_BMI',
 'BIA-BIA_BMR',
 'BIA-BIA_DEE',
 'BIA-BIA_ECW',
 'BIA-BIA_FFM',
 'BIA-BIA_FFMI',
 'BIA-BIA_FMI',
 'BIA-BIA_Fat',
 'BIA-BIA_Frame_num',
 'BIA-BIA_ICW',
 'BIA-BIA_LDM',
 'BIA-BIA_LST',
 'BIA-BIA_SMM',
 'BIA-BIA_TBW',
 'PAQ_A-PAQ_A_Total',
 'PAQ_C-PAQ_C_Total',
 'SDS-SDS_Total_Raw',
 'SDS-SDS_Total_T',
 'PreInt_EduHx-computerinternet_hoursday',
 'sii']

In [32]:
test_df_imp = pd.DataFrame(imputer.transform(test_df))

In [33]:
test_df_imp

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,38,39,40,41,42,43,44,45,46,47
0,5.0,0.0,51.0,16.877316,46.0,50.8,23.345314,69.856406,88.399468,111.86046,...,24.4349,8.89536,38.9177,19.5413,32.6909,1.902735,2.381184,40.692997,57.568981,3.0
1,9.0,0.0,61.668808,14.03559,48.0,46.0,22.0,75.0,70.0,122.0,...,21.0352,14.974,39.4497,15.4107,27.0552,2.096185,2.34,46.0,64.0,0.0
2,10.0,1.0,71.0,16.648696,56.5,75.6,25.222682,65.0,94.0,117.0,...,33.396173,20.421258,67.385099,42.446352,54.542506,2.103628,2.17,38.0,54.0,2.0
3,9.0,0.0,71.0,18.292347,56.0,81.6,26.593007,60.0,97.0,117.0,...,30.4041,16.779,58.9338,26.4798,45.9966,2.133026,2.451,31.0,45.0,0.0
4,18.0,1.0,62.997221,17.129807,61.286188,91.678666,25.548856,72.219062,96.651187,120.47503,...,33.396188,20.421277,67.385127,44.279832,54.542515,1.04,15.648652,40.58233,57.327223,1.964024
5,13.0,1.0,50.0,22.279952,59.5,112.2,27.554292,60.0,73.0,102.0,...,32.9141,20.902,79.6982,35.3804,63.1265,1.994699,4.11,40.0,56.0,0.0
6,10.0,0.0,63.785065,19.66076,55.0,84.6,26.799353,123.0,83.0,163.0,...,30.8936,16.0259,59.4643,26.1957,47.2211,2.100048,3.67,27.0,40.0,3.0
7,10.0,1.0,64.917553,16.861286,59.25,84.2,27.0,71.0,90.0,116.0,...,28.5367,17.476,63.8954,28.768,50.4767,1.962503,1.27,40.748889,57.60843,2.0
8,15.0,0.0,62.797875,17.781184,59.33218,89.696131,25.976097,70.000361,78.148396,117.790884,...,33.396186,20.421279,67.385116,32.083384,54.542513,2.209229,2.748112,40.774304,57.489369,2.0
9,19.0,1.0,64.777867,16.758025,62.499232,92.167681,25.381179,71.262375,75.612991,118.131533,...,33.396188,20.421282,67.385124,44.654698,54.542516,2.23323,2.714724,40.723945,57.45918,1.985173


In [34]:
columns

['Basic_Demos-Age',
 'Basic_Demos-Sex',
 'CGAS-CGAS_Score',
 'Physical-BMI',
 'Physical-Height',
 'Physical-Weight',
 'Physical-Waist_Circumference',
 'Physical-Diastolic_BP',
 'Physical-HeartRate',
 'Physical-Systolic_BP',
 'Fitness_Endurance-Max_Stage',
 'Fitness_Endurance-Time_Mins',
 'Fitness_Endurance-Time_Sec',
 'FGC-FGC_CU',
 'FGC-FGC_CU_Zone',
 'FGC-FGC_GSND',
 'FGC-FGC_GSND_Zone',
 'FGC-FGC_GSD',
 'FGC-FGC_GSD_Zone',
 'FGC-FGC_PU',
 'FGC-FGC_PU_Zone',
 'FGC-FGC_SRL',
 'FGC-FGC_SRL_Zone',
 'FGC-FGC_SRR',
 'FGC-FGC_SRR_Zone',
 'FGC-FGC_TL',
 'FGC-FGC_TL_Zone',
 'BIA-BIA_Activity_Level_num',
 'BIA-BIA_BMC',
 'BIA-BIA_BMI',
 'BIA-BIA_BMR',
 'BIA-BIA_DEE',
 'BIA-BIA_ECW',
 'BIA-BIA_FFM',
 'BIA-BIA_FFMI',
 'BIA-BIA_FMI',
 'BIA-BIA_Fat',
 'BIA-BIA_Frame_num',
 'BIA-BIA_ICW',
 'BIA-BIA_LDM',
 'BIA-BIA_LST',
 'BIA-BIA_SMM',
 'BIA-BIA_TBW',
 'PAQ_A-PAQ_A_Total',
 'PAQ_C-PAQ_C_Total',
 'SDS-SDS_Total_Raw',
 'SDS-SDS_Total_T',
 'PreInt_EduHx-computerinternet_hoursday',
 'sii']

In [35]:
test_df_imp.columns = imp_x_train.columns

In [36]:
submission_df['predictions'] = clf.predict(test_df_imp[clf.feature_names_in_])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  submission_df['predictions'] = clf.predict(test_df_imp[clf.feature_names_in_])


In [37]:
submission_df

Unnamed: 0,id,predictions
0,00008ff9,2.0
1,000fd460,0.0
2,00105258,0.0
3,00115b9f,1.0
4,0016bb22,0.0
5,001f3379,1.0
6,0038ba98,0.0
7,0068a485,0.0
8,0069fbed,2.0
9,0083e397,2.0


In [38]:
submission_df.to_csv('submission.csv', index=False)
print(submission_df['predictions'].value_counts())

predictions
0.0    9
1.0    6
2.0    5
Name: count, dtype: int64
