In [1]:
import os
from dotenv import load_dotenv
import pandas as pd
import numpy as np

load_dotenv()


def _require_env(name):
    """Return the value of environment variable `name`, failing loudly if it is unset.

    Without this check an unset variable reaches `pd.read_csv(None)`, whose
    error does not say which path was left unconfigured.

    Raises
    ------
    EnvironmentError
        If the variable is missing or empty.
    """
    value = os.getenv(name)
    if not value:
        raise EnvironmentError(
            f"Environment variable {name!r} is not set; define it in the .env file or in the shell."
        )
    return value


# Dataset locations are read from the environment after load_dotenv() has run.
TRAIN_SET = _require_env("TRAIN_PATH")
TEST_SET = _require_env("TEST_PATH")
DESCRIPTION = _require_env("DESCRIPTION_PATH")

train = pd.read_csv(TRAIN_SET)
test = pd.read_csv(TEST_SET)
description = pd.read_csv(DESCRIPTION)

train.head()

Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,...,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii
0,00008ff9,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,...,4.0,2.0,4.0,55.0,,,,Fall,3.0,2.0
1,000fd460,Summer,9,0,,,Fall,14.03559,48.0,46.0,...,0.0,0.0,0.0,0.0,Fall,46.0,64.0,Summer,0.0,0.0
2,00105258,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,...,2.0,1.0,1.0,28.0,Fall,38.0,54.0,Summer,2.0,0.0
3,00115b9f,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,...,3.0,4.0,1.0,44.0,Summer,31.0,45.0,Winter,0.0,1.0
4,0016bb22,Spring,18,1,Summer,,,,,,...,,,,,,,,,,


In [7]:
# Preview the first rows of the test set.
test.head()

Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,...,BIA-BIA_TBW,PAQ_A-Season,PAQ_A-PAQ_A_Total,PAQ_C-Season,PAQ_C-PAQ_C_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday
0,00008ff9,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,...,32.6909,,,,,,,,Fall,3.0
1,000fd460,Summer,9,0,,,Fall,14.03559,48.0,46.0,...,27.0552,,,Fall,2.34,Fall,46.0,64.0,Summer,0.0
2,00105258,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,...,,,,Summer,2.17,Fall,38.0,54.0,Summer,2.0
3,00115b9f,Winter,9,0,Fall,71.0,Summer,18.292347,56.0,81.6,...,45.9966,,,Winter,2.451,Summer,31.0,45.0,Winter,0.0
4,0016bb22,Spring,18,1,Summer,,,,,,...,,Summer,1.04,,,,,,,


In [8]:
# Preview the first rows of the field-description table.
description.head()

Unnamed: 0,Instrument,Field,Description,Type,Values,Value Labels
0,Identifier,id,Participant's ID,str,,
1,Demographics,Basic_Demos-Enroll_Season,Season of enrollment,str,"Spring, Summer, Fall, Winter",
2,Demographics,Basic_Demos-Age,Age of participant,float,,
3,Demographics,Basic_Demos-Sex,Sex of participant,categorical int,01,"0=Male, 1=Female"
4,Children's Global Assessment Scale,CGAS-Season,Season of participation,str,"Spring, Summer, Fall, Winter",


## Obiettivo:
In questo file effettuiamo la pulizia dei dati ed eseguiamo le prime operazioni di feature engineering general purpose. Il trattamento dei dati mirato a un singolo classificatore avverrà nel file del classificatore stesso.

Ogni passaggio sarà spiegato ed il codice sarà (se necessario) commentato per aiutare la comprensione

 - Elimino tutte le righe in cui la variabile target risulta NaN. Estraggo poi la variabile target (_'sii'_)

In [9]:
# Keep only the rows whose target 'sii' is known, then pull the target out as a Series.
has_target = train['sii'].notna()
train = train.loc[has_target]

y_train = train.loc[:, 'sii']

y_train.shape

(2736,)

 - Elimino tutte le colonne presenti nel train ma non nel test. Elimino la colonna relativa all'id


In [10]:
def intersect_features(train_set, test_set):
    """Restrict `train_set` to the columns it shares with `test_set`, minus the identifier.

    Parameters
    ----------
    train_set : pandas.DataFrame
        Frame whose columns are filtered; it is not modified.
    test_set : pandas.DataFrame
        Frame whose column labels decide which columns are kept.

    Returns
    -------
    pandas.DataFrame
        A new frame holding the shared columns, without the 'id' column.
    """
    shared_columns = train_set.columns.intersection(test_set.columns)
    # errors='ignore' keeps the helper usable when 'id' is not among the shared
    # columns, instead of raising KeyError.
    return train_set[shared_columns].drop(columns=['id'], errors='ignore')

# Keep only the columns of train that also appear in test (identifier excluded).
X_train = intersect_features(train, test)

X_train.shape

(2736, 58)

In [11]:
# Keep only the description rows that document a column still present in X_train.
documents_kept_feature = description['Field'].isin(X_train.columns)
description = description.loc[documents_kept_feature]

# Then discard the rows whose field name contains the substring 'Season'.
is_season_field = description['Field'].str.contains('Season')
description = description.loc[~is_season_field]
description.shape

(48, 6)

In [12]:
# Preview the description table after the filtering above.
description.head()

Unnamed: 0,Instrument,Field,Description,Type,Values,Value Labels
2,Demographics,Basic_Demos-Age,Age of participant,float,,
3,Demographics,Basic_Demos-Sex,Sex of participant,categorical int,1.0,"0=Male, 1=Female"
5,Children's Global Assessment Scale,CGAS-CGAS_Score,Children's Global Assessment Scale Score,int,,
7,Physical Measures,Physical-BMI,Body Mass Index (kg/m^2),float,,
8,Physical Measures,Physical-Height,Height (in),float,,


 - Rimuovo le feature che presentano un'elevata percentuale di valori NaN. (Soglia arbitraria: di default elimino le colonne con più del 70% di valori NaN, cioè quelle con meno del 30% di valori validi, `threshold = 0.3`.)

In [13]:
def drop_columns(df, threshold = 0.3):
    """Drop the columns of `df` that contain too many missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    threshold : float, default 0.3
        Minimum fraction of non-NaN values a column needs in order to be kept.
        With the default, a column is dropped when more than 70% of its
        values are NaN.

    Returns
    -------
    tuple (pandas.DataFrame, list)
        The frame without the dropped columns, and the labels of the dropped columns.

    Raises
    ------
    ValueError
        If `threshold` is not a fraction between 0 and 1.
    """
    # A value such as 30 (meant as a percentage) would otherwise silently drop every column.
    if not 0 <= threshold <= 1:
        raise ValueError(f"threshold must be a fraction between 0 and 1, got {threshold!r}")

    # Minimum number of non-NaN values a column must hold to be kept.
    minimum_non_NaN = len(df) * threshold

    # A column is dropped when its NaN count exceeds the allowed maximum.
    nan_counts = df.isnull().sum()
    dropped_columns = df.columns[nan_counts > (len(df) - minimum_non_NaN)].tolist()

    new_df = df.drop(columns=dropped_columns)

    return new_df, dropped_columns

# Apply the default threshold and keep the list of removed columns for reference.
features_train, dropped_columns = drop_columns(X_train)
features_train.info()
print("=====================================================\n shape: ", features_train.shape)

<class 'pandas.core.frame.DataFrame'>
Index: 2736 entries, 0 to 3958
Data columns (total 52 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Basic_Demos-Enroll_Season               2736 non-null   object 
 1   Basic_Demos-Age                         2736 non-null   int64  
 2   Basic_Demos-Sex                         2736 non-null   int64  
 3   CGAS-Season                             2342 non-null   object 
 4   CGAS-CGAS_Score                         2342 non-null   float64
 5   Physical-Season                         2595 non-null   object 
 6   Physical-BMI                            2527 non-null   float64
 7   Physical-Height                         2530 non-null   float64
 8   Physical-Weight                         2572 non-null   float64
 9   Physical-Diastolic_BP                   2478 non-null   float64
 10  Physical-HeartRate                      2486 non-null   float64
 

 - Divido le feature del mio dataset tra categoriche e numeriche e codifico le categoriche tramite one-hot encoding (1HE)

In [14]:
def extract_numerical_cathegorical(df):
    """Split the column labels of `df` into numeric and non-numeric groups.

    Returns
    -------
    tuple (list, list)
        Labels of the numeric-dtype columns, then labels of all other columns.
    """
    numeric_part = df.select_dtypes(include="number")
    other_part = df.select_dtypes(exclude="number")
    return numeric_part.columns.to_list(), other_part.columns.to_list()

# Split the remaining columns by dtype and report the size and content of each group.
numerical_features, categorical_features = extract_numerical_cathegorical(features_train)
print("Numerical features are: ", len(numerical_features), "-> ", numerical_features)
print("=================================================\nCathegorical features are: ", len(categorical_features), "-> ", categorical_features)

Numerical features are:  43 ->  ['Basic_Demos-Age', 'Basic_Demos-Sex', 'CGAS-CGAS_Score', 'Physical-BMI', 'Physical-Height', 'Physical-Weight', 'Physical-Diastolic_BP', 'Physical-HeartRate', 'Physical-Systolic_BP', 'FGC-FGC_CU', 'FGC-FGC_CU_Zone', 'FGC-FGC_GSND', 'FGC-FGC_GSND_Zone', 'FGC-FGC_GSD', 'FGC-FGC_GSD_Zone', 'FGC-FGC_PU', 'FGC-FGC_PU_Zone', 'FGC-FGC_SRL', 'FGC-FGC_SRL_Zone', 'FGC-FGC_SRR', 'FGC-FGC_SRR_Zone', 'FGC-FGC_TL', 'FGC-FGC_TL_Zone', 'BIA-BIA_Activity_Level_num', 'BIA-BIA_BMC', 'BIA-BIA_BMI', 'BIA-BIA_BMR', 'BIA-BIA_DEE', 'BIA-BIA_ECW', 'BIA-BIA_FFM', 'BIA-BIA_FFMI', 'BIA-BIA_FMI', 'BIA-BIA_Fat', 'BIA-BIA_Frame_num', 'BIA-BIA_ICW', 'BIA-BIA_LDM', 'BIA-BIA_LST', 'BIA-BIA_SMM', 'BIA-BIA_TBW', 'PAQ_C-PAQ_C_Total', 'SDS-SDS_Total_Raw', 'SDS-SDS_Total_T', 'PreInt_EduHx-computerinternet_hoursday']
Cathegorical features are:  9 ->  ['Basic_Demos-Enroll_Season', 'CGAS-Season', 'Physical-Season', 'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season', 'PAQ_C-Season', 'SDS-Seas

In [15]:
# One-hot encode the non-numeric columns; the numeric columns pass through unchanged.
decoded_train = pd.get_dummies(features_train, columns=categorical_features)
decoded_train.info()
print("=====================================================\n shape: ", decoded_train.shape)

<class 'pandas.core.frame.DataFrame'>
Index: 2736 entries, 0 to 3958
Data columns (total 79 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Basic_Demos-Age                         2736 non-null   int64  
 1   Basic_Demos-Sex                         2736 non-null   int64  
 2   CGAS-CGAS_Score                         2342 non-null   float64
 3   Physical-BMI                            2527 non-null   float64
 4   Physical-Height                         2530 non-null   float64
 5   Physical-Weight                         2572 non-null   float64
 6   Physical-Diastolic_BP                   2478 non-null   float64
 7   Physical-HeartRate                      2486 non-null   float64
 8   Physical-Systolic_BP                    2478 non-null   float64
 9   FGC-FGC_CU                              1919 non-null   float64
 10  FGC-FGC_CU_Zone                         1884 non-null   float64
 

In [16]:
# Preview the frame after one-hot encoding.
decoded_train.head()

Unnamed: 0,Basic_Demos-Age,Basic_Demos-Sex,CGAS-CGAS_Score,Physical-BMI,Physical-Height,Physical-Weight,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,FGC-FGC_CU,...,PAQ_C-Season_Summer,PAQ_C-Season_Winter,SDS-Season_Fall,SDS-Season_Spring,SDS-Season_Summer,SDS-Season_Winter,PreInt_EduHx-Season_Fall,PreInt_EduHx-Season_Spring,PreInt_EduHx-Season_Summer,PreInt_EduHx-Season_Winter
0,5,0,51.0,16.877316,46.0,50.8,,,,0.0,...,False,False,False,False,False,False,True,False,False,False
1,9,0,,14.03559,48.0,46.0,75.0,70.0,122.0,3.0,...,False,False,True,False,False,False,False,False,True,False
2,10,1,71.0,16.648696,56.5,75.6,65.0,94.0,117.0,20.0,...,True,False,True,False,False,False,False,False,True,False
3,9,0,71.0,18.292347,56.0,81.6,60.0,97.0,117.0,18.0,...,False,True,False,False,True,False,False,False,False,True
5,13,1,50.0,22.279952,59.5,112.2,60.0,73.0,102.0,12.0,...,False,False,False,False,True,False,False,True,False,False


 - Converto le feature booleane risultanti dal 1HE in numeriche e converto tutto in _float64_

In [17]:
# A single cast handles every column: one-hot booleans become 0.0/1.0 and
# integer columns become floats. The result is a new frame, so the frame
# displayed by earlier cells is not mutated in place.
decoded_train = decoded_train.astype('float64')
decoded_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2736 entries, 0 to 3958
Data columns (total 79 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Basic_Demos-Age                         2736 non-null   float64
 1   Basic_Demos-Sex                         2736 non-null   float64
 2   CGAS-CGAS_Score                         2342 non-null   float64
 3   Physical-BMI                            2527 non-null   float64
 4   Physical-Height                         2530 non-null   float64
 5   Physical-Weight                         2572 non-null   float64
 6   Physical-Diastolic_BP                   2478 non-null   float64
 7   Physical-HeartRate                      2486 non-null   float64
 8   Physical-Systolic_BP                    2478 non-null   float64
 9   FGC-FGC_CU                              1919 non-null   float64
 10  FGC-FGC_CU_Zone                         1884 non-null   float64
 

 - Riempio i valori NaN nelle feature rimaste con la media della rispettiva colonna

In [18]:
# Impute every remaining NaN with the mean of its own column.
column_means = decoded_train.mean()
cleaned_train = decoded_train.fillna(value=column_means)
cleaned_train.info()
print("=====================================================\n shape: ", cleaned_train.shape)

<class 'pandas.core.frame.DataFrame'>
Index: 2736 entries, 0 to 3958
Data columns (total 79 columns):
 #   Column                                  Non-Null Count  Dtype  
---  ------                                  --------------  -----  
 0   Basic_Demos-Age                         2736 non-null   float64
 1   Basic_Demos-Sex                         2736 non-null   float64
 2   CGAS-CGAS_Score                         2736 non-null   float64
 3   Physical-BMI                            2736 non-null   float64
 4   Physical-Height                         2736 non-null   float64
 5   Physical-Weight                         2736 non-null   float64
 6   Physical-Diastolic_BP                   2736 non-null   float64
 7   Physical-HeartRate                      2736 non-null   float64
 8   Physical-Systolic_BP                    2736 non-null   float64
 9   FGC-FGC_CU                              2736 non-null   float64
 10  FGC-FGC_CU_Zone                         2736 non-null   float64
 

 - Salvo il risultato della pulizia in un csv unico così che sia utilizzabile e sia il riferimento per i file in cui creiamo i modelli

In [19]:
# Put the target back next to the cleaned features; concat aligns them on the row index.
cleaned_data_export = pd.concat([cleaned_train, y_train], axis=1)

output_dir = 'dataset/'
# Create the folder if it is missing so that to_csv does not fail on a fresh checkout.
os.makedirs(output_dir, exist_ok=True)
output_path = os.path.join(output_dir, 'cleaned_data_export.csv')
cleaned_data_export.to_csv(output_path, index=False)

print(f"Cleaned dataset saved to {output_path}")

Cleaned dataset saved to dataset/cleaned_data_export.csv
