In [1]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import make_column_selector, make_column_transformer, ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectPercentile, chi2
from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.preprocessing import MinMaxScaler, StandardScaler, LabelEncoder, OneHotEncoder, OrdinalEncoder 
from sklearn.model_selection import cross_val_score,StratifiedKFold, KFold, GridSearchCV, RandomizedSearchCV
from sklearn.metrics import cohen_kappa_score, accuracy_score, adjusted_mutual_info_score, mean_absolute_error, r2_score, mean_squared_error

from sklearn.linear_model import LogisticRegression, LinearRegression
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor
from sklearn.impute import KNNImputer
from sklearn.ensemble import VotingRegressor, RandomForestRegressor

import warnings

In [2]:
warnings.filterwarnings("ignore")
pd.options.display.max_columns =100
pd.options.display.max_rows =100

In [3]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [4]:
train.head(3)

Unnamed: 0,id,Basic_Demos-Enroll_Season,Basic_Demos-Age,Basic_Demos-Sex,CGAS-Season,CGAS-CGAS_Score,Physical-Season,Physical-BMI,Physical-Height,Physical-Weight,Physical-Waist_Circumference,Physical-Diastolic_BP,Physical-HeartRate,Physical-Systolic_BP,Fitness_Endurance-Season,Fitness_Endurance-Max_Stage,Fitness_Endurance-Time_Mins,Fitness_Endurance-Time_Sec,FGC-Season,FGC-FGC_CU,FGC-FGC_CU_Zone,FGC-FGC_GSND,FGC-FGC_GSND_Zone,FGC-FGC_GSD,FGC-FGC_GSD_Zone,FGC-FGC_PU,FGC-FGC_PU_Zone,FGC-FGC_SRL,FGC-FGC_SRL_Zone,FGC-FGC_SRR,FGC-FGC_SRR_Zone,FGC-FGC_TL,FGC-FGC_TL_Zone,BIA-Season,BIA-BIA_Activity_Level_num,BIA-BIA_BMC,BIA-BIA_BMI,BIA-BIA_BMR,BIA-BIA_DEE,BIA-BIA_ECW,BIA-BIA_FFM,BIA-BIA_FFMI,BIA-BIA_FMI,BIA-BIA_Fat,BIA-BIA_Frame_num,BIA-BIA_ICW,BIA-BIA_LDM,BIA-BIA_LST,BIA-BIA_SMM,BIA-BIA_TBW,PAQ_A-Season,PAQ_A-PAQ_A_Total,PAQ_C-Season,PAQ_C-PAQ_C_Total,PCIAT-Season,PCIAT-PCIAT_01,PCIAT-PCIAT_02,PCIAT-PCIAT_03,PCIAT-PCIAT_04,PCIAT-PCIAT_05,PCIAT-PCIAT_06,PCIAT-PCIAT_07,PCIAT-PCIAT_08,PCIAT-PCIAT_09,PCIAT-PCIAT_10,PCIAT-PCIAT_11,PCIAT-PCIAT_12,PCIAT-PCIAT_13,PCIAT-PCIAT_14,PCIAT-PCIAT_15,PCIAT-PCIAT_16,PCIAT-PCIAT_17,PCIAT-PCIAT_18,PCIAT-PCIAT_19,PCIAT-PCIAT_20,PCIAT-PCIAT_Total,SDS-Season,SDS-SDS_Total_Raw,SDS-SDS_Total_T,PreInt_EduHx-Season,PreInt_EduHx-computerinternet_hoursday,sii
0,00008ff9,Fall,5,0,Winter,51.0,Fall,16.877316,46.0,50.8,,,,,,,,,Fall,0.0,0.0,,,,,0.0,0.0,7.0,0.0,6.0,0.0,6.0,1.0,Fall,2.0,2.66855,16.8792,932.498,1492.0,8.25598,41.5862,13.8177,3.06143,9.21377,1.0,24.4349,8.89536,38.9177,19.5413,32.6909,,,,,Fall,5.0,4.0,4.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,4.0,4.0,4.0,4.0,4.0,4.0,2.0,4.0,55.0,,,,Fall,3.0,2.0
1,000fd460,Summer,9,0,,,Fall,14.03559,48.0,46.0,22.0,75.0,70.0,122.0,,,,,Fall,3.0,0.0,,,,,5.0,0.0,11.0,1.0,11.0,1.0,3.0,0.0,Winter,2.0,2.57949,14.0371,936.656,1498.65,6.01993,42.0291,12.8254,1.21172,3.97085,1.0,21.0352,14.974,39.4497,15.4107,27.0552,,,Fall,2.34,Fall,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Fall,46.0,64.0,Summer,0.0,0.0
2,00105258,Summer,10,1,Fall,71.0,Fall,16.648696,56.5,75.6,,65.0,94.0,117.0,Fall,5.0,7.0,33.0,Fall,20.0,1.0,10.2,1.0,14.7,2.0,7.0,1.0,10.0,1.0,10.0,1.0,5.0,0.0,,,,,,,,,,,,,,,,,,,,Summer,2.17,Fall,5.0,2.0,2.0,1.0,2.0,1.0,1.0,2.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,0.0,2.0,2.0,1.0,1.0,28.0,Fall,38.0,54.0,Summer,2.0,0.0


In [5]:
train = train.drop(columns=['PCIAT-Season', 'PCIAT-PCIAT_01', 'PCIAT-PCIAT_02',
       'PCIAT-PCIAT_03', 'PCIAT-PCIAT_04', 'PCIAT-PCIAT_05', 'PCIAT-PCIAT_06',
       'PCIAT-PCIAT_07', 'PCIAT-PCIAT_08', 'PCIAT-PCIAT_09', 'PCIAT-PCIAT_10',
       'PCIAT-PCIAT_11', 'PCIAT-PCIAT_12', 'PCIAT-PCIAT_13', 'PCIAT-PCIAT_14',
       'PCIAT-PCIAT_15', 'PCIAT-PCIAT_16', 'PCIAT-PCIAT_17', 'PCIAT-PCIAT_18',
       'PCIAT-PCIAT_19', 'PCIAT-PCIAT_20', 'PCIAT-PCIAT_Total'], axis=1)

In [6]:
train = train.drop('id', axis=1)
test  = test.drop('id', axis=1)  

 #### <p style="background-color: #EDE7F6;color:#6600ff;display: inline-block;padding:.6rem;border-radius:.5rem">B1.Missing Values</p>

In [7]:
train['sii'] = train['sii'].fillna(train['sii'].mode()[0])  

In [8]:
# train['CGAS-Season'] = train['CGAS-Season'].fillna(train['CGAS-Season'].mode()[0])

In [9]:
# train['Basic_Demos-Age'].isnull().sum()

In [10]:
X = train.drop(columns=['sii']).copy()
y = train['sii']

In [11]:
# X = train[['Basic_Demos-Enroll_Season', 'CGAS-Season','Basic_Demos-Age']].copy()
# y = train['sii'].copy()

In [12]:
# label=LabelEncoder()
# y=label.fit_transform(y)
# y = y.astype('int16')
# y

# PIPELINE

In [13]:
# select columns by data type
num_cols = make_column_selector(dtype_include='number')
cat_cols = make_column_selector(dtype_exclude='number')

In [14]:
cat_cols_2 = X.select_dtypes(include='object').columns
cat_cols_2 =[ 'CGAS-Season', 'Physical-Season',
       'Fitness_Endurance-Season', 'FGC-Season', 'BIA-Season', 'PAQ_A-Season',
       'PAQ_C-Season', 'SDS-Season', 'PreInt_EduHx-Season']
# 'Basic_Demos-Enroll_Season' - wykloczone z listy żeby zrobić na tej kolumnie vectorize

In [15]:
vect = CountVectorizer()

In [16]:
# set up preprocessing for numeric columns
imp_median = SimpleImputer(strategy='median', add_indicator=True)
imp_knn = KNNImputer(n_neighbors=5, weights='uniform', metric='nan_euclidean')
# Euclidean distance measure that is NaN aware, e.g. will not include NaN values when calculating
# the distance between membersof the training dataset
# scaler = StandardScaler()
minmaxscaler = MinMaxScaler()

In [17]:
# set up preprocessing for categorical columns
# imp_constant = SimpleImputer(strategy='most_frequent')
# imp_constant = SimpleImputer(strategy='constant', fill_value='missing')
imp_constant = SimpleImputer(strategy='constant')
ohe = OneHotEncoder(sparse_output=False, drop='if_binary')
# ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False, drop='if_binary')
# drop='if_binary' drops the first category of binary features (zamiast dwóch kolumn będzie jedna)

In [18]:
# do all preprocessing
preprocessor = make_column_transformer(
                                        (vect, 'Basic_Demos-Enroll_Season'),
                                        (make_pipeline(imp_knn, minmaxscaler), num_cols),
                                        (make_pipeline(imp_constant, ohe), cat_cols_2)
                                      )

In [19]:
# create a pipeline
pipe = make_pipeline(preprocessor, LogisticRegression(random_state=42))

In [20]:
pipe.fit(X, y)

In [21]:
kf = KFold(5, shuffle=True, random_state=42)
cross_val_score(pipe, X, y, cv=kf, scoring='r2').mean()

-0.05956425412287878

# VotingRegressor (46)

In [423]:
model_1 = LinearRegression()
model_2 = XGBRegressor(random_state=42)

In [424]:
voting_regressor = VotingRegressor(estimators=[
    ('logistic_regression', model_1),
    ('xgb_boosting', model_2)
])

In [401]:
# keep 50% of features with the best chi-squared scores
selection = SelectPercentile(chi2, percentile=90)

In [425]:
pipe_2 = make_pipeline(preprocessor, selection, voting_regressor)

In [426]:
pipe_2.fit(X, y)

In [429]:
kf = KFold(5, shuffle=True, random_state=42)
cross_val_score(pipe_2, X, y, cv=kf, scoring='r2').mean()

-2.4582407124005555e+17

In [438]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [439]:
# Train the model
pipe_2.fit(X_train, y_train)

In [440]:
# Make predictions
y_pred = pipe_2.predict(X_test)

# Evaluating the Models

In [444]:
# Define MAPE function
def mean_absolute_percentage_error(y_true, y_pred):
    nonzero_indices = y_true != 0
    return np.mean(np.abs((y_true[nonzero_indices] - y_pred[nonzero_indices]) / y_true[nonzero_indices])) * 100

# Calculate evaluation metrics
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Print results
print('Mean Absolute Error:', mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', rmse)
print('R^2 Score:', r2_score(y_test, y_pred))
print('Mean Absolute Percentage Error:', mean_absolute_percentage_error(y_test, y_pred))

Mean Absolute Error: 0.4075567691178723
Mean Squared Error: 0.34372838254313837
Root Mean Squared Error: 0.586283534258927
R^2 Score: 0.3045606900334831
Mean Absolute Percentage Error: 45.66134282941602


In [None]:
# ColumnTransformer outputs 7 columns (38)
preprocessor.fit_transform(X).shape

In [None]:
# get the names of those 7 features (38)
preprocessor.get_feature_names_out

In [353]:
# see all parameters  (40)
preprocessor.get_params()

{'force_int_remainder_cols': True,
 'n_jobs': None,
 'remainder': 'drop',
 'sparse_threshold': 0.3,
 'transformer_weights': None,
 'transformers': [('countvectorizer',
   CountVectorizer(),
   'Basic_Demos-Enroll_Season'),
  ('pipeline-1',
   Pipeline(steps=[('simpleimputer',
                    SimpleImputer(add_indicator=True, strategy='median')),
                   ('minmaxscaler', MinMaxScaler())]),
   <sklearn.compose._column_transformer.make_column_selector at 0x28c64c6e750>),
  ('pipeline-2',
   Pipeline(steps=[('simpleimputer', SimpleImputer(strategy='constant')),
                   ('onehotencoder', OneHotEncoder(handle_unknown='ignore'))]),
   ['CGAS-Season',
    'Physical-Season',
    'Fitness_Endurance-Season',
    'FGC-Season',
    'BIA-Season',
    'PAQ_A-Season',
    'PAQ_C-Season',
    'SDS-Season',
    'PreInt_EduHx-Season'])],
 'verbose': False,
 'verbose_feature_names_out': True,
 'countvectorizer': CountVectorizer(),
 'pipeline-1': Pipeline(steps=[('simpleimputer',


In [354]:
# restore the previous behavior (40)
from sklearn import set_config
set_config(print_changed_only=False)
preprocessor

Inny zapis

In [242]:
ohe = OneHotEncoder()
vect = CountVectorizer()
imputer= SimpleImputer()
preprocessor = make_column_transformer(
                                        (ohe, ['Basic_Demos-Enroll_Season']), 
                                        (vect, 'CGAS-Season'),
                                        (imputer, ['Basic_Demos-Age'])
                                      )

In [238]:
ct = ColumnTransformer(
    [('ohe', OneHotEncoder(), ['Basic_Demos-Enroll_Season']),
     ('vectorizer', CountVectorizer(), 'CGAS-Season'),
     ('imputer', SimpleImputer(), ['Basic_Demos-Age'])])

In [239]:
clf1 = LogisticRegression(solver='liblinear', random_state=1)

In [240]:
pipe = Pipeline([('preprocessor', ct), ('classifier', clf1)])

In [241]:
pipe.fit(X, y)

# Pipeline with feature selection (34)

In [335]:
from sklearn.feature_selection import SelectPercentile, chi2

In [336]:
# keep 50% of features with the best chi-squared scores
selection = SelectPercentile(chi2, percentile=50)

In [363]:
pipe = make_pipeline(preprocessor, selection, LogisticRegression(), verbose=True)

In [364]:
# verbose=True pokazuje czas wykonania poszczególnych kroków
pipe.fit(X, y)

[Pipeline] . (step 1 of 3) Processing columntransformer, total=   0.0s
[Pipeline] .. (step 2 of 3) Processing selectpercentile, total=   0.0s
[Pipeline]  (step 3 of 3) Processing logisticregression, total=   0.1s


In [339]:
cross_val_score(pipe, X, y, scoring='accuracy').mean()

0.7194444444444444

# Display estimators as diagram (37)

In [341]:
from sklearn import set_config
set_config(display='diagram')

In [342]:
pipe

# Cross-validate the entire pipeline (not just the model)

'normalized_mutual_info_score', 'roc_auc_ovr', 'f1_samples', 'jaccard_weighted', 'roc_auc_ovo', 'recall_micro', 'neg_mean_squared_log_error', 'precision_samples', 'recall_samples', 'neg_mean_gamma_deviance', 'top_k_accuracy', 'neg_mean_poisson_deviance', 'homogeneity_score', 'precision', 'rand_score', 'roc_auc_ovo_weighted', 'recall_macro', 'recall_weighted', 'precision_micro', 'neg_root_mean_squared_log_error', 'balanced_accuracy', 'precision_weighted', 'recall', 'f1_weighted', 'jaccard_micro', 'v_measure_score', 'neg_mean_absolute_percentage_error', 'jaccard_samples', 'neg_mean_squared_error', 'fowlkes_mallows_score', 'neg_brier_score', 'roc_auc', 'r2', 'f1', 'precision_macro', 'mutual_info_score', 'neg_median_absolute_error', 'completeness_score', 'neg_mean_absolute_error', 'neg_log_loss', 'explained_variance', 'jaccard', 'max_error', 'matthews_corrcoef', 'accuracy', 'f1_micro', 'jaccard_macro', 'adjusted_rand_score', 'f1_macro', 'neg_negative_likelihood_ratio', 'neg_root_mean_squared_error', 'roc_auc_ovr_weighted', 'positive_likelihood_ratio', 'adjusted_mutual_info_score', 'average_precision', 'd2_absolute_error_score'

In [293]:
# cross-validate the pipeline
cross_val_score(pipe, X, y, cv=5, scoring='f1_micro').mean()

0.7164141414141414

# Use KFold with regression problems:

In [277]:
kf = KFold(5, shuffle=True, random_state=1)
cross_val_score(pipe, X, y, cv=kf, scoring='accuracy').mean()

0.717929292929293

# Use StratifiedKFold with classification problems:

In [278]:
skf = StratifiedKFold(5, shuffle=True, random_state=1)
cross_val_score(pipe, X, y, cv=skf, scoring='accuracy').mean()

0.7176767676767677

# Multiclass AUC with train/test split

AUC is an excellent evaluation metric for binary classification, especially if you have class imbalance.

In [299]:
from sklearn.metrics import roc_auc_score

In [300]:
kf = KFold(5, shuffle=True, random_state=1)
cross_val_score(pipe, X, y, cv=kf, scoring='roc_auc_ovo').mean()

0.7093730618811017

In [302]:
# use 'ovo' (One-vs-One) or 'ovr' (One-vs-Rest)
cross_val_score(pipe, X, y, cv=5, scoring='roc_auc_ovo').mean()

0.7151068323103933

# Find optimal tuning parameters for the entire pipeline

In [62]:
# specify parameter values to search
params = {}
# params['columntransformer__countvectorizer__min_df'] = [1, 2]
params['logisticregression__C'] = [0.1, 1, 10]
params['logisticregression__penalty'] = ['l1', 'l2']

# GridSearchCV

In [None]:
# Want your grid search to run faster? Set n_jobs=-1 to use parallel processing with all CPUs! (44)

In [63]:
# try all possible combinations of those parameter values
grid = GridSearchCV(pipe, params, cv=5, scoring='accuracy', n_jobs=-1)
grid.fit(X, y);

In [64]:
# what was the best score found during the search?
grid.best_score_

0.615909090909091

In [65]:
# which combination of parameters produced the best score?
grid.best_params_

{'logisticregression__C': 0.1, 'logisticregression__penalty': 'l2'}

In [66]:
# convert results into a DataFrame
results = pd.DataFrame(grid.cv_results_)[['params', 'mean_test_score', 'rank_test_score']]

In [67]:
# sort by test score
results.sort_values('rank_test_score')

Unnamed: 0,params,mean_test_score,rank_test_score
1,"{'logisticregression__C': 0.1, 'logisticregres...",0.615909,1
5,"{'logisticregression__C': 10, 'logisticregress...",0.611111,2
3,"{'logisticregression__C': 1, 'logisticregressi...",0.609091,3
0,"{'logisticregression__C': 0.1, 'logisticregres...",,4
2,"{'logisticregression__C': 1, 'logisticregressi...",,4
4,"{'logisticregression__C': 10, 'logisticregress...",,4


In [None]:
# fit the pipeline and make predictions
pipe.fit(X, y)
pipe.predict(test)

In [None]:
# save the pipeline to a file
import joblib
joblib.dump(pipe, 'pipe.joblib')

In [None]:
# load the pipeline from a file
same_pipe = joblib.load('pipe.joblib')

In [None]:
# use it to make the same predictions
same_pipe.predict(test)