## MAIN GOAL
The aim of this notebook is to predict the income of the respondents who did not answer this field.

In [530]:
import sys
import os
import import_ipynb
# Add the folder containing "files.ipynb" to the Python path
sys.path.append(os.path.abspath("Utils"))
# Import the files notebook
import Utils.files as files

import numpy as np
import pandas as pd

# Random seed passed as `random_state` to the shuffled KFold split further down,
# so that fold membership is reproducible between runs.
SEED = 48

In [531]:
# Load the survey table. Rows whose Income is NaN are the ones predicted later on.
df = pd.read_csv(filepath_or_buffer="./Array_saved/X_without_null.csv")

In [532]:
# Preview the first five rows as a sanity check of the load.
df.head(n=5)

Unnamed: 0,Gender,AutonomousCommunity,SchoolOwnership,KindOfPlace,Income,LivingUnit,DependentPersons,WorkConfinement,WorkConfinementsSecondAdult,ChildEarlyEducation1,...,SpaceGym,SpaceOther,SpaceNone,SpaceStreet,SpacePlots,SpaceParks,SpacePublic,SpaceSurroundingOther,SpaceSurroundingNone,ActivitiesOutside
0,2.0,PAIS VASCO,Público,2.0,29505.0,2,0,1,1.0,0,...,0,0,0,0,0,1,1,0,0,2.0
1,2.0,PAIS VASCO,Público,2.0,29505.0,2,0,1,1.0,0,...,0,1,0,0,0,1,0,0,0,3.0
2,2.0,PAIS VASCO,Público,2.0,36046.0,2,1,1,0.0,0,...,1,0,0,1,0,0,1,0,0,3.0
3,2.0,PAIS VASCO,Concertado,2.0,36046.0,2,0,1,1.0,0,...,1,0,0,0,1,1,0,0,0,3.0
4,2.0,PAIS VASCO,Público,2.0,36046.0,2,0,0,1.0,0,...,1,1,0,0,0,0,1,0,0,2.0


## Dataset Splitting

In [533]:
# Target vector: the Income values of the respondents who reported one.
y = df.loc[df["Income"].notna(), "Income"].to_numpy()

In [534]:
# Raw (not yet encoded) features of the rows that have an Income value.
X = df[df["Income"].notna()].drop(columns="Income").to_numpy()

In [535]:
# Raw (not yet encoded) features of the rows whose Income is missing.
X_to_pred = df[df["Income"].isna()].drop(columns="Income").to_numpy()

## Encoding

In [536]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import OrdinalEncoder
from sklearn.decomposition import PCA

In [537]:
# Columns treated as nominal (one-hot encoded by the ColumnTransformer).
# Each entry is a label-based, end-inclusive column slice of `df`; the first slice
# excludes the target ('Income') and 'LivingUnit', which is listed as ordinal below.
nominal_col = [
    *df.loc[:, 'Gender':'ChildrenSpecialNeeds'].columns.drop(['Income', 'LivingUnit']),
    *df.loc[:, 'WorkloadChange':'HouseworkMore'].columns,
    *df.loc[:, 'Reconciling':'InterruptChildren'].columns,
    *df.loc[:, 'Bedtime':'ChildMissTeacher'].columns,
    *df.loc[:, 'ChallengeContent':'ChallengeAny'].columns,
    *df.loc[:, 'FamiliesCollaboration':'ActivitiesOutside'].columns,
]

# Columns treated as ordinal, built the same way.
# NOTE(review): no visible cell passes `ordinal_col` to an encoder - confirm
# whether it is still needed.
ordinal_col = [
    "LivingUnit",
    *df.loc[:, 'Sleep':'Sex'].columns,
    *df.loc[:, 'InterruptChildrenFrequency':'ChildrenSchookwork'].columns,
    *df.loc[:, 'HelpOnline':'ParentsCommunicationReturn'].columns,
    *df.loc[:, 'PriorityCustody':'PriorityEmotional'].columns,
]

In [538]:
# One-hot encode the nominal columns and pass every other column through unchanged.
# sparse_threshold=0 asks ColumnTransformer for a dense result, which is what the
# DataFrame construction in the next cell expects.
transformer = ColumnTransformer(
    transformers=[("ohe", OneHotEncoder(), nominal_col)],
    remainder="passthrough",
    sparse_threshold=0,
)

In [539]:
# Rows with a known Income: these are used for training.
X_to_transform = df[df.Income.notna()].drop(columns="Income")
# Rows with a missing Income: these are the ones to predict.
X_to_pred_to_transform = df[df.Income.isna()].drop(columns="Income")

# Fit the encoder on all rows (labelled and unlabelled alike), so categories that
# occur only in the rows to predict are known to the one-hot encoder as well.
transformer.fit(df.drop(columns="Income"))

# Encode both subsets. Each result is rebuilt as a DataFrame labelled with the
# transformer's output feature names; no index is passed, so rows are renumbered from 0.
X_transformed = pd.DataFrame(
    transformer.transform(X_to_transform),
    columns=transformer.get_feature_names_out(),
)

X_to_pred_transformed = pd.DataFrame(
    transformer.transform(X_to_pred_to_transform),
    columns=transformer.get_feature_names_out(),
)


In [540]:
# Inspect the encoded training features. The `ohe__` / `remainder__` column prefixes
# come from the ColumnTransformer step name and its passthrough remainder.
X_transformed

Unnamed: 0,ohe__Gender_1.0,ohe__Gender_2.0,ohe__AutonomousCommunity_ANDALUCÍA,ohe__AutonomousCommunity_ARAGÓN,ohe__AutonomousCommunity_ASTURIAS,ohe__AutonomousCommunity_CANARIAS,ohe__AutonomousCommunity_CANTABRIA,ohe__AutonomousCommunity_CASTILLA LA MANCHA,ohe__AutonomousCommunity_CASTILLA Y LEÓN,ohe__AutonomousCommunity_CATALUÑA,...,remainder__ChildrenSchookwork,remainder__HelpOnline,remainder__SchoolCommunication,remainder__ParentsGroups,remainder__ParentsCommunicationReturn,remainder__PriorityCustody,remainder__PriorityContent,remainder__PriorityAutonomy,remainder__PrioritySocialisation,remainder__PriorityEmotional
0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,4.0,2.0,3.0,2.0,1.0,3.0,3.0,2.0,2.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,2.0,2.0,4.0,2.0,2.0,3.0,2.0,2.0,2.0
2,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,4.0,2.0,3.0,3.0,3.0,1.0,3.0,3.0,3.0
3,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,3.0,3.0,4.0,2.0,2.0,2.0,3.0,3.0,3.0
4,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,2.0,3.0,3.0,2.0,1.0,3.0,3.0,3.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2618,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,3.0,3.0,3.0,2.0,2.0,2.0,3.0,3.0,2.0
2619,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,3.0,1.0,3.0,2.0,3.0,3.0,2.0,3.0,2.0
2620,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,2.0,3.0,4.0,3.0,2.0,2.0,2.0,3.0,3.0,3.0
2621,0.0,1.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,3.0,1.0,1.0,2.0,2.0,3.0,2.0,3.0,3.0


## Training

In [541]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV

from sklearn.svm import SVR

import datetime

In [542]:
# Reduce the encoded feature space before fitting the SVR. Per scikit-learn's PCA
# documentation, a float n_components in (0, 1) keeps as many components as are
# needed to explain that fraction of the variance (0.99 here).
pca = PCA(n_components=0.99)

# The projection is fitted on every labelled row, i.e. before the cross-validation
# split used in the training loop.
# NOTE(review): the held-out folds therefore influence the PCA components; wrap PCA
# and the regressor in a Pipeline if a strictly leak-free estimate is required.
X_pca = pca.fit_transform(X_transformed)
# Project the rows with missing Income onto the same fitted components.
X_to_pred_pca = pca.transform(X_to_pred_transformed)

In [546]:
# Base estimator handed to GridSearchCV in the training loop.
rgs = SVR()

# Outer cross-validation folds as (train_idx, test_idx) pairs.
# KFold.split() returns a one-shot generator, so the folds are materialised into a
# list: otherwise re-running the training cell iterates an already exhausted
# generator, performs zero folds without any error, and leaves the result lists empty.
# Only the number of rows matters for the indices, so splitting on X_transformed is
# valid for indexing X_pca (same rows, different columns).
splits = list(KFold(n_splits=5, shuffle=True, random_state=SEED).split(X_transformed, y))

# Hyper-parameter candidates for the inner grid search.
# NOTE(review): `gamma` only applies to the 'rbf', 'poly' and 'sigmoid' kernels, so the
# 'linear' candidates are fitted once per gamma value with presumably identical results.
param_grid = {
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],  # Different kernel types
    'C': [0.1, 1, 10, 100],                         # Regularization parameter
    'gamma': ['scale', 'auto', 0.01, 0.1, 1]       # Kernel coefficient for 'rbf', 'poly', and 'sigmoid'
}

In [547]:
# Per-fold results: refitted best model, its outer-fold score, its hyper-parameters.
best_estim_list, scores_list, params_list = [], [], []

# Outer loop: one iteration per (train, test) index pair in `splits`.
for i, (train_idx, test_idx) in enumerate(splits):

    print(f'SPLIT {i+1}')

    # Slice the PCA-projected features and the targets for this fold.
    X_train, y_train = X_pca[train_idx], y[train_idx]
    X_test, y_test = X_pca[test_idx], y[test_idx]

    # Inner loop: 2-fold grid search over `param_grid`, ranked by negated MSE.
    # refit=True retrains the best candidate on the whole of X_train afterwards.
    gs_RGS = GridSearchCV(
        rgs,
        param_grid,
        scoring='neg_mean_squared_error',
        n_jobs=3,
        refit=True,
        cv=2,
    )

    # Wall-clock time of the whole search.
    START_t = datetime.datetime.now()
    gs_RGS.fit(X_train, y_train)
    END_t = datetime.datetime.now()

    # Remember the winning hyper-parameters and the model refitted with them.
    params_list.append(gs_RGS.best_params_)
    best_estim_list.append(gs_RGS.best_estimator_)

    # `best_score_` is the best mean inner cross-validation score, on the
    # `scoring` scale (negated MSE), not a score on the training data itself.
    print(f"SCORE sul train -> {gs_RGS.best_score_}\n Training time -> {END_t-START_t}\n")

    # Evaluate the refitted model on the held-out part of the outer fold using the
    # estimator's default `.score` metric (R^2 for scikit-learn regressors), which
    # is a different scale from the inner search's negated MSE.
    scores_list.append(gs_RGS.best_estimator_.score(X_test, y_test))

SPLIT 1
SCORE sul train -> -78857905.05390242
 Training time -> 0:00:20.578320

SPLIT 2
SCORE sul train -> -73219686.6085953
 Training time -> 0:00:16.191054

SPLIT 3
SCORE sul train -> -71792327.14063257
 Training time -> 0:00:16.457369

SPLIT 4
SCORE sul train -> -73690546.35268025
 Training time -> 0:00:16.191977

SPLIT 5
SCORE sul train -> -74269151.07761
 Training time -> 0:00:15.926739



### Best score on test set

In [548]:
# Highest outer-fold score among the per-fold best estimators. The value comes from
# the estimator's default `.score` - presumably R^2 for an SVR; confirm against the
# scikit-learn docs.
max(scores_list)

0.10408788990874007

In [549]:
# Position of the outer fold whose refitted model scored highest on its held-out part.
# NOTE(review): choosing the model by its outer-fold score makes that score an
# optimistic estimate of its generalisation performance.
best_index = np.argmax(scores_list)

# Hyper-parameters that the inner grid search selected on that fold.
final_param = params_list[best_index]

print("I migliori parametri (per questo Dataset) usando un SVR sono:", final_param, sep="\n")

I migliori parametri (per questo Dataset) usando un SVR sono:
{'C': 0.1, 'gamma': 1, 'kernel': 'poly'}


In [550]:
# Model refitted by GridSearchCV on the training part of the best-scoring outer fold.
# It has been trained on that fold's training rows only, not on every labelled row.
regressor = best_estim_list[best_index]
regressor

### Prediction on the rows with missing Income

In [551]:
# Predicted Income for the rows whose Income is missing (PCA-projected features).
y_predicted = regressor.predict(X_to_pred_pca)

### Prediction on train dataset

In [552]:
from sklearn.metrics import mean_squared_error

In [553]:
# Predictions for every labelled row. X_pca contains the rows `regressor` was fitted
# on as well as its held-out fold, so errors computed from `test` are not a purely
# out-of-sample estimate.
test = regressor.predict(X_pca)
test

array([28219.53840926, 22977.04849598, 25629.67051845, ...,
       21808.09982979, 19970.09953134, 20043.78354707])

In [559]:
# Mean squared error between the known incomes and the predictions over all labelled rows.
print(f"MSE: {mean_squared_error(y_true=y, y_pred=test)}")

MSE: 30460313.736756057


In [555]:
# Mean signed residual (true income minus prediction). Positive and negative errors
# cancel out, so this measures the bias of the predictions, not their typical size.
# The NumPy array is averaged directly; converting it to a list first is unnecessary.
print(f"l'errore medio è di: {np.mean(y - test)}")


np.float64(1338.4179900192566)