In [1]:
# Install updated versions of required packages
!pip install -U ydata-profiling
!pip install -U category_encoders
!pip install -U numba==0.58.1

# Importing python packages
import os
from os.path import join
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import math
import plotly.express as px
import scipy.stats as stats

%config InlineBackend.figure_format = 'retina'
sns.set()
%matplotlib inline

# Importing the required packages
from category_encoders import TargetEncoder
from sklearn.preprocessing import KBinsDiscretizer, MinMaxScaler, StandardScaler, OneHotEncoder
from sklearn.model_selection import train_test_split, KFold, RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.metrics import (confusion_matrix, accuracy_score, precision_score, recall_score,
                             r2_score, mean_absolute_error, mean_squared_error, f1_score,
                             classification_report, roc_auc_score, roc_curve)
from sklearn.feature_selection import RFE, f_classif, SelectKBest
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from scipy.stats import chi2_contingency, spearmanr, pointbiserialr, randint
from ydata_profiling import ProfileReport
from datetime import datetime
from math import ceil
from itertools import combinations

# Verify installations
!pip list | grep -E "ydata-profiling|category_encoders|numba"

Collecting numba==0.58.1
  Using cached numba-0.58.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (2.7 kB)
Collecting llvmlite<0.42,>=0.41.0dev0 (from numba==0.58.1)
  Using cached llvmlite-0.41.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (4.8 kB)
Using cached numba-0.58.1-cp310-cp310-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (3.6 MB)
Using cached llvmlite-0.41.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (43.6 MB)
Installing collected packages: llvmlite, numba
  Attempting uninstall: llvmlite
    Found existing installation: llvmlite 0.43.0
    Uninstalling llvmlite-0.43.0:
      Successfully uninstalled llvmlite-0.43.0
  Attempting uninstall: numba
    Found existing installation: numba 0.60.0
    Uninstalling numba-0.60.0:
      Successfully uninstalled numba-0.60.0
[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the fo

In [2]:
# For GoogleColab versions
from google.colab import drive
drive.mount('/content/drive', force_remount=True)
path = '/content/drive/MyDrive/Machine Learning Projects/Predictive Model'


Mounted at /content/drive


In [3]:
# Save the train and test csv in dataframes accordingly
traindf = pd.read_csv(path + '/train.csv')
testdf = pd.read_csv(path + '/test.csv')

In [None]:
from google.colab import drive
drive.mount('/content/drive')

# **1. Business Understanding**



## MetaData

* citizen_id: Unique identifier of the citizen.
* Name: First name of each citizen.
* Title: Title of each citizen.
* date_of_birth: Date of birth of each citizen.
* city: Name of citizen´s city.
* country: Name of citizen´s country.
* last_year_avg_monthly_charity_donations: The average of monthly charitable donations made by each citizen in the last year.
* environmental_awareness_rating: A rating [0, 10] of each individual's awareness of and engagement with environmental issues.
* financial_wellness_index: An index indicating each citizen´s overall financial health.
* investment_portfolio_value: The value, in thousands of units of currency, of each citizen´s investment portfolio.
* investments_risk_appetite: A measure of each individual's willingness to take risks in their investments.
* investments_risk_tolerance: A measure of each individual's tolerance for risk in their investment choices.
* tech_savviness_score: A score representing each citizen´s proficiency and comfort with technology.
* social_media_influence_score: A score representing each citizen´s influence and activity on social media platforms.
* entertainment_engagement_factor: A score representing each citizen´s engagement with entertainment activities.
* avg_monthly_entertainment_expenses: The monthly expenditure on entertainment for each citizen, in units of currency.
* avg_weekly_exercise_hours: The average number of hours each citizen spends on exercise weekly.
* health_consciousness_rating: A rating [0, 10] of each citizen´s awareness and proactive behavior towards their health.
* stress_management_score: A score indicating how effectively each citizen manages stress.
* overall_well_being: A score indicating each citizen's overall status.
* lifestyle_type: A categorization of the predominant lifestyle choice for each citizen (Target Variable).
















In [None]:
# Count the number of observations for each "category" (1 and 0)
traindf['lifestyle_type'].value_counts()

**PREDICTIVE GOAL:**
-explicar1
-explicar2

# **2. Data Understanding**

In [None]:
# Copy the original dfs to df_original so we can use it at any time in the script

traindf_original = traindf
testdf_original = testdf

In [None]:
# shape verification of the train dataframe, to see the number of rows and columns
traindf.shape

In [None]:
# Look at the dataset types
traindf.dtypes

In [None]:
# Look at the header columns of the initial dataframe, and the first 5 rows
traindf.head(5)

In [None]:
# Check for duplicate recordID in the dataframe so we can pass it to an index
traindf['citizen_id'].duplicated().sum()

In [None]:
# record id as an index
traindf.set_index('citizen_id', inplace=True)

In [None]:
# The same trasformation is done to the test dataframe
testdf.set_index('citizen_id', inplace=True)

## Splitting the Dataframes

Since we already have access to the Test dataframe, we will divide the traindf into distinct train and validation sets in order to check the models' accuracy. After that, we'll use the test set to generate predictions and determine results.

From this point on, every transformation will be applied to the train set, validation set, and test set. Only those transformations that presuppose the removal of particular rows are not applied to the test set; as a result, those transformations are not taken into consideration for the test set.

Considering points for Spliting:

* validation_size = 0.3

* random_state = 10
* shuffle = True
* stratify = y








In [None]:
# To make the separation of the train dataframe using train_test_split, we separate the target variable from the others and created X and Y to pass in the function.
X = traindf.drop(columns=['lifestyle_type'])
y = traindf['lifestyle_type']

We want to separate in Train, Test and Validation sets.

For that, we will first divide between train_validation and test. And then the train_validation in between train and validation.

In [None]:
# The following package was the one used to split the data. Uncomment to use it if required.
# from sklearn.model_selection import train_test_split

In [None]:
# the final variables and the function to split the information accordingly to what was described before.
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.30, stratify = y, random_state=10, shuffle = True)

Now we have three different datasets:



*  Trainining dataset: X_train & y_train - corresponding to 70% - serve to train the model and build it
*  Validation set: X_val & y_val - corresponding to 30% - to validate the model and make some checks
*  Test set: corresponding to the dataset that will be used to make the predictions - to evaluate the performance on the kagle competition





In [None]:
# Looking to the train dataframe to check the initial structure of X_Train and make a first assessment
X_train.head()

In [None]:
# Looking to the train dataframe to check the initial structure of y_Train and make a first assessment
y_train.head(1)

In [None]:
# We concat the X and y dataframes so all the addressed transformation from now on, can be excluded in the overall features and in the target features too with the same indexer.
# To not replicate all the time the transformation and to decrease risks, we concat it now
traindf = pd.concat([X_train, y_train], axis=1)

In [None]:
# We do the same for the validation set
valdf = pd.concat([X_val, y_val], axis=1)

Since it's unclear whether the rating variables should be rounded, we will include both the original and rounded versions of these variables in each dataset. This will allow us to decide later whether or not to use them in the correlation matrix. Consequently, we will have two additional variables in the dataset
- environmental_awareness_rating_rounded
-health_consciousness_rating_rounded

In [None]:
# Round variables in traindf
traindf['environmental_awareness_rating_rounded'] = traindf['environmental_awareness_rating'].round()
traindf['health_consciousness_rating_rounded'] = traindf['health_consciousness_rating'].round()

# Round variables in valdf
valdf['environmental_awareness_rating_rounded'] = valdf['environmental_awareness_rating'].round()
valdf['health_consciousness_rating_rounded'] = valdf['health_consciousness_rating'].round()

# Round variables in testdf
testdf['environmental_awareness_rating_rounded'] = testdf['environmental_awareness_rating'].round()
testdf['health_consciousness_rating_rounded'] = testdf['health_consciousness_rating'].round()

As we think it will be helpfull to have a boolean with the gender, we also think will be helpfull to a age column.
So, we have 2 new variables:
- Gender
- Age

In [None]:
# Creating the gender column

# Define the title to gender mapping
title_to_gender = {
    'Miss': 'Female',
    'Mrs.': 'Female',
    'Mr.': 'Male',
    'Ms.': 'Female'
}

# Add a new column for gender based on the title in the training DataFrame
traindf['gender'] = traindf['title'].map(title_to_gender)

# Add a new column for gender based on the title in the validation DataFrame
valdf['gender'] = valdf['title'].map(title_to_gender)

# Add a new column for gender based on the title in the test DataFrame
testdf['gender'] = testdf['title'].map(title_to_gender)

In [None]:
import pandas as pd
from datetime import datetime

# Define the reference date
reference_date = datetime(2024, 1, 1)

# Function to calculate age
def calculate_age(date_of_birth):
    return (reference_date - pd.to_datetime(date_of_birth)).days // 365

# Add the age column to the training DataFrame
traindf['age'] = traindf['date_of_birth'].apply(calculate_age)

# Add the age column to the validation DataFrame
valdf['age'] = valdf['date_of_birth'].apply(calculate_age)

# Add the age column to the test DataFrame
testdf['age'] = testdf['date_of_birth'].apply(calculate_age)

# Function to categorize age into intervals
def age_interval(age):
    if age < 10:
        return '0-10'
    elif age < 20:
        return '10-20'
    elif age < 30:
        return '20-30'
    elif age < 40:
        return '30-40'
    elif age < 50:
        return '40-50'
    elif age < 60:
        return '50-60'
    elif age < 70:
        return '60-70'
    else:
        return '70+'

# Add the Interval of ages column to the training DataFrame
traindf['Interval of ages'] = traindf['age'].apply(age_interval)

# Add the Interval of ages column to the validation DataFrame
valdf['Interval of ages'] = valdf['age'].apply(age_interval)

# Add the Interval of ages column to the test DataFrame
testdf['Interval of ages'] = testdf['age'].apply(age_interval)

# Print the first few rows of each DataFrame to verify
print(traindf.head())
print(valdf.head())

In [None]:
# info on the train dataset
traindf.info()

In [None]:
# printing the na values for the different columns in the train dataset
print(traindf.isna().sum())

In [None]:
# printing the na values for the different columns in the validation dataset
print(valdf.isna().sum())

In [None]:
# Describing All Data
# we do not make the same approach for the validation set because the base dataframe will be this one.
traindf.describe(include = 'all').T

In [None]:
# Describing Numerical Data
traindf.describe().T

In [None]:
# Describing Categorical Data
traindf.describe(include = ['O']).T

# 2.1 Variables Definition

In [None]:
# All the columns in the train dataframe
traindf.columns

In [None]:
# Defining the variables according to the datatypes so the transformations can be easy to implement, the visualizations to see and consistencies to verify

metric_features = ['last_year_avg_monthly_charity_donations','environmental_awareness_rating','financial_wellness_index','investment_portfolio_value','investments_risk_appetite','investments_risk_tolerance','tech_savviness_score','social_media_influence_score','entertainment_engagement_factor','avg_monthly_entertainment_expenses','avg_weekly_exercise_hours','health_consciousness_rating','stress_management_score','overall_well_being','environmental_awareness_rating_rounded','health_consciousness_rating_rounded','age']
categorical_features = ['name','title','date_of_birth','city','country','Interval of ages','gender']
categorical_features_enc = ['city','country','Interval of ages','gender']

In [None]:
# showing metric features only
traindf[metric_features].head(3)

In [None]:
# showing categorical features only
traindf[categorical_features].head(3)

In [None]:
# To verify if there any "empty" values
print((traindf == '').sum())

In [None]:
# To verify if there any "empty" values, but now on the validation set
print((valdf == '').sum())

In [None]:
# To verify if there any "empty" values, but now on the test set
print((testdf == '').sum())

# 2.2 Data Types

In [None]:
traindf.head(2)

In [None]:
valdf.head(2)

In [None]:
testdf.head(2)

In [None]:
traindf_dtypes = traindf.copy()
valdf_dtypes = valdf.copy()
testdf_dtypes = testdf.copy()

# 2.3 Data Visualization

In [None]:
# We create a profilingReport so we can address easily some points on the features and to get more and better insights on the data existent.
# however, the profile is runned everytime.

# Profiling for further detail analysis if required
profile = ProfileReport(
    traindf_dtypes,
    title='WWW Profile',
    correlations={
        "pearson": {"calculate": True},
        "spearman": {"calculate": False},
        "kendall": {"calculate": False},
        "phi_k": {"calculate": False},
        "cramers": {"calculate": False},
    },
)

# profile.to_notebook_iframe()

## Histograms

In [None]:
def plot_multiple_histograms(data, feats, title="Metrical' Histograms", fig_size=(6,3)):

    # Prepare figure. Create individual axes where each histogram will be placed
    fig, axes = plt.subplots(2, ceil(len(feats) / 2), figsize=(30, 10))

    # Plot data
    # Iterate across axes objects and associate each histogram (hint: use the ax.hist() instead of plt.hist()):
    for ax, feat in zip(axes.flatten(), feats): # Notice the zip() function and flatten() method
      ax.hist(data[feat], bins=20)
      ax.set_title(feat)
      ax.tick_params(axis='both', which='major', labelsize=8)

    # Layout
    # Add a centered title to the figure:
    plt.suptitle(title)

    plt.show()

    return


def plot_categorical_histograms(data, feats, title="Categorical Histograms", fig_size=(6,3)):

    # Prepare figure. Create individual axes where each histogram will be placed
    fig, axes = plt.subplots(2, ceil(len(feats) / 2), figsize=(30, 10))

    # Plot data
    # Iterate across axes objects and associate each histogram (hint: use the ax.hist() instead of plt.hist()):
    for ax, feat in zip(axes.flatten(), feats): # Notice the zip() function and flatten() method
        sns.countplot(data=data, x=feat, ax=ax)
        ax.set_title(feat)
        ax.tick_params(axis='both', which='major', labelsize=8)

    # Layout
    # Add a centered title to the figure:
    plt.suptitle(title)

    plt.show()

    return


def plot_boolean_histograms(data, feats, title="Boolean Histograms", fig_size=(6,3)):

    # Prepare figure. Create individual axes where each histogram will be placed
    fig, axes = plt.subplots(2, ceil(len(feats) / 2), figsize=(30, 10))

    # Plot data
    # Iterate across axes objects and associate each histogram (hint: use the ax.hist() instead of plt.hist()):
    for ax, feat in zip(axes.flatten(), feats): # Notice the zip() function and flatten() method
        sns.countplot(data=data, x=feat, ax=ax)
        ax.set_title(feat)
        ax.tick_params(axis='both', which='major', labelsize=8)
        ax.set_xticklabels(['False', 'True'])

    # Layout
    # Add a centered title to the figure:
    plt.suptitle(title)

    plt.show()

    return


## Define a function that plots multiple box plots

def plot_multiple_boxplots(data, feats, title="Metric Features Box Plots"):

    # Prepare figure. Create individual axes where each histogram will be placed
    fig, axes = plt.subplots(2, ceil(len(feats) /2), figsize=(30, 10))

    # Plot data
    # Iterate across axes objects and associate each histogram (hint: use the ax.hist() instead of plt.hist()):
    for ax, feat in zip(axes.flatten(), feats): # Notice the zip() function and flatten() method
      sns.boxplot(x=data[feat], ax=ax)
      ax.set_title(feat)

    # Layout
    # Add a centered title to the figure:
    plt.suptitle(title)

    plt.show()

    return

### Metric Features

In [None]:
# Setting the seaborn package
sns.set()

# Plot the different metric features to visualize them and understand their distribution
plot_multiple_histograms(traindf_dtypes, metric_features)

### Adressing the negative values

**TRAIN**

TREATING NEGATIVE VALUES - last_year_avg_monthly_charity_donations

In [None]:
# Verifying the negative values first
(traindf_dtypes['last_year_avg_monthly_charity_donations'] < 0).value_counts()

In [None]:
# Drop rows with negative values in 'last_year_avg_monthly_charity_donations' column
traindf_dtypes = traindf_dtypes[traindf_dtypes['last_year_avg_monthly_charity_donations'] >= 0]

# Verify that negative values are removed
print(traindf_dtypes[traindf_dtypes['last_year_avg_monthly_charity_donations'] < 0])

TREATING NEGATIVE VALUES - avg_weekly_exercise_hours

In [None]:
# Verifying the negative values first
(traindf_dtypes['avg_weekly_exercise_hours'] < 0).value_counts()

In [None]:
# Drop rows with negative values in 'last_year_avg_monthly_charity_donations' column
traindf_dtypes = traindf_dtypes[traindf_dtypes['avg_weekly_exercise_hours'] >= 0]

# Verify that negative values are removed
print(traindf_dtypes[traindf_dtypes['avg_weekly_exercise_hours'] < 0])

**VALIDATION**


TREATING NEGATIVE VALUES - last_year_avg_monthly_charity_donations

In [None]:
# Verifying the negative values first
(valdf_dtypes['last_year_avg_monthly_charity_donations'] < 0).value_counts()

In [None]:
# Drop rows with negative values in 'last_year_avg_monthly_charity_donations' column
valdf_dtypes = valdf_dtypes[valdf_dtypes['last_year_avg_monthly_charity_donations'] >= 0]

# Verify that negative values are removed
print(valdf_dtypes[valdf_dtypes['last_year_avg_monthly_charity_donations'] < 0])

TREATING NEGATIVE VALUES - avg_weekly_exercise_hours

In [None]:
# Verifying the negative values first
(valdf_dtypes['avg_weekly_exercise_hours'] < 0).value_counts()

In [None]:
# Drop rows with negative values in 'last_year_avg_monthly_charity_donations' column
valdf_dtypes = valdf_dtypes[valdf_dtypes['avg_weekly_exercise_hours'] >= 0]

# Verify that negative values are removed
print(valdf_dtypes[valdf_dtypes['avg_weekly_exercise_hours'] < 0])

**TEST**

In [None]:
#(testdf_dtypes['last_year_avg_monthly_charity_donations'] < 0).value_counts()

In [None]:
#testdf_dtypes = testdf_dtypes[testdf_dtypes['last_year_avg_monthly_charity_donations'] >= 0]

# Verify that negative values are removed
#print(testdf_dtypes[testdf_dtypes['last_year_avg_monthly_charity_donations'] < 0])

In [None]:
#(testdf_dtypes['avg_weekly_exercise_hours'] < 0).value_counts()

In [None]:
testdf_dtypes = testdf
#testdf_dtypes = testdf_dtypes[testdf_dtypes['avg_weekly_exercise_hours'] >= 0]

# Verify that negative values are removed
#print(testdf_dtypes[testdf_dtypes['avg_weekly_exercise_hours'] < 0])

## Boxplots

### Delaying the Outliers removal

As we see, we have a lot of outliers. Since we have this many outliers, we will first analyze the correlation and pairwise matrix's.

In [None]:
# Multiple boxplots plotted for the metric features
plot_multiple_boxplots(traindf_dtypes, metric_features)

## Pairwise Relationships

In [None]:
#Pairwise Relationship of All Numerical Variables
sns.set()

#Setting pairplot
sns.pairplot(traindf_dtypes[metric_features], diag_kind="hist")

#Layout
plt.subplots_adjust(top=0.95)
plt.suptitle("Pairwise Relationship of Variables", fontsize=20)

plt.show()

## Correlation Matrix - Before Outliers


### Metric Features

In [None]:
traindf_dtypes_corr = traindf_dtypes.copy()

In [None]:
valdf_dtypes_corr = valdf_dtypes.copy()

In [None]:
testdf_dtypes_corr = testdf_dtypes.copy()

In [None]:
# Prepare figure
fig = plt.figure(figsize=(15, 10))

# Obtain correlation matrix. Round the values to 1 decimal cases. Use the DataFrame corr() and round() method.
corr = np.round(traindf_dtypes_corr[metric_features].corr(method="pearson"), decimals=1)

# Build annotation matrix (values above |0.7| will appear annotated in the plot)
mask_annot = np.absolute(corr.values) >= 0.7
annot = np.where(mask_annot, corr.values, np.full(corr.shape,"")) # Try to understand what this np.where() does



# Plot heatmap of the correlation matrix
sns.heatmap(data=corr, annot=annot, cmap=sns.diverging_palette(240, 240, as_cmap=True),
            fmt='s', vmin=-1, vmax=1, center=0, square=True, linewidths=.5)

# Layout
fig.subplots_adjust(top=0.95)
fig.suptitle("Correlation Matrix", fontsize=20)

plt.show()

After analysing the correlation matrix, we know that we migh be droping both "stress_management_score" and "overall_well_being", therefore we will not treat the outliers of these variables.

In [None]:
# Sum the absolute correlations for each variable
sum_of_correlations = corr.abs().sum(axis=1)

# Sort the sum of correlations in ascending order
sorted_sum_of_correlations = sum_of_correlations.sort_values()

# Print the sum of correlations for each variable in ascending order
print(sorted_sum_of_correlations)

BOX PLOT

In [None]:
# Multiple boxplots plotted for the metric features
plot_multiple_boxplots(traindf_dtypes, metric_features)

#### Outlier Removal - Manual

In [None]:
#investments_risk_appetite                  1.4
#tech_savviness_score                       1.6
#investment_portfolio_value                 2.0
#avg_monthly_entertainment_expenses         2.9
#health_consciousness_rating                3.3
#environmental_awareness_rating             3.3

filters1 = (
    (traindf_dtypes['environmental_awareness_rating'] <10) &
    (traindf_dtypes['investment_portfolio_value'] <= 400) &
    (traindf_dtypes['investments_risk_tolerance'] <= 40) &
    (traindf_dtypes['tech_savviness_score'] <= 30) &
    (traindf_dtypes['social_media_influence_score'] <= 40) &
    (traindf_dtypes['entertainment_engagement_factor'] <= 4) &
    (traindf_dtypes['avg_monthly_entertainment_expenses'] <= 165) &
    (traindf_dtypes['avg_weekly_exercise_hours'] < 8) &
    (traindf_dtypes['health_consciousness_rating'] <= 11) &
    (traindf_dtypes['stress_management_score'] <= 9) &
    (traindf_dtypes['overall_well_being'] <= 450))


traindf_dtypes_outlierm = traindf_dtypes[filters1]

print('Percentage of data kept after removing outliers:', np.round(traindf_dtypes_outlierm.shape[0] / traindf_dtypes.shape[0], 4))

In [None]:
# Losing almost 22% of the dataset is too much, and still not solving the real problem. We continue with the outliers as we can see below

In [None]:
plot_multiple_boxplots(traindf_dtypes_outlierm, metric_features)

#### Outlier Removal - IQR Method

In [None]:
# Loop through each column in the dataframe

for column in traindf_dtypes[metric_features].columns:
  # Calculate Q1, Q3, and IQR for the column
  Q1 = traindf_dtypes[metric_features][column].quantile(0.25)
  Q3 = traindf_dtypes[metric_features][column].quantile(0.75)
  IQR = Q3 - Q1

  # Determine the lower and upper bounds for the column
  lower_bound = Q1 - 1.5 * IQR
  upper_bound = Q3 + 1.5 * IQR

  # Remove rows with values that fall outside the bounds
  traindf_dtypes_mf_outlieriqr = traindf_dtypes[metric_features][(traindf_dtypes[metric_features][column] >= lower_bound) & (traindf_dtypes[metric_features][column] <= upper_bound)]

# The dataframe now contains only rows with values within the bounds for all columns

print('Percentage of data kept after removing outliers:', np.round(traindf_dtypes_mf_outlieriqr.shape[0] / traindf_dtypes.shape[0], 4))

In [None]:
plot_multiple_boxplots(traindf_dtypes_mf_outlieriqr, metric_features)

We retain most of the data with this strategy, but there are still some outliers that may not be important for understanding the model's forecast. We'll take a different tack and use limit imposition to investigate the findings.

#### Outlier Limit Inputation

In [None]:
# showing the initial without transformations again
plot_multiple_boxplots(traindf_dtypes, metric_features)

In [None]:
# Copying the before dataframe for the new where the outliers will be treated
traindf_dtypes_outlier = traindf_dtypes.copy()

In [None]:
# All the values for seen as outliers in the boxplots will be stacked to the new maximum limit defined below.
environmental_awareness_rating = traindf_dtypes['environmental_awareness_rating'].copy()
environmental_awareness_rating.loc[environmental_awareness_rating>9] =9
print(environmental_awareness_rating[environmental_awareness_rating == 9].count())
traindf_dtypes_outlier['environmental_awareness_rating'] = environmental_awareness_rating

In [None]:
financial_wellness_index = traindf_dtypes['financial_wellness_index'].copy()
financial_wellness_index.loc[financial_wellness_index>400] = 400
#financial_wellness_index.loc[financial_wellness_index<20] = 20
print(financial_wellness_index[financial_wellness_index == 400].count())
#print(financial_wellness_index[financial_wellness_index == 20].count())
traindf_dtypes_outlier['financial_wellness_index'] = financial_wellness_index

In [None]:
investment_portfolio_value = traindf_dtypes['investment_portfolio_value'].copy()
investment_portfolio_value.loc[investment_portfolio_value>300] =300
print(investment_portfolio_value[investment_portfolio_value == 300].count())
traindf_dtypes_outlier['investment_portfolio_value'] = investment_portfolio_value

In [None]:
investments_risk_tolerance = traindf_dtypes['investments_risk_tolerance'].copy()
investments_risk_tolerance.loc[investments_risk_tolerance>28] = 28
print(investments_risk_tolerance[investments_risk_tolerance == 28].count())
traindf_dtypes_outlier['investments_risk_tolerance'] = investments_risk_tolerance

In [None]:
tech_savviness_score = traindf_dtypes['tech_savviness_score'].copy()
tech_savviness_score.loc[tech_savviness_score>20] =20
tech_savviness_score.loc[tech_savviness_score<8] = 8
print(tech_savviness_score[tech_savviness_score == 20].count())
print(tech_savviness_score[tech_savviness_score == 8].count())
traindf_dtypes_outlier['tech_savviness_score'] = tech_savviness_score

In [None]:
social_media_influence_score = traindf_dtypes['social_media_influence_score'].copy()
social_media_influence_score.loc[social_media_influence_score>27] =27
print(social_media_influence_score[social_media_influence_score == 27].count())
traindf_dtypes_outlier['social_media_influence_score'] = social_media_influence_score

In [None]:
avg_monthly_entertainment_expenses = traindf_dtypes['avg_monthly_entertainment_expenses'].copy()
avg_monthly_entertainment_expenses.loc[avg_monthly_entertainment_expenses>150] = 150
print(avg_monthly_entertainment_expenses[avg_monthly_entertainment_expenses == 150].count())
traindf_dtypes_outlier['avg_monthly_entertainment_expenses'] = avg_monthly_entertainment_expenses

In [None]:
avg_weekly_exercise_hours = traindf_dtypes['avg_weekly_exercise_hours'].copy()
avg_weekly_exercise_hours.loc[avg_weekly_exercise_hours>7] = 7
print(avg_weekly_exercise_hours[avg_weekly_exercise_hours == 7].count())
traindf_dtypes_outlier['avg_weekly_exercise_hours'] = avg_weekly_exercise_hours

In [None]:
health_consciousness_rating = traindf_dtypes['health_consciousness_rating'].copy()
health_consciousness_rating.loc[health_consciousness_rating>9] = 9
print(health_consciousness_rating[health_consciousness_rating == 9].count())
traindf_dtypes_outlier['health_consciousness_rating'] = health_consciousness_rating

With this last method, we treat the outliers and do not lose any % of the data set

we end up with the df: **traindf_dtypes_outlier**

The same is applied for the validation set

In [None]:
valdf_dtypes_outlier = valdf_dtypes.copy()

In [None]:
# All the values for seen as outliers in the boxplots will be stacked to the new maximum limit defined below.
environmental_awareness_rating = valdf_dtypes['environmental_awareness_rating'].copy()
environmental_awareness_rating.loc[environmental_awareness_rating>9] =9
print(environmental_awareness_rating[environmental_awareness_rating == 9].count())
valdf_dtypes_outlier['environmental_awareness_rating'] = environmental_awareness_rating

In [None]:
financial_wellness_index = valdf_dtypes['financial_wellness_index'].copy()
financial_wellness_index.loc[financial_wellness_index>400] = 400
#financial_wellness_index.loc[financial_wellness_index<20] = 20
print(financial_wellness_index[financial_wellness_index == 400].count())
#print(financial_wellness_index[financial_wellness_index == 20].count())
valdf_dtypes_outlier['financial_wellness_index'] = financial_wellness_index

In [None]:
investment_portfolio_value = valdf_dtypes['investment_portfolio_value'].copy()
investment_portfolio_value.loc[investment_portfolio_value>300] =300
print(investment_portfolio_value[investment_portfolio_value == 300].count())
valdf_dtypes_outlier['investment_portfolio_value'] = investment_portfolio_value

In [None]:
investments_risk_tolerance = valdf_dtypes['investments_risk_tolerance'].copy()
investments_risk_tolerance.loc[investments_risk_tolerance>28] = 28
print(investments_risk_tolerance[investments_risk_tolerance == 28].count())
valdf_dtypes_outlier['investments_risk_tolerance'] = investments_risk_tolerance

In [None]:
tech_savviness_score = valdf_dtypes['tech_savviness_score'].copy()
tech_savviness_score.loc[tech_savviness_score>20] =20
tech_savviness_score.loc[tech_savviness_score<8] = 8
print(tech_savviness_score[tech_savviness_score == 20].count())
print(tech_savviness_score[tech_savviness_score == 8].count())
valdf_dtypes_outlier['tech_savviness_score'] = tech_savviness_score

In [None]:
social_media_influence_score = valdf_dtypes['social_media_influence_score'].copy()
social_media_influence_score.loc[social_media_influence_score>27] =27
print(social_media_influence_score[social_media_influence_score == 27].count())
valdf_dtypes_outlier['social_media_influence_score'] = social_media_influence_score

In [None]:
avg_monthly_entertainment_expenses = valdf_dtypes['avg_monthly_entertainment_expenses'].copy()
avg_monthly_entertainment_expenses.loc[avg_monthly_entertainment_expenses>150] = 150
print(avg_monthly_entertainment_expenses[avg_monthly_entertainment_expenses == 150].count())
valdf_dtypes_outlier['avg_monthly_entertainment_expenses'] = avg_monthly_entertainment_expenses

In [None]:
avg_weekly_exercise_hours = valdf_dtypes['avg_weekly_exercise_hours'].copy()
avg_weekly_exercise_hours.loc[avg_weekly_exercise_hours>7] = 7
print(avg_weekly_exercise_hours[avg_weekly_exercise_hours == 7].count())
valdf_dtypes_outlier['avg_weekly_exercise_hours'] = avg_weekly_exercise_hours

In [None]:
health_consciousness_rating = valdf_dtypes['health_consciousness_rating'].copy()
health_consciousness_rating.loc[health_consciousness_rating>9] = 9
print(health_consciousness_rating[health_consciousness_rating == 9].count())
valdf_dtypes_outlier['health_consciousness_rating'] = health_consciousness_rating

The same is applied for the test set

In [None]:
testdf_dtypes_outlier = testdf_dtypes.copy()

In [None]:
# All the values for seen as outliers in the boxplots will be stacked to the new maximum limit defined below.
environmental_awareness_rating = testdf_dtypes['environmental_awareness_rating'].copy()
environmental_awareness_rating.loc[environmental_awareness_rating>9] =9
print(environmental_awareness_rating[environmental_awareness_rating == 9].count())
testdf_dtypes_outlier['environmental_awareness_rating'] = environmental_awareness_rating

In [None]:
financial_wellness_index = testdf_dtypes['financial_wellness_index'].copy()
financial_wellness_index.loc[financial_wellness_index>400] = 400
#financial_wellness_index.loc[financial_wellness_index<20] = 20
print(financial_wellness_index[financial_wellness_index == 400].count())
#print(financial_wellness_index[financial_wellness_index == 20].count())
testdf_dtypes_outlier['financial_wellness_index'] = financial_wellness_index

In [None]:
investment_portfolio_value = testdf_dtypes['investment_portfolio_value'].copy()
investment_portfolio_value.loc[investment_portfolio_value>300] =300
print(investment_portfolio_value[investment_portfolio_value == 300].count())
testdf_dtypes_outlier['investment_portfolio_value'] = investment_portfolio_value

In [None]:
investments_risk_tolerance = testdf_dtypes['investments_risk_tolerance'].copy()
investments_risk_tolerance.loc[investments_risk_tolerance>28] = 28
print(investments_risk_tolerance[investments_risk_tolerance == 28].count())
testdf_dtypes_outlier['investments_risk_tolerance'] = investments_risk_tolerance

In [None]:
tech_savviness_score = testdf_dtypes['tech_savviness_score'].copy()
tech_savviness_score.loc[tech_savviness_score>20] =20
tech_savviness_score.loc[tech_savviness_score<8] = 8
print(tech_savviness_score[tech_savviness_score == 20].count())
print(tech_savviness_score[tech_savviness_score == 8].count())
testdf_dtypes_outlier['tech_savviness_score'] = tech_savviness_score

In [None]:
social_media_influence_score = testdf_dtypes['social_media_influence_score'].copy()
social_media_influence_score.loc[social_media_influence_score>27] =27
print(social_media_influence_score[social_media_influence_score == 27].count())
testdf_dtypes_outlier['social_media_influence_score'] = social_media_influence_score

In [None]:
avg_monthly_entertainment_expenses = testdf_dtypes['avg_monthly_entertainment_expenses'].copy()
avg_monthly_entertainment_expenses.loc[avg_monthly_entertainment_expenses>150] = 150
print(avg_monthly_entertainment_expenses[avg_monthly_entertainment_expenses == 150].count())
testdf_dtypes_outlier['avg_monthly_entertainment_expenses'] = avg_monthly_entertainment_expenses

In [None]:
avg_weekly_exercise_hours = testdf_dtypes['avg_weekly_exercise_hours'].copy()
avg_weekly_exercise_hours.loc[avg_weekly_exercise_hours>7] = 7
print(avg_weekly_exercise_hours[avg_weekly_exercise_hours == 7].count())
testdf_dtypes_outlier['avg_weekly_exercise_hours'] = avg_weekly_exercise_hours

In [None]:
health_consciousness_rating = testdf_dtypes['health_consciousness_rating'].copy()
health_consciousness_rating.loc[health_consciousness_rating>9] = 9
print(health_consciousness_rating[health_consciousness_rating == 9].count())
testdf_dtypes_outlier['health_consciousness_rating'] = health_consciousness_rating

## Correlation Matrix

### Metric Features

In [None]:
traindf_dtypes_outlier_corr = traindf_dtypes_outlier.copy()

In [None]:
valdf_dtypes_outlier_corr = valdf_dtypes_outlier.copy()

In [None]:
testdf_dtypes_outlier_corr = testdf_dtypes_outlier.copy()

#### Correlation between features

```
# This is formatted as code
```



In [None]:
# Prepare figure
fig = plt.figure(figsize=(15, 10))

# Obtain correlation matrix. Round the values to 1 decimal cases. Use the DataFrame corr() and round() method.
corr = np.round(traindf_dtypes_outlier_corr[metric_features].corr(method="pearson"), decimals=1)

# Build annotation matrix (values above |0.7| will appear annotated in the plot)
mask_annot = np.absolute(corr.values) >= 0.7
annot = np.where(mask_annot, corr.values, np.full(corr.shape,"")) # Try to understand what this np.where() does



# Plot heatmap of the correlation matrix
sns.heatmap(data=corr, annot=annot, cmap=sns.diverging_palette(240, 240, as_cmap=True),
            fmt='s', vmin=-1, vmax=1, center=0, square=True, linewidths=.5)

# Layout
fig.subplots_adjust(top=0.95)
fig.suptitle("Correlation Matrix", fontsize=20)

plt.show()

Variable "stress_management_score" and "overall_well_being" are correlated with other variables, so we will drop it.
We will also drop "environmental_awareness_rating_rounded" and "health_consciousness_rating_rounded" as it didn't proved it would improve our results.

In [None]:
# List of columns to drop
columns_to_drop = [
    'stress_management_score',
    'overall_well_being',
    'environmental_awareness_rating_rounded',
    'health_consciousness_rating_rounded'
]

# Dropping columns from train, val, and test sets
traindf_dtypes_outlier_corr.drop(columns=columns_to_drop, axis=1, inplace=True)
valdf_dtypes_outlier_corr.drop(columns=columns_to_drop, axis=1, inplace=True)
testdf_dtypes_outlier_corr.drop(columns=columns_to_drop, axis=1, inplace=True)

In [None]:
metric_features = [feature for feature in metric_features if feature not in columns_to_drop]

#### Dependency with lifestyle_type

In [None]:
from scipy.stats import f_oneway
import pandas as pd

# Create a copy of the dataframe to avoid altering the original dataframe
df_copy = traindf_dtypes_outlier_corr.copy()

# Convert lifestyle_type to numerical codes in the copied dataframe
df_copy['lifestyle_type_code'] = df_copy['lifestyle_type'].astype('category').cat.codes

# Dictionary to hold feature and their ANOVA F-value and p-value with lifestyle_type
anova_dict = {}

# Perform one-way ANOVA for each metric feature
for feature in metric_features:
    # Create groups for each lifestyle_type_code
    groups = []
    for code in df_copy['lifestyle_type_code'].unique():
        group = df_copy[df_copy['lifestyle_type_code'] == code][feature]
        groups.append(group)

    # Check if all groups have data
    if all(len(group) > 0 for group in groups):
        # Perform ANOVA
        f_val, p_val = f_oneway(*groups)
        anova_dict[feature] = {'F-value': f_val, 'p-value': p_val}
    else:
        warnings.warn(f'One of the groups for feature {feature} is empty. Skipping ANOVA for this feature.')

# Set your p-value threshold
p_value_threshold = 0.05  # Common choice for p-value threshold

# Get features with p-value above threshold (less statistically significant features)
insignificant_features = [feature for feature, result in anova_dict.items() if result['p-value'] > p_value_threshold]

print("Features less likely to be significant with lifestyle_type:")
for feature in insignificant_features:
    print(f'{feature}: F-value: {anova_dict[feature]["F-value"]:.3f}, p-value: {anova_dict[feature]["p-value"]:.3f}')

### Boolean Features

There are no booleans in our data set. Nothing needed.

### Categorical Features

#### Dependency with lifestyle_type

In [None]:
# get categorical features
cat_features = traindf_dtypes_outlier_corr[categorical_features]

# iterate over each categorical feature and compute chi2 test
for feature in categorical_features:
    contingency_table = pd.crosstab(cat_features[feature], traindf_dtypes_outlier_corr['lifestyle_type'])
    _, p, _, _ = chi2_contingency(contingency_table)
    print(f"{feature}: P-value={p}")

Features like name, title, date_of_birth, city, and country don't seem to have any significant association with lifestyle type.
The test confirms what's already known: there's a significant association between the lifestyle_type and itself.

#### Depency between features

In [None]:
# Define the threshold for significance (for example, 0.05)
alpha = 0.05

# Define an empty list to store the results
result = []

for cat_feature1 in categorical_features:
    for cat_feature2 in categorical_features:
        if cat_feature1 != cat_feature2:
            contingency_table = pd.crosstab(traindf_dtypes_outlier_corr[cat_feature1], traindf_dtypes_outlier_corr[cat_feature2])
            chi2, p, dof, expected = stats.chi2_contingency(contingency_table)

            # Store only significant correlations in the result list
            if p < alpha:
                result.append([cat_feature1, cat_feature2, p])

# Convert the result list into a DataFrame
chi_test_output = pd.DataFrame(result, columns=['var1', 'var2', 'p-value'])

# Pivot the DataFrame to create a crosstab-like format
pivoted_output = chi_test_output.pivot(index='var1', columns='var2', values='p-value')

In [None]:
# Print the pivoted DataFrame
pivoted_output

In [None]:
for cat_feature1 in categorical_features:
    for cat_feature2 in categorical_features:
        if cat_feature1 != cat_feature2:
            contingency_table = pd.crosstab(traindf_dtypes_outlier_corr[cat_feature1], traindf_dtypes_outlier_corr[cat_feature2])
            chi2, p, dof, expected = stats.chi2_contingency(contingency_table)
            print(f"Features: {cat_feature1} and {cat_feature2}, p-value: {p}")

In [None]:
# Output dataframe

# traindf_dtypes_outlier_corr

## Inconsistency Check

In [None]:
# In this step we will address inconsistencies of the train dataframe

In [None]:
traindf.shape[0]

In [None]:
traindf_dtypes.shape[0]

In [None]:
traindf_dtypes_outlier.shape[0]

In [None]:
traindf_dtypes_outlier_corr.shape[0]

### Duplicates

In [None]:
duplicates = traindf_dtypes_outlier_corr[traindf_dtypes_outlier_corr.duplicated()]
print(duplicates)

Yet python is considering this rows duplicates, after exploring them, we are going to keep them both because we dont really agree that these two are duplicates.

### Negative Values

In [None]:
plot_multiple_boxplots(traindf_dtypes_outlier_corr, metric_features)

In [None]:
# Checking for any negative values
negative_values = traindf_dtypes_outlier_corr[metric_features].lt(0).any(1)

# Count rows with negative values
negative_count = negative_values.sum()
print(f"Number of rows with negative values: {negative_count}")

No negative values!

### Handling Missing Values

**Delete rows with missing values**: This approach is straightforward but can result in a loss of information. If the amount of missing data is small, this approach may be reasonable.

**Impute missing values**: This approach involves replacing missing values with estimated values. Common imputation methods include mean imputation, median imputation, mode imputation, regression imputation, and k-nearest neighbor imputation.

**Create a missing value indicator**: This approach involves creating a binary indicator variable that indicates whether a value is missing or not. This approach can be useful in situations where the missingness itself is informative.

**Use models that can handle missing values**: Some models, such as decision trees and random forests, can handle missing values directly. In these models, missing values are treated as a separate category and are included in the analysis.

**Use domain knowledge to estimate missing values**: In some cases, it may be possible to use domain knowledge to estimate missing values. For example, if you are analyzing data on the height and weight of a population, you may be able to use knowledge of human biology to estimate missing values.

In [None]:
# plotting the na values for the different features
traindf_dtypes_outlier_corr.isna().sum()

#### Dropping missing values

In [None]:
traindf_dtypes_outlier_corr_dropna = traindf_dtypes_outlier_corr.dropna()

In [None]:
v = traindf_dtypes_outlier_corr_dropna.shape[0]

In [None]:
vb = traindf_dtypes_outlier_corr.shape[0]

In [None]:
print(v / vb)

By dropping all the Nan, we are loosing 20% of the dataset. As we dont want to lose that much data set, we are not following this aproach.

#### Filling missing values

In [None]:
traindf_dtypes_outlier_corr.isna().sum()

In [None]:
valdf_dtypes_outlier_corr.isna().sum()

In [None]:
testdf_dtypes_outlier_corr.isna().sum()

Inputting for Categorical Features

In [None]:
traindf_dtypes_outlier_corr.head()

##### Scalling Min-Max

Knowing that using the KNN iputer or any other type of inputation we need to scale the data accordingly, we decided to address the point right here, before the use of KNN for the inputation of the na values on metric features.

In [None]:
# First we do the fit transform in the train dataset

from sklearn.preprocessing import MinMaxScaler

# Instantiate the scaler
minmax_scaler = MinMaxScaler()

# Scale the metric features separately
scaled_features = minmax_scaler.fit_transform(traindf_dtypes_outlier_corr[metric_features])

# Create a DataFrame from the scaled features
scaled_features_df = pd.DataFrame(scaled_features, columns=metric_features, index=traindf_dtypes_outlier_corr.index)

# Drop the original metric features from the dataframe
traindf_dtypes_outlier_corr = traindf_dtypes_outlier_corr.drop(columns=metric_features)

# Merge the scaled features back into the dataframe
traindf_dtypes_outlier_corr = pd.concat([traindf_dtypes_outlier_corr, scaled_features_df], axis=1)

In [None]:
# Then we apply the transform to the validation and test set so the scale is the same

from sklearn.preprocessing import MinMaxScaler

# Scale the metric features separately
scaled_features = minmax_scaler.transform(valdf_dtypes_outlier_corr[metric_features])

# Create a DataFrame from the scaled features
scaled_features_df = pd.DataFrame(scaled_features, columns=metric_features, index=valdf_dtypes_outlier_corr.index)

# Drop the original metric features from the dataframe
valdf_dtypes_outlier_corr = valdf_dtypes_outlier_corr.drop(columns=metric_features)

# Merge the scaled features back into the dataframe
valdf_dtypes_outlier_corr = pd.concat([valdf_dtypes_outlier_corr, scaled_features_df], axis=1)

In [None]:
from sklearn.preprocessing import MinMaxScaler

# Scale the metric features separately
scaled_features = minmax_scaler.transform(testdf_dtypes_outlier_corr[metric_features])

# Create a DataFrame from the scaled features
scaled_features_df = pd.DataFrame(scaled_features, columns=metric_features, index=testdf_dtypes_outlier_corr.index)

# Drop the original metric features from the dataframe
testdf_dtypes_outlier_corr = testdf_dtypes_outlier_corr.drop(columns=metric_features)

# Merge the scaled features back into the dataframe
testdf_dtypes_outlier_corr = pd.concat([testdf_dtypes_outlier_corr, scaled_features_df], axis=1)

Inputting for Metric Features

In [None]:
# Now, we will input the missing values with the KNN Inputer
# The rational before on the min-max sclaling applies here:
# 1. First we fit_transform in the train set
# 2. Than we transform into the validation and test set

In [None]:
from sklearn.impute import KNNImputer

imputer = KNNImputer(n_neighbors=5)

# Create a copy of the dataframe
df_filled = traindf_dtypes_outlier_corr.copy()

# Apply the imputer to the metric features only
df_filled[metric_features] = imputer.fit_transform(df_filled[metric_features])

# Now replace only the missing values in the original dataframe with those from the filled dataframe
for feature in metric_features:
    traindf_dtypes_outlier_corr.loc[traindf_dtypes_outlier_corr[feature].isnull(), feature] = df_filled.loc[traindf_dtypes_outlier_corr[feature].isnull(), feature]

using the KNN inputer from the train set we apply to the validation and test set

In [None]:
# Create a copy of the dataframe
dfval_filled = valdf_dtypes_outlier_corr.copy()

# Apply the imputer to the metric features only
dfval_filled[metric_features] = imputer.transform(dfval_filled[metric_features])

# Now replace only the missing values in the original dataframe with those from the filled dataframe
for feature in metric_features:
    valdf_dtypes_outlier_corr.loc[valdf_dtypes_outlier_corr[feature].isnull(), feature] = dfval_filled.loc[valdf_dtypes_outlier_corr[feature].isnull(), feature]

In [None]:
# Create a copy of the test dataframe
df_test_filled = testdf_dtypes_outlier_corr.copy()

# Apply the imputer to the metric features only
df_test_filled[metric_features] = imputer.transform(df_test_filled[metric_features])

# Now replace only the missing values in the original test dataframe with those from the filled dataframe
for feature in metric_features:
    testdf_dtypes_outlier_corr.loc[testdf_dtypes_outlier_corr[feature].isnull(), feature] = df_test_filled.loc[testdf_dtypes_outlier_corr[feature].isnull(), feature]


In [None]:
traindf_dtypes_outlier_corr.isna().sum()

In [None]:
valdf_dtypes_outlier_corr.isna().sum()

In [None]:
testdf_dtypes_outlier_corr.isna().sum()

In [None]:
traindf_dtypes_outlier_corr = traindf_dtypes_outlier_corr.dropna()

In [None]:
valdf_dtypes_outlier_corr = valdf_dtypes_outlier_corr.dropna()

In [None]:
testdf_dtypes_outlier_corr = testdf_dtypes_outlier_corr.dropna()

In [None]:
v = traindf_dtypes_outlier_corr.shape[0]
vb = traindf_dtypes_outlier.shape[0]

print(v / vb)

In [None]:
traindf_dtypes_outlier_corr_incon = traindf_dtypes_outlier_corr.copy()
valdf_dtypes_outlier_corr_incon = valdf_dtypes_outlier_corr.copy()
testdf_dtypes_outlier_corr_incon = testdf_dtypes_outlier_corr.copy()

Replacing the values with KNN helped us not losing any rows of the dataset.

# 3. Data Preprocessing

### 3.1 Encoding

For the categorical features we need to proceed to the encoding so we can have only numerical values to input in the model.

In [None]:
print(traindf_dtypes_outlier_corr_incon.columns)


In [None]:
traindf_dtypes_outlier_corr_incon_encoded = pd.get_dummies(traindf_dtypes_outlier_corr_incon, columns= categorical_features_enc, drop_first=True)

In [None]:
valdf_dtypes_outlier_corr_incon_encoded = pd.get_dummies(valdf_dtypes_outlier_corr_incon, columns=categorical_features_enc, drop_first=True)

In [None]:
testdf_dtypes_outlier_corr_incon_encoded = pd.get_dummies(testdf_dtypes_outlier_corr_incon, columns=categorical_features_enc, drop_first=True)

In [None]:
# We need to use the categorical features encoded and not those before as they were replaced.

# Get the encoded columns
encoded_columns = traindf_dtypes_outlier_corr_incon_encoded.columns.tolist()

# Get the original columns
original_columns = traindf_dtypes_outlier_corr_incon.columns.tolist()

# Get the categorical_features_encoded - substitution of the previous categorical_features
categorical_features_encoded = [col for col in encoded_columns if col not in original_columns]

print("Encoded Categorical Features:")
for feature in categorical_features_encoded:
    print(feature)

In [None]:
traindf_dtypes_outlier_corr_incon_encoded.isna().sum()

In [None]:
traindf_dtypes_outlier_corr_incon_encoded.columns

### 3.2 Validating the datasets for modelling

In [None]:
# Checking if all the names are in the corrected order and aligned

def check_column_names_match(df1, df2):
    # Get the column names of both dataframes
    df1_columns = set(df1.columns)
    df2_columns = set(df2.columns)

    # Find columns that exist in one dataframe but not in the other
    columns_only_in_df1 = df1_columns - df2_columns
    columns_only_in_df2 = df2_columns - df1_columns

    if columns_only_in_df1:
        print("Columns present in traind_ but not in testdf:", columns_only_in_df1)

    if columns_only_in_df2:
        print("Columns present in testdf but not in traind_:", columns_only_in_df2)

    if not columns_only_in_df1 and not columns_only_in_df2:
        print("Column names match!")

In [None]:
check_column_names_match(traindf_dtypes_outlier_corr_incon_encoded, testdf_dtypes_outlier_corr_incon_encoded)

In [None]:
# Drop columns starting with "Interval of age" from train and validation datasets
traindf_dtypes_outlier_corr_incon_encoded = traindf_dtypes_outlier_corr_incon_encoded.drop(columns=traindf_dtypes_outlier_corr_incon_encoded.filter(regex='^Interval of age').columns)
valdf_dtypes_outlier_corr_incon_encoded = valdf_dtypes_outlier_corr_incon_encoded.drop(columns=valdf_dtypes_outlier_corr_incon_encoded.filter(regex='^Interval of age').columns)

In [None]:
check_column_names_match(traindf_dtypes_outlier_corr_incon_encoded, valdf_dtypes_outlier_corr_incon_encoded)

In [None]:
traindf_dtypes_outlier_corr_incon_encoded.head()

In [None]:
# Make copies of the original DataFrames
traindf_dtypes_outlier_corr_incon_encoded_fs = traindf_dtypes_outlier_corr_incon_encoded.copy()
valdf_dtypes_outlier_corr_incon_encoded_fs = valdf_dtypes_outlier_corr_incon_encoded.copy()
testdf_dtypes_outlier_corr_incon_encoded_fs = testdf_dtypes_outlier_corr_incon_encoded.copy()

# Define columns to exclude
columns_to_exclude = ['name', 'title', 'date_of_birth','city','country']

# Filter out columns that actually exist in the DataFrame
columns_to_exclude_train = [col for col in columns_to_exclude if col in traindf_dtypes_outlier_corr_incon_encoded_fs.columns]
columns_to_exclude_val = [col for col in columns_to_exclude if col in valdf_dtypes_outlier_corr_incon_encoded_fs.columns]
columns_to_exclude_test = [col for col in columns_to_exclude if col in testdf_dtypes_outlier_corr_incon_encoded_fs.columns]

# Drop the specified columns
traindf_dtypes_outlier_corr_incon_encoded_fs.drop(columns=columns_to_exclude_train, inplace=True)
valdf_dtypes_outlier_corr_incon_encoded_fs.drop(columns=columns_to_exclude_val, inplace=True)
testdf_dtypes_outlier_corr_incon_encoded_fs.drop(columns=columns_to_exclude_test, inplace=True)

### 3.3 Splitting

In [None]:
# Splitting the target features from all the other features for the model inputation

X_train = traindf_dtypes_outlier_corr_incon_encoded_fs.drop(columns=['lifestyle_type'])
y_train = traindf_dtypes_outlier_corr_incon_encoded_fs['lifestyle_type']

In [None]:
X_val = valdf_dtypes_outlier_corr_incon_encoded_fs.drop(columns=['lifestyle_type'])
y_val = valdf_dtypes_outlier_corr_incon_encoded_fs['lifestyle_type']

In [None]:
testdf = testdf_dtypes_outlier_corr_incon_encoded_fs.copy()

### 3.4 Feature Selection

Because some of the features after the preprocessing might not be totally required we will use different approaches for the feature selection part.

1st we will use the RFE with Logistic Regression.

2nd we will use the RFE with the Random Forest Classifier.

3rd we will use the NerualNetwork to explore the feature selection.

---

Each final subset of columns based on the different methods will be indicated as LR, RFC and MLP for the specific models applied for the feature selection

#### RFE with Logistic Regression

In [None]:
# the following RFE formula give us the final features to keep and to introduce in the model. Those that are strictly relevant when using this specific approach.

In [None]:
# Number of features to keep
nof_list=np.arange(2,X_train.shape[1])
high_score=0
# Variable to store the optimum features
nof=0

#list of scores
score_list =[]

for n in range(len(nof_list)):
    model = LogisticRegression()

    #create instance of RFE
    rfe = RFE(model,n_features_to_select = nof_list[n])

    #fit RFE to training data
    X_train_rfe = rfe.fit_transform(X_train,y_train)

    #Apply the same RFE transformation to the test data
    X_val_rfe = rfe.transform(X_val)

    #Create and train instance of Logistic Regression
    model.fit(X_train_rfe,y_train)

    #predict
    y_pred = model.predict(X_val_rfe)

    #store f1score in variable
    f_score = f1_score(y_val, y_pred, average= "weighted")

    score_list.append(f_score)

    #compare against previous best performance
    if(f_score > high_score):
        high_score = f_score
        nof = nof_list[n]
print("Optimum number of features: %d" %nof)
print("Score with %d features: %f" % (nof, high_score))

In [None]:
'''rfe = traindf_dtypes_outlier_corr_incon_encoded_fs_age
Optimum number of features: 13
Score with 13 features: 0.638124'''

In [None]:
#fit RFE to your training data - metric features only
X_rfe = rfe.fit_transform(X = X_train, y = y_train)

# Create an object `selected_features` that will shows which features to keep and which features to throw away
selected_features_lr = pd.Series(rfe.support_, index = X_train.columns)
selected_features_lr

In [None]:
# Get the features that were selected as important (i.e., have a value of True)
true_features = selected_features_lr[selected_features_lr].index.tolist()

# Get the features that are part of the categorical features and should always be kept
#categorical_features_to_keep = [feature for feature in categorical_features_encoded if feature not in true_features]

# Combine the two lists to get the final list of features to keep
features_to_keep_lr = true_features + categorical_features_to_keep

In [None]:
features_to_keep_lr = ['last_year_avg_monthly_charity_donations','environmental_awareness_rating','financial_wellness_index','investment_portfolio_value','investments_risk_appetite','investments_risk_tolerance','tech_savviness_score','social_media_influence_score','entertainment_engagement_factor','avg_monthly_entertainment_expenses','avg_weekly_exercise_hours','health_consciousness_rating','age']

In [None]:
# defining the new dataframe name for the dataframe that will keep only the features selected by the RFE Logisti Regression
X_train_featureSelected_lr = X_train[features_to_keep_lr]

In [None]:
X_val_featureSelected_lr = X_val[features_to_keep_lr]

In [None]:
testdf_featureSelected_lr = testdf[features_to_keep_lr]

#### RFE with RandomForestClassifier

In [None]:
# Train the random forest model to get feature importances
model = RandomForestClassifier()
model.fit(X_train, y_train)

# Get feature importances and sort them
importances = model.feature_importances_
indices = np.argsort(importances)[::-1]

# Try different numbers of top features
nof_list = np.arange(2, X_train.shape[1], step=5)  # Adjust step size for fewer iterations
high_score = 0
nof = 0
score_list = []

for n in nof_list:
    selected_features = indices[:n]

    X_train_selected = X_train.iloc[:, selected_features]
    X_val_selected = X_val.iloc[:, selected_features]

    # Train and predict using the same RandomForestClassifier
    model.fit(X_train_selected, y_train)
    y_pred = model.predict(X_val_selected)

    f_score = f1_score(y_val, y_pred, average="weighted")
    score_list.append(f_score)

    if f_score > high_score:
        high_score = f_score
        nof = n

print("Optimum number of features: %d" % nof)
print("Score with %d features: %f" % (nof, high_score))

In [None]:
'''Optimum number of features: 12
Score with 12 features: 0.0.775521'''

In [None]:
rfe = RFE(estimator = RandomForestClassifier(), n_features_to_select = nof)

In [None]:
#fit RFE to your training data - metric features only
X_rfe = rfe.fit_transform(X = X_train, y = y_train)

# Create an object `selected_features` that will shows which features to keep and which features to throw away
selected_features_rfc = pd.Series(rfe.support_, index = X_train.columns)
selected_features_rfc

In [None]:
# Get the features that were selected as important (i.e., have a value of True)
true_features_rfc = selected_features_rfc[selected_features_rfc].index.tolist()

# Get the features that are part of the categorical features and should always be kept
#categorical_features_to_keep_rfc = [feature for feature in categorical_features_encoded if feature not in true_features_rfc]

# Combine the two lists to get the final list of features to keep
features_to_keep_rfc = true_features_rfc

In [None]:
features_to_keep_rfc = ['environmental_awareness_rating',
 'financial_wellness_index',
 'investment_portfolio_value',
 'investments_risk_appetite',
 'investments_risk_tolerance',
 'tech_savviness_score',
 'social_media_influence_score',
 'entertainment_engagement_factor',
 'avg_monthly_entertainment_expenses',
 'avg_weekly_exercise_hours',
 'health_consciousness_rating',
 'age']

In [None]:
X_train_featureSelected_rfc = X_train[features_to_keep_rfc]

In [None]:
X_val_featureSelected_rfc = X_val[features_to_keep_rfc]

In [None]:
testdf_featureSelected_rfc = testdf[features_to_keep_rfc]

#### Feature Selection with Neural Networks

In [None]:
# Number of features to keep
nof_list=np.arange(2,X_train.shape[1])
high_score=0
# Variable to store the optimum features
nof=0
#list of scores
score_list =[]

for n in nof_list:
    #create instance of SelectKBest
    selector = SelectKBest(score_func=f_classif, k=n)

    #fit to training data
    X_train_selected = selector.fit_transform(X_train, y_train)

    #apply the transformation to the test data
    X_val_selected = selector.transform(X_val)

    #Create and train instance of MLPClassifier
    model = MLPClassifier(max_iter=1000) # increase max_iter if needed
    model.fit(X_train_selected, y_train)

    #predict
    y_pred = model.predict(X_val_selected)

    #store f1score in variable
    f_score = f1_score(y_val, y_pred, average="weighted")

    score_list.append(f_score)

    #compare against previous best performance
    if(f_score > high_score):
        high_score = f_score
        nof = n
print("Optimum number of features: %d" %nof)
print("Score with %d features: %f" % (nof, high_score))

In [None]:
'''Optimum number of features: 12
Score with 12 features: 0.781824'''

In [None]:
#fit RFE to your training data - metric features only
X_rfe = rfe.fit_transform(X = X_train, y = y_train)

# Create an object `selected_features` that will shows which features to keep and which features to throw away
selected_features_mlp = pd.Series(rfe.support_, index = X_train.columns)
selected_features_mlp

In [None]:
selected_features_mlp:['environmental_awareness_rating',
 'financial_wellness_index',
 'investment_portfolio_value',
 'investments_risk_appetite',
 'investments_risk_tolerance',
 'tech_savviness_score',
 'social_media_influence_score',
 'entertainment_engagement_factor',
 'avg_monthly_entertainment_expenses',
 'avg_weekly_exercise_hours',
 'health_consciousness_rating',
 'age']

In [None]:
# Combine the two lists to get the final list of features to keep
features_to_keep_mlp = selected_features_mlp

In [None]:
features_to_keep_mlp = ['last_year_avg_monthly_charity_donations',
 'environmental_awareness_rating',
 'financial_wellness_index',
 'investment_portfolio_value',
 'investments_risk_appetite',
 'investments_risk_tolerance',
 'tech_savviness_score',
 'social_media_influence_score',
 'entertainment_engagement_factor',
 'avg_monthly_entertainment_expenses',
 'avg_weekly_exercise_hours',
 'health_consciousness_rating',
 'age']

In [None]:
X_train_featureSelected_mlp = X_train[features_to_keep_mlp]

In [None]:
X_val_featureSelected_mlp = X_val[features_to_keep_mlp]

In [None]:
testdf_featureSelected_mlp = testdf[features_to_keep_mlp]

#### Analzing the Features

In [None]:
# Convert your lists to sets
set_features_lr = set(features_to_keep_lr)
set_features_rfc = set(features_to_keep_rfc)
set_features_mlp = set(features_to_keep_mlp)

# Find common features among all three methods
common_features = set_features_lr.intersection(set_features_rfc, set_features_mlp)
print(f"Common features selected by all methods: {common_features}")

# Find features unique to each method
unique_features_lr = set_features_lr.difference(set_features_rfc, set_features_mlp)
unique_features_rfc = set_features_rfc.difference(set_features_lr, set_features_mlp)
unique_features_mlp = set_features_mlp.difference(set_features_lr, set_features_rfc)

print(f"Features unique to Logistic Regression: {unique_features_lr}")
print(f"Features unique to Random Forest Classifier: {unique_features_rfc}")
print(f"Features unique to MLP: {unique_features_mlp}")

# Find features that were selected by at least two methods
common_features_lr_rfc = set_features_lr.intersection(set_features_rfc)
common_features_lr_mlp = set_features_lr.intersection(set_features_mlp)
common_features_rfc_mlp = set_features_rfc.intersection(set_features_mlp)

print(f"Features selected by both Logistic Regression and Random Forest Classifier: {common_features_lr_rfc}")
print(f"Features selected by both Logistic Regression and MLP: {common_features_lr_mlp}")
print(f"Features selected by both Random Forest Classifier and MLP: {common_features_rfc_mlp}")

## 3.5 Cross-Validation

We will approach the cross validation to check which of the model this method indicate us, however we use cross validation before in each model in order to produce the best outcomes.

### Stratified K-Folds

In [None]:
from sklearn.model_selection import StratifiedKFold

In [None]:
classifiers = {
    'Logistic Regression': (
        LogisticRegression(max_iter=10000),
        [
            {'C': np.logspace(-2, 2, 5), 'penalty': ['l1'], 'solver': ['liblinear']},
            {'C': np.logspace(-2, 2, 5), 'penalty': ['l2'], 'solver': ['lbfgs']}
        ]
    ),
    'Random Forest': (
        RandomForestClassifier(),
        {
            'n_estimators': [50, 100], 'max_depth': [None, 10, 20], 'min_samples_split': [3, 5, 7]}
        ),
    'Decision Tree': (
        DecisionTreeClassifier(),
        {
            'max_depth': [None, 10, 20], 'criterion': ['gini']}
    ),
    'KNN': (
        KNeighborsClassifier(),
        {
            'n_neighbors': [3, 5], 'weights': ['uniform']}
    )
}

In [None]:
# the strategy to cross validation
cv_strategy = StratifiedKFold(n_splits=5)

In [None]:
from sklearn.metrics import accuracy_score, f1_score, make_scorer

# defining the measure functions
scoring = {
    'Accuracy': make_scorer(accuracy_score),
    'F1': make_scorer(f1_score, average='weighted')
    }

#### with Feature Selection LR

In [None]:
'''# Iterate over the classifiers, perform hyperparameter tuning using grid search, and cross-validation

for classifier_name, (model, parameters) in classifiers.items():
    gs_clf = GridSearchCV(model, parameters, cv=cv_strategy, scoring=scoring, refit='F1', return_train_score=True)
    gs_clf.fit(X_train_featureSelected_lr, y_train)
    print(f"Best parameters for {classifier_name} are {gs_clf.best_params_}")
    print(f"Best score for {classifier_name} is {gs_clf.best_score_}")
    print("-----------------------\n")'''

In [None]:
'''Best parameters for Logistic Regression are {'C': 100.0, 'penalty': 'l2', 'solver': 'lbfgs'}
Best score for Logistic Regression is 0.6389079933682763
-----------------------

Best parameters for Random Forest are {'max_depth': None, 'min_samples_split': 3, 'n_estimators': 100}
Best score for Random Forest is 0.7753435758777119
-----------------------

Best parameters for Decision Tree are {'criterion': 'gini', 'max_depth': 10}
Best score for Decision Tree is 0.7134352393085869
-----------------------

Best parameters for KNN are {'n_neighbors': 5, 'weights': 'uniform'}
Best score for KNN is 0.6771021018808263
-----------------------'''


#### with Feature Selection RFC

In [None]:
'''# Iterate over the classifiers, perform hyperparameter tuning using grid search, and cross-validation

for classifier_name, (model, parameters) in classifiers.items():
    gs_clf = GridSearchCV(model, parameters, cv=cv_strategy, scoring=scoring, refit='F1', return_train_score=True)
    gs_clf.fit(X_train_featureSelected_rfc, y_train)
    print(f"Best parameters for {classifier_name} are {gs_clf.best_params_}")
    print(f"Best score for {classifier_name} is {gs_clf.best_score_}")
    print("-----------------------\n")'''

In [None]:
'''Best parameters for Logistic Regression are {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}
Best score for Logistic Regression is 0.6363930801119528
-----------------------

Best parameters for Random Forest are {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100}
Best score for Random Forest is 0.7717204338689451
-----------------------

Best parameters for Decision Tree are {'criterion': 'gini', 'max_depth': 10}
Best score for Decision Tree is 0.7115679782401878
-----------------------

Best parameters for KNN are {'n_neighbors': 5, 'weights': 'uniform'}
Best score for KNN is 0.6750837745504914
-----------------------
'''

#### with Feature Selection MLP

In [None]:
# Iterate over the classifiers, perform hyperparameter tuning using grid search, and cross-validation

for classifier_name, (model, parameters) in classifiers.items():
    gs_clf = GridSearchCV(model, parameters, cv=cv_strategy, scoring=scoring, refit='F1', return_train_score=True)
    gs_clf.fit(X_train_featureSelected_mlp, y_train)
    print(f"Best parameters for {classifier_name} are {gs_clf.best_params_}")
    print(f"Best score for {classifier_name} is {gs_clf.best_score_}")
    print("-----------------------\n")

In [None]:
'''Best parameters for Logistic Regression are {'C': 100.0, 'penalty': 'l2', 'solver': 'lbfgs'}
Best score for Logistic Regression is 0.6389079933682763
-----------------------

Best parameters for Random Forest are {'max_depth': None, 'min_samples_split': 7, 'n_estimators': 100}
Best score for Random Forest is 0.774980665884699
-----------------------

Best parameters for Decision Tree are {'criterion': 'gini', 'max_depth': 10}
Best score for Decision Tree is 0.7135939938067647
-----------------------

Best parameters for KNN are {'n_neighbors': 5, 'weights': 'uniform'}
Best score for KNN is 0.6771021018808263
-----------------------'''

### Traditional K-Folds

In [None]:
 from sklearn.model_selection import StratifiedKFold

In [None]:
# The definition of the classifiers to be used and approach

classifiers = {
    'Logistic Regression': (
        LogisticRegression(max_iter=10000),
        [
            {'C': np.logspace(-2, 2, 5), 'penalty': ['l1'], 'solver': ['liblinear']},
            {'C': np.logspace(-2, 2, 5), 'penalty': ['l2'], 'solver': ['lbfgs']}
        ]
    ),
    'Random Forest': (
        RandomForestClassifier(),
        {
            'n_estimators': [50, 100], 'max_depth': [None, 10, 20], 'min_samples_split': [3, 5, 7]}
        ),
    'Decision Tree': (
        DecisionTreeClassifier(),
        {
            'max_depth': [None, 10, 20], 'criterion': ['gini']}
    ),
    'KNN': (
        KNeighborsClassifier(),
        {
            'n_neighbors': [3, 5], 'weights': ['uniform']}
    )
}

In [None]:
# the strategy to cross validation
cvs_strategy = KFold(n_splits=5)

In [None]:
# defining the measure functions
scoring = {
    'Accuracy': make_scorer(accuracy_score),
    'F1': make_scorer(f1_score, average='weighted')
}

#### with Feature Selection LR

In [None]:
'''# Iterate over the classifiers, perform hyperparameter tuning using grid search, and cross-validation

for classifier_name, (model, parameters) in classifiers.items():
    gs_clf = GridSearchCV(model, parameters, cv=cvs_strategy, scoring=scoring, refit='F1', return_train_score=True)
    gs_clf.fit(X_train_featureSelected_lr, y_train)
    print(f"Best parameters for {classifier_name} are {gs_clf.best_params_}")
    print(f"Best score for {classifier_name} is {gs_clf.best_score_}")
    print("-----------------------\n")'''

In [None]:
'''Best parameters for Logistic Regression are {'C': 100.0, 'penalty': 'l2', 'solver': 'lbfgs'}
Best score for Logistic Regression is 0.6389561794062362
-----------------------

Best parameters for Random Forest are {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100}
Best score for Random Forest is 0.7744894486925247
-----------------------

Best parameters for Decision Tree are {'criterion': 'gini', 'max_depth': 10}
Best score for Decision Tree is 0.7155028736669042
-----------------------

Best parameters for KNN are {'n_neighbors': 5, 'weights': 'uniform'}
Best score for KNN is 0.6772173140686943
-----------------------'''

#### with Feature Selection RFC

In [None]:
# Iterate over the classifiers, perform hyperparameter tuning using grid search, and cross-validation

for classifier_name, (model, parameters) in classifiers.items():
    gs_clf = GridSearchCV(model, parameters, cv=cvs_strategy, scoring=scoring, refit='F1', return_train_score=True)
    gs_clf.fit(X_train_featureSelected_rfc, y_train)
    print(f"Best parameters for {classifier_name} are {gs_clf.best_params_}")
    print(f"Best score for {classifier_name} is {gs_clf.best_score_}")
    print("-----------------------\n")

In [None]:
'''Best parameters for Logistic Regression are {'C': 1.0, 'penalty': 'l2', 'solver': 'lbfgs'}
Best score for Logistic Regression is 0.6364994222429894
-----------------------

Best parameters for Random Forest are {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100}
Best score for Random Forest is 0.7726886463169531
-----------------------

Best parameters for Decision Tree are {'criterion': 'gini', 'max_depth': 10}
Best score for Decision Tree is 0.7131829334884128
-----------------------

Best parameters for KNN are {'n_neighbors': 5, 'weights': 'uniform'}
Best score for KNN is 0.6751881883992498
-----------------------'''

#### with Feature Selection MLP

In [None]:
# Iterate over the classifiers, perform hyperparameter tuning using grid search, and cross-validation

for classifier_name, (model, parameters) in classifiers.items():
    gs_clf = GridSearchCV(model, parameters, cv=cvs_strategy, scoring=scoring, refit='F1', return_train_score=True)
    gs_clf.fit(X_train_featureSelected_mlp, y_train)
    print(f"Best parameters for {classifier_name} are {gs_clf.best_params_}")
    print(f"Best score for {classifier_name} is {gs_clf.best_score_}")
    print("-----------------------\n")

In [None]:
'''Best parameters for Logistic Regression are {'C': 100.0, 'penalty': 'l2', 'solver': 'lbfgs'}
Best score for Logistic Regression is 0.6389561794062362
-----------------------

Best parameters for Random Forest are {'max_depth': None, 'min_samples_split': 5, 'n_estimators': 100}
Best score for Random Forest is 0.775204630666269
-----------------------

Best parameters for Decision Tree are {'criterion': 'gini', 'max_depth': 10}
Best score for Decision Tree is 0.7156243887801113
-----------------------

Best parameters for KNN are {'n_neighbors': 5, 'weights': 'uniform'}
Best score for KNN is 0.6772173140686943
-----------------------'''

## 4. Modelling

### 4.1 KNN Classifier

#### FS with LR

In this section we apprach the different models to be used.

Here we will go through each model, and in each, we try by using a normal approach and with the different features selected to check the best outcome in each.

**---**

For each model we run a hyperparameterization, that will be applied to the best outcome model with the different feature selection approach.


**---**

The hyper parameterization will be commented so the script runs from the begining till the end.

On the moddeling we follow always the same approach for consistency

1. Pass the model to a variable
2. Fit the model into the X_train and the y_train
3. Predict the values using the X_val and saving it to a y_pred
4. Comparing this y_pred with the existing y_val to address the behavior and performance of the model:
    -    4.1 Confusion Matrix
    -    4.2 Accuracy
    -    4.3 ROC-AUC score
    -    4.4 Precision Score
    -    4.5 Recall Score
    -    4.6 ROC curve
5. Make the predicitons on the test set
6. Saving those predicition in a variable in a folder

In [None]:
modelKNN_lr = KNeighborsClassifier()

In [None]:
modelKNN_lr.fit(X_train_featureSelected_lr, y_train)

In [None]:
y_pred = modelKNN_lr.predict(X_val_featureSelected_lr)

In [None]:
# Confusion Matrix

confusion_matrix(y_val, y_pred)

In [None]:
accuracy = accuracy_score(y_val, y_pred)
print("KNN Accuracy: ", accuracy)

In [None]:
y_pred_prob = modelKNN_lr.predict_proba(X_val_featureSelected_lr)
roc_auc = roc_auc_score(y_val, y_pred_prob, multi_class='ovr')
print("KNN ROC-AUC: ", roc_auc)

In [None]:
precision = precision_score(y_val, y_pred, average='weighted')
print("KNN Precision (weighted): ", precision)

In [None]:
recall_score(y_val, y_pred, average='weighted')

In [None]:
print('Accuracy:', accuracy_score(y_val, y_pred))
print('\nConfusion Matrix:\n', confusion_matrix(y_val, y_pred))
print('\nClassification Report:\n', classification_report(y_val, y_pred))

In [None]:
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc
# Get the predicted probabilities for each class
y_pred_prob = modelKNN_lr.predict_proba(X_val_featureSelected_lr)

# Binarize the true labels
y_val_binarized = label_binarize(y_val, classes=np.unique(y_val))

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
n_classes = len(np.unique(y_val))

for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_val_binarized[:, i], y_pred_prob[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot ROC curve for each class
plt.figure(figsize=(8, 6))

for i in range(n_classes):
    plt.plot(fpr[i], tpr[i], label='ROC curve (area = %0.2f) for class %d' % (roc_auc[i], i))

plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for KNN')
plt.legend(loc="lower right")
plt.show()


In [None]:
modelKNN_lr.score(X_train_featureSelected_lr, y_train)

In [None]:
modelKNN_lr.score(X_val_featureSelected_lr, y_val)

In [None]:
print(f'F1 Score between the y_test and labels_test: {f1_score(y_val, y_pred, average = "weighted"):.3f}')

In [None]:
# FOR KAGGLE SUBMISSION

test_predictions_modelknn_fS_lr = modelKNN_lr.predict(testdf_featureSelected_lr)

In [None]:
# Step 1: Check Lengths
print("Length of testdf.index:", len(testdf.index))
print("Length of test_predictions_modelknn_fS_lr:", len(test_predictions_modelknn_fS_lr))

# Step 3: Create DataFrame
if len(testdf.index) == len(test_predictions_modelknn_fS_lr):
    modelknn_submission_df_fS_lr = pd.DataFrame({
        "citizen_id": testdf.index,  # Use the correct index or ID column
        "lifestyle_type": test_predictions_modelknn_fS_lr
    })
    print(modelknn_submission_df_fS_lr.head())
else:
    print("Error: Lengths of index and predictions do not match.")


In [None]:
# Create a DataFrame with the required columns (e.g., 'Id' and 'lifestyle_type')
modelknn_submission_df_fS_lr = pd.DataFrame({
    "citizen_id": testdf.index,  # Replace "Id" with the actual ID column of your test dataset
    "lifestyle_type": test_predictions_modelknn_fS_lr
})

In [None]:
'''def save_versioned(df, base_filename, directory):
    version = 1
    filename = f"{base_filename}_v{version}.csv"
    full_path = os.path.join(directory, filename)

    while os.path.exists(full_path):
        version += 1
        filename = f"{base_filename}_v{version}.csv"
        full_path = os.path.join(directory, filename)

    df.to_csv(full_path, index=False)
    print(f"Saved to {full_path}")

# Usage example:
save_versioned(modelknn_submission_df_fS_lr, 'modelknn_submission_df_nV_fS_lr', '/content/drive/MyDrive/Data Mining II project/Results')'''


In [None]:
#score : 0,67205

#### FS with RFC

In [None]:
modelKNN_rfc = KNeighborsClassifier()

In [None]:
modelKNN_rfc.fit(X_train_featureSelected_rfc, y_train)

In [None]:
y_pred = modelKNN_rfc.predict(X_val_featureSelected_rfc)

In [None]:
# Confusion Matrix

confusion_matrix(y_val, y_pred)

In [None]:
accuracy = accuracy_score(y_val, y_pred)
print("KNN Accuracy: ", accuracy)

In [None]:
y_pred_prob = modelKNN_rfc.predict_proba(X_val_featureSelected_rfc)
roc_auc = roc_auc_score(y_val, y_pred_prob, multi_class='ovr')
print("KNN ROC-AUC: ", roc_auc)

In [None]:
precision_score(y_val, y_pred, average = "weighted")

In [None]:
recall_score(y_val, y_pred, average = "weighted")

In [None]:
from sklearn.preprocessing import label_binarize
from sklearn.metrics import roc_curve, auc
# Get the predicted probabilities for each class
y_pred_prob = modelKNN_rfc.predict_proba(X_val_featureSelected_rfc)

# Binarize the true labels
y_val_binarized = label_binarize(y_val, classes=np.unique(y_val))

# Compute ROC curve and ROC area for each class
fpr = dict()
tpr = dict()
roc_auc = dict()
n_classes = len(np.unique(y_val))

for i in range(n_classes):
    fpr[i], tpr[i], _ = roc_curve(y_val_binarized[:, i], y_pred_prob[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])

# Plot ROC curve for each class
plt.figure(figsize=(8, 6))

for i in range(n_classes):
    plt.plot(fpr[i], tpr[i], label='ROC curve (area = %0.2f) for class %d' % (roc_auc[i], i))

plt.plot([0, 1], [0, 1], 'k--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve for KNN')
plt.legend(loc="lower right")
plt.show()

In [None]:
modelKNN_rfc.score(X_train_featureSelected_rfc, y_train)

In [None]:
modelKNN_rfc.score(X_val_featureSelected_rfc, y_val)

In [None]:
print(f'F1 Score between the y_test and labels_test: {f1_score(y_val, y_pred, average = "weighted"):.3f}')

In [None]:
# FOR KAGGLE SUBMISSION

test_predictions_modelknn_fS_rfc = modelKNN_rfc.predict(testdf_featureSelected_rfc)

In [None]:
# Create a DataFrame with the required columns (e.g., 'Id' and 'lifestyle_type')
modelknn_submission_df_fS_rfc = pd.DataFrame({
    "citizen_id": testdf.index,  # Replace "Id" with the actual ID column of your test dataset
    "lifestyle_type": test_predictions_modelknn_fS_rfc
})

In [None]:
'''def save_versioned(df, base_filename, directory):
    version = 1
    filename = f"{base_filename}_v{version}.csv"
    full_path = os.path.join(directory, filename)

    while os.path.exists(full_path):
        version += 1
        filename = f"{base_filename}_v{version}.csv"
        full_path = os.path.join(directory, filename)

    df.to_csv(full_path, index=False)
    print(f"Saved to {full_path}")

# Usage example:
save_versioned(modelknn_submission_df_fS_rfc, 'modelknn_submission_df_nV_fS_rfc', '/content/drive/MyDrive/Data Mining II project/Results')
'''

In [None]:
# score: 0,67278

#### Hyperparameterization


Knowing that the KNN model with the best performance was the one using the feature selected based logistic regression, so we are using X_train_featureSelected_lr

In [None]:
'''# Parameter distribution for RandomizedSearch
param_dist_knn = {
    'n_neighbors': randint(1, 30),
    'weights': ['uniform', 'distance']
}

random_search_knn = RandomizedSearchCV(KNeighborsClassifier(), param_dist_knn, n_iter=100, cv=5)
random_search_knn.fit(X_train_featureSelected_rfc, y_train)
print("RandomizedSearchCV - Best parameters for KNN: ", random_search_knn.best_params_)
print("RandomizedSearchCV - Best score for KNN: ", random_search_knn.best_score_)'''

#### Model w/ Hyper Parameters

In [None]:
'''modelKNN_hp = KNeighborsClassifier(n_neighbors=24, weights='uniform')'''

In [None]:
'''modelKNN_hp.fit(X_train_featureSelected_rfc, y_train)'''

In [None]:
'''y_pred = modelKNN_hp.predict(X_val_featureSelected_rfc)'''

In [None]:
'''confusion_matrix(y_val, y_pred)'''

In [None]:
'''accuracy = accuracy_score(y_val, y_pred)
print("KNN Accuracy: ", accuracy)'''

In [None]:
'''y_pred_prob = modelKNN_hp.predict_proba(X_val_featureSelected_rfc)[:, 1]
roc_auc = roc_auc_score(y_val, y_pred_prob, multi_class='ovr')
print("KNN ROC-AUC: ", roc_auc)'''

In [None]:
'''precision_score(y_val, y_pred)'''

In [None]:
'''recall_score(y_val, y_pred)'''

In [None]:
'''print('Accuracy:', accuracy_score(y_val, y_pred))
print('\nConfusion Matrix:\n', confusion_matrix(y_val, y_pred))
print('\nClassification Report:\n', classification_report(y_val, y_pred))'''

In [None]:
'''fpr, tpr, thresholds = roc_curve(y_val, y_pred_prob)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr, label='KNN')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('KNN ROC Curve')
plt.show()'''

In [None]:
'''modelKNN_hp.score(X_train_featureSelected_lr, y_train)'''

In [None]:
'''modelKNN_hp.score(X_val_featureSelected_lr, y_val)'''

In [None]:
'''print(f'F1 Score between the y_test and labels_test: {f1_score(y_val, y_pred):.3f}')'''

In [None]:
# FOR KAGGLE SUBMISSION

'''test_predictions_modelknn_hp = modelKNN_hp.predict(testdf_featureSelected_lr)'''

In [None]:
'''# Create a DataFrame with the required columns (e.g., 'Id' and 'lifestyle_type')
modelknn_submission_df_hp = pd.DataFrame({
    "citizen_id": testdf.index,  # Replace "Id" with the actual ID column of your test dataset
    "lifestyle_type": test_predictions_modelknn_hp
})'''

In [None]:
'''def save_versioned(df, base_filename, directory):
    version = 1
    filename = f"{base_filename}_v{version}.csv"
    full_path = os.path.join(directory, filename)

    while os.path.exists(full_path):
        version += 1
        filename = f"{base_filename}_v{version}.csv"
        full_path = os.path.join(directory, filename)

    df.to_csv(full_path, index=False)
    print(f"Saved to {full_path}")

# Usage example:
save_versioned(modelknn_submission_df_hp, '/content/drive/MyDrive/Data Mining II project/Results/hpp')'''

## 4.2 Random Forest Classifier

#### FS with LR

In [None]:
rfc_model_fs_lr = RandomForestClassifier()
rfc_model_fs_lr.fit(X_train_featureSelected_lr, y_train)

In [None]:
# predictions to y_pred, using the method `predict()`.

y_pred = rfc_model_fs_lr.predict(X_val_featureSelected_lr)

In [None]:
# Confusion Matrix

confusion_matrix(y_val, y_pred)

In [None]:
# accuracy score for the logistic regression applied on insurance.
# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print("Random Forest Classifier Accuracy: ", accuracy)

In [None]:
y_pred_prob = rfc_model_fs_lr.predict_proba(X_val_featureSelected_lr)
roc_auc = roc_auc_score(y_val, y_pred_prob, multi_class='ovr')
print("Random Forest Classifie ROC-AUC: ", roc_auc)

In [None]:
precision_score(y_val, y_pred, average ="weighted")

In [None]:
recall_score(y_val, y_pred, average ="weighted")

In [None]:
# Get feature importance
importances = rfc_model_fs_lr.feature_importances_

# Print them out
for feature, importance in zip(X_train_featureSelected_lr.columns, importances):
    print(f"The feature {feature} has an importance of {importance}")

In [None]:
print('Accuracy:', accuracy_score(y_val, y_pred))
print('\nConfusion Matrix:\n', confusion_matrix(y_val, y_pred))
print('\nClassification Report:\n', classification_report(y_val, y_pred))

In [None]:
#fazer plot de ROC_AUC?

In [None]:
test_predictions_rfcmodel_fs_lr = rfc_model_fs_lr.predict(testdf_featureSelected_lr)

In [None]:
# Create a DataFrame with the required columns (e.g., 'Id' and 'lifestyle_type')
rfcmodel_submission_df_fs_lr = pd.DataFrame({
    "citizen_id": testdf.index,  # Replace "Id" with the actual ID column of your test dataset
    "lifestyle_type": test_predictions_rfcmodel_fs_lr
})

In [None]:
'''def save_versioned(df, base_filename, directory):
    version = 1
    filename = f"{base_filename}_v{version}.csv"
    full_path = os.path.join(directory, filename)

    while os.path.exists(full_path):
        version += 1
        filename = f"{base_filename}_v{version}.csv"
        full_path = os.path.join(directory, filename)

    df.to_csv(full_path, index=False)
    print(f"Saved to {full_path}")

# Usage example:
save_versioned(rfcmodel_submission_df_fs_lr, 'rfcmodel_submission_df_nV_fS_lr', '/content/drive/MyDrive/Data Mining II project/Results')'''

In [None]:
#score de 0,77302

### FS with RFC

In [None]:
rfc_model_fs_rfc = RandomForestClassifier()
rfc_model_fs_rfc.fit(X_train_featureSelected_rfc, y_train)

In [None]:
# predictions to y_pred, using the method `predict()`.

y_pred = rfc_model_fs_rfc.predict(X_val_featureSelected_rfc)

In [None]:
# Confusion Matrix

confusion_matrix(y_val, y_pred)

In [None]:
# accuracy score for the logistic regression applied on insurance.
# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print("Random Forest Classifier Accuracy: ", accuracy)

In [None]:
y_pred_prob = rfc_model_fs_rfc.predict_proba(X_val_featureSelected_rfc)
roc_auc = roc_auc_score(y_val, y_pred_prob, multi_class='ovr')
print("Random Forest Classifie ROC-AUC: ", roc_auc)

In [None]:
precision_score(y_val, y_pred, average = "weighted")

In [None]:
recall_score(y_val, y_pred, average = "weighted")

In [None]:
# Get feature importance
importances = rfc_model_fs_rfc.feature_importances_

# Print them out
for feature, importance in zip(X_train_featureSelected_rfc.columns, importances):
    print(f"The feature {feature} has an importance of {importance}")

In [None]:
print('Accuracy:', accuracy_score(y_val, y_pred))
print('\nConfusion Matrix:\n', confusion_matrix(y_val, y_pred))
print('\nClassification Report:\n', classification_report(y_val, y_pred))

In [None]:
# plot de ROC CURVE?

In [None]:
# FOR KAGGLE SUBMISSION

test_predictions_rfcmodel_fs_rfc = rfc_model_fs_rfc.predict(testdf_featureSelected_rfc)

In [None]:
# Create a DataFrame with the required columns (e.g., 'Id' and 'lifestyle_type')
rfcmodel_submission_df_fs_rfc = pd.DataFrame({
    "citizen_id": testdf.index,  # Replace "Id" with the actual ID column of your test dataset
    "lifestyle_type": test_predictions_rfcmodel_fs_rfc
})

In [None]:
'''def save_versioned(df, base_filename, directory):
    version = 1
    filename = f"{base_filename}_v{version}.csv"
    full_path = os.path.join(directory, filename)

    while os.path.exists(full_path):
        version += 1
        filename = f"{base_filename}_v{version}.csv"
        full_path = os.path.join(directory, filename)

    df.to_csv(full_path, index=False)
    print(f"Saved to {full_path}")

# Usage example:
save_versioned(rfcmodel_submission_df_fs_rfc, 'rfcmodel_submission_df_nV_fS_rfc', '/content/drive/MyDrive/Data Mining II project/Results')'''

In [None]:
#score : 0,766672

### Hyperparameterization

Knowing that the RFC model with the best performance was the one using the feature selected based logistic regression, so we are using X_train_featureSelected_lr

In [None]:
'''from sklearn.model_selection import RandomizedSearchCV

param_dist = {
    'n_estimators': [200, 250, 300, 350, 400],
    'max_depth': [30, 40, 50, 60],
    'min_samples_split': [3, 4, 5, 6, 7, 8, 9, 10]
}

random_search = RandomizedSearchCV(estimator=RandomForestClassifier(), param_distributions=param_dist, n_iter=50, cv=5, n_jobs=-1)
random_search.fit(X_train_featureSelected_lr, y_train)

print("Best parameters: ", random_search.best_params_)
print("Best score: ", random_search.best_score_)'''

### Model w/ Hyper Parameters

In [None]:
'''rfc_model_fs_lr_hp = RandomForestClassifier(n_estimators=300, min_samples_split=5, max_depth=50)
rfc_model_fs_lr_hp.fit(X_train_featureSelected_lr, y_train)'''

In [None]:
'''# predictions to y_pred, using the method `predict()`.

y_pred = rfc_model_fs_lr_hp.predict(X_val_featureSelected_lr)'''

In [None]:
'''# Confusion Matrix

confusion_matrix(y_val, y_pred)'''

In [None]:
'''# accuracy score for the logistic regression applied on insurance.
# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print("Random Forest Classifier Accuracy: ", accuracy)'''

In [None]:
'''y_pred_prob = rfc_model_fs_lr_hp.predict_proba(X_val_featureSelected_lr)[:, 1]
roc_auc = roc_auc_score(y_val, y_pred_prob, multi_class = 'ovr')
print("Random Forest Classifie ROC-AUC: ", roc_auc)'''

In [None]:
'''precision_score(y_val, y_pred, average = "weighted")'''

In [None]:
'''recall_score(y_val, y_pred, average = "weighted")'''

In [None]:
'''# Get feature importance
importances = rfc_model_fs_lr_hp.feature_importances_

# Print them out
for feature, importance in zip(X_train_featureSelected_lr.columns, importances):
    print(f"The feature {feature} has an importance of {importance}")'''

In [None]:
'''print('Accuracy:', accuracy_score(y_val, y_pred))
print('\nConfusion Matrix:\n', confusion_matrix(y_val, y_pred))
print('\nClassification Report:\n', classification_report(y_val, y_pred))'''

In [None]:
'''fpr, tpr, thresholds = roc_curve(y_val, y_pred_prob)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr, label='Random Forest')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Random Forest ROC Curve')
plt.show()'''

In [None]:
'''# FOR KAGGLE SUBMISSION

test_predictions_rfcmodel_hp_fs_lr = rfc_model_fs_lr_hp.predict(testdf_featureSelected_lr)'''

In [None]:
'''# Create a DataFrame with the required columns (e.g., 'Id' and 'lifestyle_type')
rfcmodel_hp_submission_df_fs_lr = pd.DataFrame({
    "citizen_id": testdf.index,  # Replace "Id" with the actual ID column of your test dataset
    "lifestyle_type": test_predictions_rfcmodel_hp_fs_lr
})'''

In [None]:
'''def save_versioned(df, base_filename, directory):
    version = 1
    filename = f"{base_filename}_v{version}.csv"
    full_path = os.path.join(directory, filename)

    while os.path.exists(full_path):
        version += 1
        filename = f"{base_filename}_v{version}.csv"
        full_path = os.path.join(directory, filename)

    df.to_csv(full_path, index=False)
    print(f"Saved to {full_path}")

# Usage example:
save_versioned(rfcmodel_hp_submission_df_fs_lr, '/content/drive/MyDrive/Data Mining II project/Results/hpp')'''

## 4.3 Ensemble

### FS with LR

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

In [None]:
# Define the three base models
model1 = RandomForestClassifier(n_estimators=300, min_samples_split=5, max_depth=50)
model2 = GradientBoostingClassifier(subsample=0.8, n_estimators=100, min_samples_split=2, max_depth=4, learning_rate=0.1)
model3 = MLPClassifier(solver='adam', learning_rate='adaptive', hidden_layer_sizes=(50, 100, 50), alpha=0.05, activation='tanh')

In [None]:
# Combine the models into an ensemble
ensemble_fS_lr = VotingClassifier(estimators=[('rf', model1), ('gb', model2), ('lr', model3)])

# Fit the ensemble model
ensemble_fS_lr.fit(X_train_featureSelected_lr, y_train)

In [None]:
# Make predictions
y_pred = ensemble_fS_lr.predict(X_val_featureSelected_lr)

In [None]:
# Confusion Matrix

confusion_matrix(y_val, y_pred)

In [None]:
# accuracy score for the logistic regression applied on insurance.
# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print("Ensemble: ", accuracy)

In [None]:
precision_score(y_val, y_pred, average = "weighted")

In [None]:
recall_score(y_val, y_pred, average = "weighted")

In [None]:
print('Accuracy:', accuracy_score(y_val, y_pred))
print('\nConfusion Matrix:\n', confusion_matrix(y_val, y_pred))
print('\nClassification Report:\n', classification_report(y_val, y_pred))

In [None]:
#plot de ROC CURVE?

In [None]:
# FOR KAGGLE SUBMISSION

test_predictions_ensemblemodel_fs_lr = ensemble_fS_lr.predict(testdf_featureSelected_lr)

In [None]:
# Create a DataFrame with the required columns (e.g., 'Id' and 'lifestyle_type')
ensemblemodel_submission_df_fs_lr = pd.DataFrame({
    "citizen_id": testdf.index,  # Replace "Id" with the actual ID column of your test dataset
    "lifestyle_type": test_predictions_ensemblemodel_fs_lr
})

In [None]:
'''def save_versioned(df, base_filename, directory):
    version = 1
    filename = f"{base_filename}_v{version}.csv"
    full_path = os.path.join(directory, filename)

    while os.path.exists(full_path):
        version += 1
        filename = f"{base_filename}_v{version}.csv"
        full_path = os.path.join(directory, filename)

    df.to_csv(full_path, index=False)
    print(f"Saved to {full_path}")

# Usage example:
save_versioned(ensemblemodel_submission_df_fs_lr, 'ensemblemodel_submission_df_fs_lr', '/content/drive/MyDrive/Data Mining II project/Results')'''

In [None]:
#score de 0,77779

### FS with RFC

In [None]:
# Define the three base models
model1 = RandomForestClassifier(n_estimators=300, min_samples_split=5, max_depth=50)
model2 = GradientBoostingClassifier(subsample=0.8, n_estimators=100, min_samples_split=2, max_depth=4, learning_rate=0.1)
model3 = MLPClassifier(solver='adam', learning_rate='adaptive', hidden_layer_sizes=(50, 100, 50), alpha=0.05, activation='tanh')

In [None]:
# Combine the models into an ensemble
ensemble_fS_rfc = VotingClassifier(estimators=[('rf', model1), ('gb', model2), ('lr', model3)])

# Fit the ensemble model
ensemble_fS_rfc.fit(X_train_featureSelected_rfc, y_train)

In [None]:
# Make predictions
y_pred = ensemble_fS_rfc.predict(X_val_featureSelected_rfc)

In [None]:
# Confusion Matrix

confusion_matrix(y_val, y_pred)

In [None]:
# accuracy score for the logistic regression applied on insurance.
# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print("Ensemble: ", accuracy)

In [None]:
precision_score(y_val, y_pred, average = "weighted")

In [None]:
recall_score(y_val, y_pred, average = "weighted")

In [None]:
print('Accuracy:', accuracy_score(y_val, y_pred))
print('Confusion Matrix:\n', confusion_matrix(y_val, y_pred))
print('Classification Report:\n', classification_report(y_val, y_pred))

In [None]:
#PLOT DE ROC CURVE?

In [None]:
# FOR KAGGLE SUBMISSION

test_predictions_ensemblemodel_fs_rfc = ensemble_fS_rfc.predict(testdf_featureSelected_rfc)

In [None]:
# Create a DataFrame with the required columns (e.g., 'Id' and 'lifestyle_type')
ensemblemodel_submission_df_fs_rfc = pd.DataFrame({
    "citizen_id": testdf.index,  # Replace "Id" with the actual ID column of your test dataset
    "lifestyle_type": test_predictions_ensemblemodel_fs_rfc
})

In [None]:
'''def save_versioned(df, base_filename, directory):
    version = 1
    filename = f"{base_filename}_v{version}.csv"
    full_path = os.path.join(directory, filename)

    while os.path.exists(full_path):
        version += 1
        filename = f"{base_filename}_v{version}.csv"
        full_path = os.path.join(directory, filename)

    df.to_csv(full_path, index=False)
    print(f"Saved to {full_path}")

# Usage example:
save_versioned(ensemblemodel_submission_df_fs_rfc, 'ensemblemodel_submission_df_fs_rfc', '/content/drive/MyDrive/Data Mining II project/Results')'''

In [None]:
#score de 0,77710

### Hyperparameterization

In [None]:
'''# Create the VotingClassifier
ensemble_fS_lr = VotingClassifier(estimators=[
    ('rf', model1),
    ('gb', model2),
    ('mlp', model3)
])

# Define the parameter grid
param_dist = {
    'rf__n_estimators': [200, 250, 300, 350, 400],
    'rf__max_depth': [30, 40, 50, 60],
    'rf__min_samples_split': [3, 4, 5, 6, 7, 8, 9, 10],
    'gb__n_estimators': [100, 150, 200, 250],
    'gb__max_depth': [3, 4, 5, 6],
    'gb__learning_rate': [0.01, 0.05, 0.1, 0.2],
    'gb__subsample': [0.7, 0.8, 0.9, 1.0],
    'mlp__hidden_layer_sizes': [(50, 100, 50), (100,)],
    'mlp__activation': ['tanh', 'relu'],
    'mlp__solver': ['adam'],
    'mlp__learning_rate': ['constant', 'adaptive'],
    'mlp__alpha': [0.0001, 0.001, 0.01, 0.05]
}

random_search = RandomizedSearchCV(estimator=ensemble_fS_lr, param_distributions=param_dist, n_iter=50, cv=5, n_jobs=-1, verbose=2, scoring='f1_macro')
random_search.fit(X_train_featureSelected_lr, y_train)
print("Best parameters: ", random_search.best_params_)
print("Best score: ", random_search.best_score_)'''

### Model w/ Hyper Parameters

In [None]:
'''ensemble_model_fs_lr_hp = ensemble_fS_lr(subsample=0.8, n_estimators=100, min_samples_split=2, max_depth=4, learning_rate=0.1)
ensemble_fs_lr_hp.fit(X_train_featureSelected_lr, y_train)'''

In [None]:
'''# predictions to y_pred, using the method `predict()`.

y_pred = ensemble_fs_lr_hp.predict(X_val_featureSelected_lr)'''

In [None]:
'''# Confusion Matrix

confusion_matrix(y_val, y_pred)'''

In [None]:
'''# accuracy score for the logistic regression applied on insurance.
# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print("Ensemble: ", accuracy)'''

In [None]:
'''y_pred_prob = ensemble_fs_lr_hp.predict_proba(X_val_featureSelected_lr)[:, 1]
roc_auc = roc_auc_score(y_val, y_pred_prob, multi_class = 'ovr')
print("Ensemble ROC-AUC: ", roc_auc)'''

In [None]:
'''precision_score(y_val, y_pred, average = "weighted")'''

In [None]:
'''recall_score(y_val, y_pred, average = "weighted")'''

In [None]:
'''# Get feature importance
importances = ensemble_fs_lr_hp.feature_importances_

# Print them out
for feature, importance in zip(X_train_featureSelected_lr.columns, importances):
    print(f"The feature {feature} has an importance of {importance}")'''

In [None]:
'''print('Accuracy:', accuracy_score(y_val, y_pred))
print('\nConfusion Matrix:\n', confusion_matrix(y_val, y_pred))
print('\nClassification Report:\n', classification_report(y_val, y_pred))'''

In [None]:
'''fpr, tpr, thresholds = roc_curve(y_val, y_pred_prob)
plt.plot([0, 1], [0, 1], 'k--')
plt.plot(fpr, tpr, label='VotingClassifier')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Ensemble ROC Curve')
plt.show()'''

In [None]:
'''# FOR KAGGLE SUBMISSION

test_predictions_ensemble_fs_lr_hp = ensemble_fs_lr_hp.predict(testdf_featureSelected_lr)'''

In [None]:
'''# Create a DataFrame with the required columns (e.g., 'Id' and 'lifestyle_type')
ensemblemodel_hp_submission_df_fs_lr = pd.DataFrame({
    "citizen_id": testdf.index,  # Replace "Id" with the actual ID column of your test dataset
    "lifestyle_type": test_predictions_ensemble_fs_lr_hp
})'''

In [None]:
'''def save_versioned(df, base_filename, directory):
    version = 1
    filename = f"{base_filename}_v{version}.csv"
    full_path = os.path.join(directory, filename)

    while os.path.exists(full_path):
        version += 1
        filename = f"{base_filename}_v{version}.csv"
        full_path = os.path.join(directory, filename)

    df.to_csv(full_path, index=False)
    print(f"Saved to {full_path}")

# Usage example:
save_versioned(ensemblemodel_hp_submission_df_fs_lr, '/content/drive/MyDrive/Data Mining II project/Results/hpp')'''

## 4.4 Logistic Regression

### FS with LR

In [None]:
# instance of LogisticRegression named as `log_model` with the default parameters and fit to your train data.

lr_model_fs = LogisticRegression()
lr_model_fs.fit(X_train_featureSelected_lr, y_train)

In [None]:
# predictions to y_pred, using the method `predict()`.

y_pred = lr_model_fs.predict(X_val_featureSelected_lr)

In [None]:
# Confusion Matrix

confusion_matrix(y_val, y_pred)

In [None]:
# accuracy score for the logistic regression applied on insurance.
# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print("Logistic Regression Accuracy: ", accuracy)

In [None]:
y_pred_prob = lr_model_fs.predict_proba(X_val_featureSelected_lr)
roc_auc = roc_auc_score(y_val, y_pred_prob, multi_class='ovr')
print("Logistic Regression ROC-AUC: ", roc_auc)

In [None]:
precision_score(y_val, y_pred, average = "weighted")

In [None]:
recall_score(y_val, y_pred, average = "weighted")

In [None]:
print('Accuracy:', accuracy_score(y_val, y_pred))
print('\nConfusion Matrix:\n', confusion_matrix(y_val, y_pred))
print('\nClassification Report:\n', classification_report(y_val, y_pred))

In [None]:
#plot de ROC CURVE?

In [None]:
# FOR KAGGLE SUBMISSION

test_predictions_lrmode_fs_lr = lr_model_fs.predict(testdf_featureSelected_lr)

In [None]:
# Create a DataFrame with the required columns (e.g., 'Id' and 'lifestyle_type')
lrmodel_submission_df_fs_lr = pd.DataFrame({
    "citizen_id": testdf.index,  # Replace "Id" with the actual ID column of your test dataset
    "lifestyle_type": test_predictions_lrmode_fs_lr
})

In [None]:
'''def save_versioned(df, base_filename, directory):
    version = 1
    filename = f"{base_filename}_v{version}.csv"
    full_path = os.path.join(directory, filename)

    while os.path.exists(full_path):
        version += 1
        filename = f"{base_filename}_v{version}.csv"
        full_path = os.path.join(directory, filename)

    df.to_csv(full_path, index=False)
    print(f"Saved to {full_path}")

# Usage example:
save_versioned(lrmodel_submission_df_fs_lr, 'lrmodel_submission_df_nV_fS_lr', '/content/drive/MyDrive/Data Mining II project/Results')'''

In [None]:
#score: 0,63801

### FS with RFC

In [None]:
# instance of LogisticRegression named as `log_model` with the default parameters and fit to your train data.

lr_model_fs = LogisticRegression()
lr_model_fs.fit(X_train_featureSelected_rfc, y_train)

In [None]:
# predictions to y_pred, using the method `predict()`.

y_pred = lr_model_fs.predict(X_val_featureSelected_rfc)

In [None]:
# Confusion Matrix

confusion_matrix(y_val, y_pred)

In [None]:
# accuracy score for the logistic regression applied on insurance.
# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print("Logistic Regression Accuracy: ", accuracy)

In [None]:
y_pred_prob = lr_model_fs.predict_proba(X_val_featureSelected_rfc)
roc_auc = roc_auc_score(y_val, y_pred_prob, multi_class='ovr')
print("Logistic Regression ROC-AUC: ", roc_auc)

In [None]:
precision_score(y_val, y_pred, average = "weighted")

In [None]:
recall_score(y_val, y_pred, average = "weighted")

In [None]:
print('Accuracy:', accuracy_score(y_val, y_pred))
print('\nConfusion Matrix:\n', confusion_matrix(y_val, y_pred))
print('\nClassification Report:\n', classification_report(y_val, y_pred))

In [None]:
# PLOT DE ROC CURVE?

In [None]:
# FOR KAGGLE SUBMISSION

test_predictions_lrmode_fs_rfc = lr_model_fs.predict(testdf_featureSelected_rfc)

In [None]:
# Create a DataFrame with the required columns (e.g., 'Id' and 'lifestyle_type')
lrmodel_submission_df_fs_rfc = pd.DataFrame({
    "citizen_id": testdf.index,  # Replace "Id" with the actual ID column of your test dataset
    "lifestyle_type": test_predictions_lrmode_fs_rfc
})

In [None]:
'''def save_versioned(df, base_filename, directory):
    version = 1
    filename = f"{base_filename}_v{version}.csv"
    full_path = os.path.join(directory, filename)

    while os.path.exists(full_path):
        version += 1
        filename = f"{base_filename}_v{version}.csv"
        full_path = os.path.join(directory, filename)

    df.to_csv(full_path, index=False)
    print(f"Saved to {full_path}")

# Usage example:
save_versioned(lrmodel_submission_df_fs_rfc, 'lrmodel_submission_df_nV_fS_rfc', '/content/drive/MyDrive/Data Mining II project/Results')'''

In [None]:
#score: 0,63333

## 4.5 Decision trees

### FS with LR

In [None]:
# Instantiate the model
dt_model_lr = DecisionTreeClassifier()

# Train the model
dt_model_lr.fit(X_train_featureSelected_lr, y_train)

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt

# Limit the tree depth for simplicity
dt_model_lr = DecisionTreeClassifier(max_depth=3)
dt_model_lr.fit(X_train_featureSelected_lr, y_train)

# Unique classes and their corresponding class names
class_names = ['Health-Conscious', 'Investor', 'Adventure Seeker', 'Fitness Enthusiast', 'Travel Enthusiast']

# Plot the decision tree
fig, ax = plt.subplots(figsize=(12, 12))  # Adjust size as needed
plot_tree(dt_model_lr,
          feature_names=X_train_featureSelected_lr.columns,
          class_names=class_names,
          filled=True,
          rounded=True,
          ax=ax)
plt.show()

In [None]:
# Make predictions
y_pred = dt_model_lr.predict(X_val_featureSelected_lr)

In [None]:
# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print("Decision Tree Accuracy: ", accuracy)

In [None]:
# Calculate AUC-ROC
y_pred_prob = dt_model_lr.predict_proba(X_val_featureSelected_lr)
roc_auc = roc_auc_score(y_val, y_pred_prob, multi_class='ovr')
print("Decision Tree ROC-AUC: ", roc_auc)

In [None]:
print('Accuracy:', accuracy_score(y_val, y_pred))
print('\nConfusion Matrix:\n', confusion_matrix(y_val, y_pred))
print('\nClassification Report:\n', classification_report(y_val, y_pred))

In [None]:
#PLOT roc curve?

In [None]:
# Predict on the test set for the Kaggle competition
test_predictions_dtmodel_fs_lr = dt_model_lr.predict(testdf_featureSelected_lr)

In [None]:
# Create a DataFrame with the required columns (e.g., 'Id' and 'lifestyle_type')
dtmodel_submission_df_fs_lr = pd.DataFrame({
    "citizen_id": testdf.index,  # Replace "Id" with the actual ID column of your test dataset
    "lifestyle_type": test_predictions_dtmodel_fs_lr
})

In [None]:
'''def save_versioned(df, base_filename, directory):
    version = 1
    filename = f"{base_filename}_v{version}.csv"
    full_path = os.path.join(directory, filename)

    while os.path.exists(full_path):
        version += 1
        filename = f"{base_filename}_v{version}.csv"
        full_path = os.path.join(directory, filename)

    df.to_csv(full_path, index=False)
    print(f"Saved to {full_path}")

# Usage example:
save_versioned(dtmodel_submission_df_fs_lr, 'decisiontrees_submission_df_nV_fS_lr', '/content/drive/MyDrive/Data Mining II project/Results')'''

In [None]:
#score: 0,61279

### FS with RFC

In [None]:
# Instantiate the model
dt_model_rfc = DecisionTreeClassifier()

# Train the model
dt_model_rfc.fit(X_train_featureSelected_rfc, y_train)

In [None]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt

# Limit the tree depth for simplicity
dt_model_rfc = DecisionTreeClassifier(max_depth=3)
dt_model_rfc.fit(X_train_featureSelected_rfc, y_train)

# Unique classes and their corresponding class names
class_names = ['Health-Conscious', 'Investor', 'Adventure Seeker', 'Fitness Enthusiast', 'Travel Enthusiast']

# Plot the decision tree
fig, ax = plt.subplots(figsize=(12, 12))  # Adjust size as needed
plot_tree(dt_model_rfc,
          feature_names=X_train_featureSelected_rfc.columns,
          class_names=class_names,
          filled=True,
          rounded=True,
          ax=ax)
plt.show()

In [None]:
# Make predictions
y_pred = dt_model_rfc.predict(X_val_featureSelected_rfc)

In [None]:
# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print("Decision Tree Accuracy: ", accuracy)

In [None]:
# Calculate AUC-ROC
y_pred_prob = dt_model_rfc.predict_proba(X_val_featureSelected_rfc)
roc_auc = roc_auc_score(y_val, y_pred_prob, multi_class='ovr')
print("Decision Tree ROC-AUC: ", roc_auc)

In [None]:
print('Accuracy:', accuracy_score(y_val, y_pred))
print('\nConfusion Matrix:\n', confusion_matrix(y_val, y_pred))
print('\nClassification Report:\n', classification_report(y_val, y_pred))

In [None]:
#plot roc curve?

In [None]:
# Predict on the test set for the Kaggle competition
test_predictions_dtmodel_fs_rfc = dt_model_rfc.predict(testdf_featureSelected_rfc)

In [None]:
# Create a DataFrame with the required columns (e.g., 'Id' and 'lifestyle_type')
dtmodel_submission_df_fs_rfc = pd.DataFrame({
    "citizen_id": testdf.index,  # Replace "Id" with the actual ID column of your test dataset
    "lifestyle_type": test_predictions_dtmodel_fs_rfc
    })

In [None]:
'''def save_versioned(df, base_filename, directory):
    version = 1
    filename = f"{base_filename}_v{version}.csv"
    full_path = os.path.join(directory, filename)

    while os.path.exists(full_path):
        version += 1
        filename = f"{base_filename}_v{version}.csv"
        full_path = os.path.join(directory, filename)

    df.to_csv(full_path, index=False)
    print(f"Saved to {full_path}")

# Usage example:
save_versioned(dtmodel_submission_df_fs_rfc, 'decisiontrees_submission_df_nV_fS_rfc', '/content/drive/MyDrive/Data Mining II project/Results')'''

In [None]:
#score: 0,61279

## 4.6 Neural Network

### FS with LR

In [None]:
from sklearn.neural_network import MLPClassifier

In [None]:
mlp_model_fs_lr = MLPClassifier()
mlp_model_fs_lr.fit(X_train_featureSelected_lr, y_train)

In [None]:
# predictions to y_pred, using the method `predict()`.

y_pred = mlp_model_fs_lr.predict(X_val_featureSelected_lr)

In [None]:
# Confusion Matrix

confusion_matrix(y_val, y_pred)

In [None]:
# accuracy score for the Multi Layer Perceptron applied on insurance.
# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print("Multi-Layer Perceptron Accuracy: ", accuracy)

In [None]:
y_pred_prob = mlp_model_fs_lr.predict_proba(X_val_featureSelected_lr)
roc_auc = roc_auc_score(y_val, y_pred_prob, multi_class='ovr')
print("Multi layer Perceptron ROC-AUC: ", roc_auc)

In [None]:
precision_score(y_val, y_pred, average = "weighted")

In [None]:
recall_score(y_val, y_pred, average = "weighted")

In [None]:
print('Accuracy:', accuracy_score(y_val, y_pred))
print('\nConfusion Matrix:\n', confusion_matrix(y_val, y_pred))
print('\nClassification Report:\n', classification_report(y_val, y_pred))

In [None]:
# FOR KAGGLE SUBMISSION

test_predictions_mlpmodel_fs_lr = mlp_model_fs_lr.predict(testdf_featureSelected_lr)

In [None]:
# Create a DataFrame with the required columns (e.g., 'Id' and 'lifestyle_type')
mlpmodel_submission_df_fs_lr = pd.DataFrame({
    "citizen_id": testdf.index,  # Replace "Id" with the actual ID column of your test dataset
    "lifestyle_type": test_predictions_mlpmodel_fs_lr
})

In [None]:
'''def save_versioned(df, base_filename, directory):
    version = 1
    filename = f"{base_filename}_v{version}.csv"
    full_path = os.path.join(directory, filename)

    while os.path.exists(full_path):
        version += 1
        filename = f"{base_filename}_v{version}.csv"
        full_path = os.path.join(directory, filename)

    df.to_csv(full_path, index=False)
    print(f"Saved to {full_path}")

# Usage example:
save_versioned(mlpmodel_submission_df_fs_lr, 'mlpmodel_submission_df_fs_lr', '/content/drive/MyDrive/Data Mining II project/Results')'''

In [None]:
# score 0,7699

### FS with RFC

In [None]:
mlp_model_fs_rfc = MLPClassifier()
mlp_model_fs_rfc.fit(X_train_featureSelected_rfc, y_train)

In [None]:
# predictions to y_pred, using the method `predict()`.

y_pred = mlp_model_fs_rfc.predict(X_val_featureSelected_rfc)

In [None]:
# Confusion Matrix

confusion_matrix(y_val, y_pred)

In [None]:
# accuracy score for the Multi Layer Perceptron applied on insurance.
# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print("Multi-Layer Perceptron Accuracy: ", accuracy)

In [None]:
y_pred_prob = mlp_model_fs_rfc.predict_proba(X_val_featureSelected_rfc)
roc_auc = roc_auc_score(y_val, y_pred_prob, multi_class='ovr')
print("Multi layer Perceptron ROC-AUC: ", roc_auc)

In [None]:
precision_score(y_val, y_pred, average = "weighted")

In [None]:
recall_score(y_val, y_pred, average = "weighted")

In [None]:
print('Accuracy:', accuracy_score(y_val, y_pred))
print('\nConfusion Matrix:\n', confusion_matrix(y_val, y_pred))
print('\nClassification Report:\n', classification_report(y_val, y_pred))

In [None]:
# FOR KAGGLE SUBMISSION

test_predictions_mlpmodel_fs_rfc = mlp_model_fs_rfc.predict(testdf_featureSelected_rfc)

In [None]:
# Create a DataFrame with the required columns (e.g., 'Id' and 'lifestyle_type')
mlpmodel_submission_df_fs_rfc = pd.DataFrame({
    "citizen_id": testdf.index,  # Replace "Id" with the actual ID column of your test dataset
    "lifestyle_type": test_predictions_mlpmodel_fs_rfc
})

In [None]:
'''def save_versioned(df, base_filename, directory):
    version = 1
    filename = f"{base_filename}_v{version}.csv"
    full_path = os.path.join(directory, filename)

    while os.path.exists(full_path):
        version += 1
        filename = f"{base_filename}_v{version}.csv"
        full_path = os.path.join(directory, filename)

    df.to_csv(full_path, index=False)
    print(f"Saved to {full_path}")

# Usage example:
save_versioned(mlpmodel_submission_df_fs_rfc, 'mlpmodel_submission_df_fs_rfc', '/content/drive/MyDrive/Data Mining II project/Results')'''

In [None]:
# score 0,757

## 4.7 Gradient Boosted Classifier

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

### FS with LR

In [None]:
gbc_model_fs_lr = GradientBoostingClassifier()
gbc_model_fs_lr.fit(X_train_featureSelected_lr, y_train)

In [None]:
# predictions to y_pred, using the method `predict()`.

y_pred = gbc_model_fs_lr.predict(X_val_featureSelected_lr)

In [None]:
# Confusion Matrix

confusion_matrix(y_val, y_pred)

In [None]:
# accuracy score for the logistic regression applied on insurance.
# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print("Gradient Boosting Classifier Accuracy: ", accuracy)

In [None]:
y_pred_prob = gbc_model_fs_lr.predict_proba(X_val_featureSelected_lr)
roc_auc = roc_auc_score(y_val, y_pred_prob, multi_class='ovr')
print("Gradient Boosting Classifier ROC-AUC: ", roc_auc)

In [None]:
precision_score(y_val, y_pred, average = "weighted")

In [None]:
recall_score(y_val, y_pred, average = "weighted")

In [None]:
# Get feature importance
importances = gbc_model_fs_lr.feature_importances_

# Print them out
for feature, importance in zip(X_train_featureSelected_lr.columns, importances):
    print(f"The feature {feature} has an importance of {importance}")

In [None]:
print('Accuracy:', accuracy_score(y_val, y_pred))
print('\nConfusion Matrix:\n', confusion_matrix(y_val, y_pred))
print('\nClassification Report:\n', classification_report(y_val, y_pred))

In [None]:
#Plot de ROC CURVE?

In [None]:
# FOR KAGGLE SUBMISSION

test_predictions_gbcmodel_fs_lr = gbc_model_fs_lr.predict(testdf_featureSelected_lr)

In [None]:
# Create a DataFrame with the required columns (e.g., 'Id' and 'lifestyle_type')
gbcmodel_submission_df_fs_lr = pd.DataFrame({
    "citizen_id": testdf.index,  # Replace "Id" with the actual ID column of your test dataset
    "lifestyle_type": test_predictions_gbcmodel_fs_lr
})

In [None]:
'''def save_versioned(df, base_filename, directory):
    version = 1
    filename = f"{base_filename}_v{version}.csv"
    full_path = os.path.join(directory, filename)

    while os.path.exists(full_path):
        version += 1
        filename = f"{base_filename}_v{version}.csv"
        full_path = os.path.join(directory, filename)

    df.to_csv(full_path, index=False)
    print(f"Saved to {full_path}")

# Usage example:
save_versioned(gbcmodel_submission_df_fs_lr, 'gbcmodel_submission_df_nV_fS_lr', '/content/drive/MyDrive/Data Mining II project/Results')'''

In [None]:
#score : 0,61279

### FS with RFC

In [None]:
gbc_model_fs_rfc = GradientBoostingClassifier()
gbc_model_fs_rfc.fit(X_train_featureSelected_rfc, y_train)

In [None]:
# predictions to y_pred, using the method `predict()`.

y_pred = gbc_model_fs_rfc.predict(X_val_featureSelected_rfc)

In [None]:
# Confusion Matrix

confusion_matrix(y_val, y_pred)

In [None]:
# accuracy score for the logistic regression applied on insurance.
# Calculate accuracy
accuracy = accuracy_score(y_val, y_pred)
print("Gradient Boosting Classifier Accuracy: ", accuracy)

In [None]:
y_pred_prob = gbc_model_fs_rfc.predict_proba(X_val_featureSelected_rfc)
roc_auc = roc_auc_score(y_val, y_pred_prob, multi_class='ovr')
print("Gradient Boosting Classifier ROC-AUC: ", roc_auc)

In [None]:
precision_score(y_val, y_pred, average = "weighted")

In [None]:
recall_score(y_val, y_pred, average = "weighted")

In [None]:
# Get feature importance
importances = gbc_model_fs_rfc.feature_importances_

# Print them out
for feature, importance in zip(X_train_featureSelected_rfc.columns, importances):
    print(f"The feature {feature} has an importance of {importance}")

In [None]:
print('Accuracy:', accuracy_score(y_val, y_pred))
print('\nConfusion Matrix:\n', confusion_matrix(y_val, y_pred))
print('\nClassification Report:\n', classification_report(y_val, y_pred))

In [None]:
#plot de roc curve?

In [None]:
# FOR KAGGLE SUBMISSION

test_predictions_gbcmodel_fs_rfc = gbc_model_fs_rfc.predict(testdf_featureSelected_rfc)

In [None]:
# Create a DataFrame with the required columns (e.g., 'Id' and 'lifestyle_type')
gbcmodel_submission_df_fs_rfc = pd.DataFrame({
    "citizen_id": testdf.index,  # Replace "Id" with the actual ID column of your test dataset
    "lifestyle_type": test_predictions_gbcmodel_fs_rfc
})

In [None]:
'''def save_versioned(df, base_filename, directory):
    version = 1
    filename = f"{base_filename}_v{version}.csv"
    full_path = os.path.join(directory, filename)

    while os.path.exists(full_path):
        version += 1
        filename = f"{base_filename}_v{version}.csv"
        full_path = os.path.join(directory, filename)

    df.to_csv(full_path, index=False)
    print(f"Saved to {full_path}")

# Usage example:
save_versioned(gbcmodel_submission_df_fs_rfc, 'gbcmodel_submission_df_nV_fS_rfc', '/content/drive/MyDrive/Data Mining II project/Results')'''

In [None]:
#score: 0,61279