# Import Libraries

In [2]:
import pandas as pd
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error, r2_score
import math
import pickle
from pickle import dump
from sklearn.feature_selection import f_regression, SelectKBest
import warnings
from sklearn.exceptions import DataConversionWarning
from sklearn.model_selection import GridSearchCV

# Import Data

In [3]:
y_train = pd.read_csv('/workspaces/EDA_4/data/processed/heart_prevalence_y_train.csv')
y_test = pd.read_csv('/workspaces/EDA_4/data/processed/heart_prevalence_y_test.csv')
X_train_norm = pd.read_csv('/workspaces/EDA_4/data/interim/heart_prevalence_X_train_std.csv')
X_test_norm = pd.read_csv('/workspaces/EDA_4/data/interim/heart_prevalence_X_test_std.csv')
X_train = pd.read_csv('/workspaces/EDA_4/data/processed/heart_prevalence_X_train.csv')
X_test = pd.read_csv('/workspaces/EDA_4/data/processed/heart_prevalence_X_test.csv')


In [4]:
# Ignore specific data conversion warnings
warnings.filterwarnings("ignore", category=DataConversionWarning)

# Model Training

As I embark on the model training phase, I've chosen to include both normalized and non-normalized datasets this time. The purpose is to scrutinize how data normalization impacts the model's performance and make comparisons between the outcomes under different preprocessing conditions.

Following the methodology established in the previous script, I'll use a systematic loop to identify the optimal number of variables for our model. This step is crucial, as it aims to strike a balance between the model's complexity and its ability to generalize, ultimately contributing to a more effective and robust predictive tool.

Once we determine the optimal configuration, the next step involves evaluating the model's performance metrics, with a specific focus on the mean squared error and the coefficient of determination. This evaluation aims to highlight the subtle differences between the training and testing datasets, serving as a crucial indicator of the model's ability to generalize well to unseen data.

By incorporating both normalized and non-normalized datasets, my goal is to gain nuanced insights into how preprocessing decisions influence the model's performance. This comprehensive exploration aligns with best practices in model development and will play a pivotal role in guiding decisions for model selection and fine-tuning in the subsequent phases.

In [5]:
# Lists to store root mean squared errors (rmss) and R-squared values (r2s)
rmss = []
r2s = []

# Percentage values for feature selection
percents = [1, 0.8, 0.7, 0.6, 0.5]

# Iterate over different percentages for feature selection
for p in percents:
    
    # SelectKBest with f_regression for feature selection
    selection_model = SelectKBest(f_regression, k=int(len(X_train_norm.columns) * p))
    selection_model.fit(X_train_norm, y_train)
    ix = selection_model.get_support()

    # Transform datasets based on selected features
    X_train_sel = pd.DataFrame(selection_model.transform(X_train_norm), columns=X_train_norm.columns.values[ix])
    X_test_sel = pd.DataFrame(selection_model.transform(X_test_norm), columns=X_test_norm.columns.values[ix])

    # Save the selection model for potential future use
    dump(selection_model, open(f"/workspaces/EDA_4/models/selection_model_ridge{p}.pk", "wb"))

    # Ridge Regression model
    ridge_model = Ridge(alpha=0.1, max_iter=300)
    ridge_model.fit(X_train_sel, y_train)

    # Predictions on the training set
    y_pred = ridge_model.predict(X_train_sel)
    
    # Calculate and store root mean squared error (rmse) and R-squared value (r2)
    rmss.append(math.sqrt(mean_squared_error(y_train, y_pred)))
    r2s.append(r2_score(y_train, y_pred))

# Find the index of the best model based on maximum rmse and r2 values
best_rmss = rmss.index(max(rmss))
best_r2s = r2s.index(max(r2s))

# Print the evaluation metrics for the best model
print(f"Root Mean Squared Error: {rmss[best_rmss]}")
print(f"Coefficient of Determination (R-squared): {r2s[best_r2s]}")
print(f"Best model is selected with {percents[best_rmss]} of features")

# Load the selection model of the best-performing percentage
selection_model = pickle.load(open(f"/workspaces/EDA_4/models/selection_model_ridge0.5.pk", "rb"))
ix = selection_model.get_support()

# Transform datasets based on selected features
X_train_sel = pd.DataFrame(selection_model.transform(X_train_norm), columns=X_train_norm.columns.values[ix])
X_test_sel = pd.DataFrame(selection_model.transform(X_test_norm), columns=X_test_norm.columns.values[ix])

# Create a Ridge Regression model
model = Ridge(alpha=0.1, max_iter=300)
model.fit(X_train_sel, y_train)

# Display model coefficients and intercept
print('-' * 35)
print(f"Intercept (a): {model.intercept_}")
print(f"Coefficients (b): {model.coef_}")
print('-' * 35)

# Evaluate the model on the training set
y_pred = model.predict(X_train_sel)
e1 = math.sqrt(mean_squared_error(y_train, y_pred))
r1 = r2_score(y_train, y_pred)
print(f"Root Mean Squared Error for Training set: {e1}")
print(f"Coefficient of Determination for Training set: {r1}")
print('-' * 35)

# Evaluate the model on the testing set
y_pred = model.predict(X_test_sel)
e2 = math.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f"Root Mean Squared Error for Testing set: {e2}")
print(f"Coefficient of Determination for Testing set: {r2}")
print('-' * 35)

# Print the difference between training and testing performance metrics
print(f"The difference between Train and Test in Root Mean Squared Error is {e1 - e2}.")
print(f"The difference between Train and Test in Coefficient of Determination is {r1 - r2}.")



Root Mean Squared Error: 0.27978595682405133
Coefficient of Determination (R-squared): 0.9574473435853107
Best model is selected with 0.5 of features
-----------------------------------
Intercept (a): [8.37149865]
Coefficients (b): [[-0.16040501  0.68951099 -0.24149325  0.54356876 -0.57176282  0.54451463]]
-----------------------------------
Root Mean Squared Error for Training set: 0.27978595682405133
Coefficient of Determination for Training set: 0.9338943689562693
-----------------------------------
Root Mean Squared Error for Testing set: 0.5216644287205902
Coefficient of Determination for Testing set: 0.8706226686953741
-----------------------------------
The difference between Train and Test in Root Mean Squared Error is -0.24187847189653883.
The difference between Train and Test in Coefficient of Determination is 0.06327170026089524.


In [6]:
# Lists to store root mean squared errors (rmss) and R-squared values (r2s)
rmss = []
r2s = []

# Percentage values for feature selection
percents = [1, 0.8, 0.7, 0.6, 0.5]

# Iterate over different percentages for feature selection
for p in percents:
    
    # SelectKBest with f_regression for feature selection
    selection_model = SelectKBest(f_regression, k=int(len(X_train.columns) * p))
    selection_model.fit(X_train, y_train)
    ix = selection_model.get_support()

    # Transform datasets based on selected features
    X_train_sel = pd.DataFrame(selection_model.transform(X_train), columns=X_train.columns.values[ix])
    X_test_sel = pd.DataFrame(selection_model.transform(X_test), columns=X_test.columns.values[ix])

    # Save the selection model for potential future use
    dump(selection_model, open(f"/workspaces/EDA_4/models/selection_model_ridge{p}.pk", "wb"))

    # Ridge Regression model
    ridge_model = Ridge(alpha=0.1, max_iter=300)
    ridge_model.fit(X_train_sel, y_train)

    # Predictions on the training set
    y_pred = ridge_model.predict(X_train_sel)
    
    # Calculate and store root mean squared error (rmse) and R-squared value (r2)
    rmss.append(math.sqrt(mean_squared_error(y_train, y_pred)))
    r2s.append(r2_score(y_train, y_pred))

# Find the index of the best model based on maximum rmse and r2 values
best_rmss = rmss.index(max(rmss))
best_r2s = r2s.index(max(r2s))

# Print the evaluation metrics for the best model
print(f"Root Mean Squared Error: {rmss[best_rmss]}")
print(f"Coefficient of Determination (R-squared): {r2s[best_r2s]}")
print(f"Best model is selected with {percents[best_rmss]} of features")

# Load the selection model of the best-performing percentage
selection_model = pickle.load(open(f"/workspaces/EDA_4/models/selection_model_ridge0.5.pk", "rb"))
ix = selection_model.get_support()

# Transform datasets based on selected features
X_train_sel = pd.DataFrame(selection_model.transform(X_train), columns=X_train.columns.values[ix])
X_test_sel = pd.DataFrame(selection_model.transform(X_test), columns=X_test.columns.values[ix])

# Create a Ridge Regression model
model = Ridge(alpha=0.1, max_iter=300)
model.fit(X_train_sel, y_train)

# Display model coefficients and intercept
print('-' * 35)
print(f"Intercept (a): {model.intercept_}")
print(f"Coefficients (b): {model.coef_}")
print('-' * 35)

# Evaluate the model on the training set
y_pred = model.predict(X_train_sel)
e1 = math.sqrt(mean_squared_error(y_train, y_pred))
r1 = r2_score(y_train, y_pred)
print(f"Root Mean Squared Error for Training set: {e1}")
print(f"Coefficient of Determination for Training set: {r1}")
print('-' * 35)

# Evaluate the model on the testing set
y_pred = model.predict(X_test_sel)
e2 = math.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f"Root Mean Squared Error for Testing set: {e2}")
print(f"Coefficient of Determination for Testing set: {r2}")
print('-' * 35)

# Print the difference between training and testing performance metrics
print(f"The difference between Train and Test in Root Mean Squared Error is {e1 - e2}.")
print(f"The difference between Train and Test in Coefficient of Determination is {r1 - r2}.")

Root Mean Squared Error: 0.2825484423493222
Coefficient of Determination (R-squared): 0.9574454972798395
Best model is selected with 0.5 of features
-----------------------------------
Intercept (a): [2.63161161]
Coefficients (b): [[-2.84166300e-05  1.63809491e-01 -9.72199961e-02  2.61134043e-01
  -2.67989164e-01  1.25618512e+00]]
-----------------------------------
Root Mean Squared Error for Training set: 0.2825484423493222
Coefficient of Determination for Training set: 0.9325825276763562
-----------------------------------
Root Mean Squared Error for Testing set: 0.5105536241453681
Coefficient of Determination for Testing set: 0.876075131352427
-----------------------------------
The difference between Train and Test in Root Mean Squared Error is -0.2280051817960459.
The difference between Train and Test in Coefficient of Determination is 0.056507396323929204.


After conducting both training sessions, it becomes evident that the model trained with non-normalized data exhibits a smaller difference between the training and testing datasets. Consequently, it displays less overfitting and higher overall quality. I opt for this model, save its configuration, and proceed to fine-tune its hyperparameters for optimization in the subsequent steps.







In [7]:
# Load the selection model of the best-performing percentage
selection_model = pickle.load(open(f"/workspaces/EDA_4/models/selection_model_ridge0.5.pk", "rb"))
ix = selection_model.get_support()

# Transform datasets based on selected features
X_train_sel = pd.DataFrame(selection_model.transform(X_train), columns=X_train.columns.values[ix])
X_test_sel = pd.DataFrame(selection_model.transform(X_test), columns=X_test.columns.values[ix])

# Create a Ridge Regression model
model = Ridge(alpha=0.1, max_iter=300)
best_model = model.fit(X_train_sel, y_train)

# Save the best model
dump(best_model, open(f"/workspaces/EDA_4/models/best_model_ridge.pk", "wb"))

# Optimization

As we approach the final phase of the project, our objective is to fine-tune the model's hyperparameters, aiming to enhance its overall effectiveness while mitigating overfitting. To accomplish this, we'll load the previously selected model, and through a systematic exploration of hyperparameters using GridSearch, we intend to identify the optimal combination that maximizes performance.

Utilizing a dictionary, we'll specify the hyperparameters to be optimized, and the GridSearch algorithm from sklearn will systematically search through the provided parameter grid to find the configuration that yields the best results. It's worth noting that, given the relatively small size of our dataset, the computational overhead of GridSearch is manageable, and the process is expected to conclude efficiently.

This meticulous hyperparameter tuning step is crucial for refining the model's predictive capabilities and ensuring that it generalizes well to unseen data. The goal is to strike a balance that maximizes performance without compromising the model's ability to adapt to new information. The outcome of this optimization process will mark the culmination of our efforts, providing us with a well-tailored and effective predictive model for the given dataset.

In [8]:
# Load the selection model of the best-performing percentage
selection_model = pickle.load(open(f"/workspaces/EDA_4/models/selection_model_ridge0.5.pk", "rb"))
ix = selection_model.get_support()

# Transform datasets based on selected features
X_train_sel = pd.DataFrame(selection_model.transform(X_train), columns=X_train.columns.values[ix])
X_test_sel = pd.DataFrame(selection_model.transform(X_test), columns=X_test.columns.values[ix])

In [9]:
# Dictionary for Ridge Model hyperparameters
hyper = {
    'alpha': [1e-3, 1e-2, 1e-1, 1.0, 10.0],            # Regularization parameter
    'fit_intercept': [True, False],                     # Whether to fit the intercept                        
    'copy_X': [True, False],                            # Whether to make a copy of X
    'max_iter': [None, 100, 500, 1000],                 # Maximum number of iterations
    'tol': [1e-4, 1e-3, 1e-2],                          # Tolerance for convergence
    'solver': ['auto', 'svd', 'cholesky', 'lsqr'],     # Solver method
    'random_state': [None, 42, 100],                    # Seed for random number generation
}

# Store the model into a variable
ridge_regression = Ridge(random_state=24)

# Hyperparameter tuning using Grid Search
grid = GridSearchCV(ridge_regression, hyper, cv = 5)
grid.fit(X_train_sel, y_train)


In [10]:
# Get the best hyperparameters from the grid search results
best_hyper = grid.best_params_

# Create an optimized Ridge Regression model using the best hyperparameters
opt_model = Ridge(**best_hyper)
opt_model.fit(X_train_sel, y_train)

# Display model coefficients and intercept
print('-' * 35)
print(f"Intercept (a): {opt_model.intercept_}")
print(f"Coefficients (b): {opt_model.coef_}")
print('-' * 35)

# Evaluate the model on the training set
y_pred = opt_model.predict(X_train_sel)
e1 = math.sqrt(mean_squared_error(y_train, y_pred))
r1 = r2_score(y_train, y_pred)
print(f"Root Mean Squared Error for Training set: {e1}")
print(f"Coefficient of Determination for Training set: {r1}")
print('-' * 35)

# Evaluate the model on the testing set
y_pred = opt_model.predict(X_test_sel)
e2 = math.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f"Root Mean Squared Error for Testing set: {e2}")
print(f"Coefficient of Determination for Testing set: {r2}")
print('-' * 35)

# Print the difference between training and testing performance metrics
print(f"The difference between Train and Test in Root Mean Squared Error is {e1 - e2}.")
print(f"The difference between Train and Test in Coefficient of Determination is {r1 - r2}.")

-----------------------------------
Intercept (a): [5.88958773]
Coefficients (b): [[-4.56273848e-05  1.88729360e-01 -1.32780122e-01  1.97686126e-01
  -1.53890686e-01  3.30592230e-01]]
-----------------------------------
Root Mean Squared Error for Training set: 0.3016723228171599
Coefficient of Determination for Training set: 0.9231475768642361
-----------------------------------
Root Mean Squared Error for Testing set: 0.5053895283499601
Coefficient of Determination for Testing set: 0.8785693782344194
-----------------------------------
The difference between Train and Test in Root Mean Squared Error is -0.2037172055328002.
The difference between Train and Test in Coefficient of Determination is 0.04457819862981671.


In [11]:
# Save the absolute best and optimized model :)
dump(opt_model, open(f"/workspaces/EDA_4/models/opt_model_ridge.pk", "wb"))

# Conclusion (final)

After a series of deliberations and strategic decisions, we have successfully further reduced the overfitting to an impressive 0.045. Considering the practical goals of the project, we find this outcome satisfactory. Importantly, each decision and step taken throughout the process has been geared towards progressively minimizing the gap between the training and testing datasets. This continuous refinement has contributed significantly to the enhancement of our model.

In summary, we explored two models, linear regression and Ridge, with the latter providing a modest yet notable improvement in addressing the overfitting issue. Subsequently, by introducing non-normalized data into this Ridge model, we achieved even better results. As a result, we have chosen to proceed with the non-normalized Ridge model for the final optimization phase.

In this optimization phase, we fine-tuned the hyperparameters, leading to a further enhancement of results. The overall progress from the initial model training in the previous script to the latest phase reflects a significant improvement. We are content with the achieved outcome and, for the time being, consider the project concluded.