# Import Libraries

In [1]:
import pandas as pd
from sklearn.linear_model import LinearRegression
import pickle
from pickle import dump
from sklearn.metrics import mean_squared_error, r2_score
import math
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
from sklearn.feature_selection import f_regression, SelectKBest
import warnings
from sklearn.exceptions import DataConversionWarning


# Import Data

In [2]:
# Reading the labels from the CSV file
y_train = pd.read_csv('/workspaces/EDA_4/data/processed/heart_prevalence_y_train.csv')
y_test = pd.read_csv('/workspaces/EDA_4/data/processed/heart_prevalence_y_test.csv')
X_train_norm = pd.read_csv('/workspaces/EDA_4/data/interim/heart_prevalence_X_train_std.csv')
X_test_norm = pd.read_csv('/workspaces/EDA_4/data/interim/heart_prevalence_X_test_std.csv')

In [3]:
# Ignore specific data conversion warnings
warnings.filterwarnings("ignore", category=DataConversionWarning)

# Best Model Search

The primary aim of this phase in the project is to leverage the capabilities of the linear regression model offered by the sklearn library. Our choice is substantiated by the insights gained during the Exploratory Data Analysis (EDA), which highlighted several variables exhibiting a distribution pattern aligning with the assumptions of linear regression in relation to our target variable.

To optimize the model's effectiveness, a systematic approach will be adopted. We will employ a loop to iteratively determine the optimal value of 'k,' representing the number of variables the model should incorporate during the training process. This thoughtful selection process is crucial for enhancing the model's predictive accuracy and avoiding overfitting or underfitting scenarios.

Once we ascertain the best configuration, we will persistently store it for future reference. Subsequently, we will reload this configuration and proceed with the model training phase. This training will involve comparing the model's predictions with the actual results from the test dataset, serving as a robust validation step to gauge the model's generalization capabilities.

In essence, this meticulous procedure aims not only to identify the most effective model configuration but also to ensure its reliability and performance on unseen data. This iterative loop provides a systematic framework to fine-tune our model, fostering a more nuanced understanding of its predictive prowess.

In [4]:
# Ignore specific data conversion warnings
rmss = []  # Root Mean Squared Errors
r2s = []   # R-squared values
percents = [1, 0.8, 0.7, 0.6, 0.5]

# Iterate over different percentages of selected features
for p in percents:
    # Select top features using SelectKBest with f_regression
    selection_model = SelectKBest(f_regression, k=int(len(X_train_norm.columns) * p))
    selection_model.fit(X_train_norm, y_train)
    ix = selection_model.get_support()

    # Transform datasets using selected features
    X_train_sel = pd.DataFrame(selection_model.transform(X_train_norm), columns=X_train_norm.columns.values[ix])
    X_test_sel = pd.DataFrame(selection_model.transform(X_test_norm), columns=X_test_norm.columns.values[ix])

    # Save the selection model
    dump(selection_model, open(f"/workspaces/EDA_4/models/selection_model{p}.pk", "wb"))

    # Train linear regression model with selected features
    model = LinearRegression()
    model.fit(X_train_sel, y_train)
    y_pred = model.predict(X_train_sel)

    # Evaluate and store performance metrics
    rmss.append(math.sqrt(mean_squared_error(y_train, y_pred)))
    r2s.append(r2_score(y_train, y_pred))

# Find the index of the best models based on RMSE and R-squared
best_rmss = rmss.index(max(rmss))
best_r2s = r2s.index(max(r2s))

# Print the results
print(f"Root Mean Squared Error: {rmss[best_rmss]}")
print(f"Coefficient of Determination (R-squared): {r2s[best_r2s]}")
print(f"The best model corresponds to {percents[best_rmss]} of selected features.")


Root Mean Squared Error: 0.2794007435544796
Coefficient of Determination (R-squared): 0.9579028289114827
The best model corresponds to 0.5 of selected features.


# Model Training

In [5]:
# Load the pre-trained feature selection model
selection_model = pickle.load(open("/workspaces/EDA_4/models/selection_model0.5.pk", "rb"))
ix = selection_model.get_support()

# Transform datasets using selected features
X_train_sel = pd.DataFrame(selection_model.transform(X_train_norm), columns=X_train_norm.columns.values[ix])
X_test_sel = pd.DataFrame(selection_model.transform(X_test_norm), columns=X_test_norm.columns.values[ix])

# Train a linear regression model with the selected features
model = LinearRegression()
model.fit(X_train_sel, y_train)

# Print the intercept and coefficients of the linear regression model
print(f"Intercept (a): {model.intercept_}")
print(f"Coefficients (b): {model.coef_}")
print("-" * 32)

# Predict and evaluate on the training set
y_pred = model.predict(X_train_sel)
e1 = math.sqrt(mean_squared_error(y_train, y_pred))
r1 = r2_score(y_train, y_pred)
print(f"Root Mean Squared Error on Train: {e1}")
print(f"Coefficient of Determination on Train: {r1}")
print("-" * 32)

# Predict and evaluate on the testing set
y_pred = model.predict(X_test_sel)
e2 = math.sqrt(mean_squared_error(y_test, y_pred))
r2 = r2_score(y_test, y_pred)
print(f"Root Mean Squared Error on Test: {e2}")
print(f"Coefficient of Determination on Test: {r2}")
print("-" * 32)

# Print the difference between training and testing performance metrics
print(f"The difference between Train and Test in Root Mean Squared Error is {e1 - e2}.")
print(f"The difference between Train and Test in Coefficient of Determination is {r1 - r2}.")



Intercept (a): [8.37149865]
Coefficients (b): [[-0.15549461  0.75459943 -0.27330536  0.53737403 -0.61739438  0.56593799]]
--------------------------------
Root Mean Squared Error on Train: 0.2794007435544796
Coefficient of Determination on Train: 0.9340762739842043
--------------------------------
Root Mean Squared Error on Test: 0.5255850187397291
Coefficient of Determination on Test: 0.8686706799451416
--------------------------------
The difference between Train and Test in Root Mean Squared Error is -0.2461842751852495.
The difference between Train and Test in Coefficient of Determination is 0.06540559403906276.


# Conlusion

Upon completing this initial phase of model training, my foremost conclusion is the presence of overfitting. Consequently, it would be prudent to revisit both the Exploratory Data Analysis (EDA) and consider optimizing hyperparameters. Additionally, exploring the application of another model from the sklearn library is a viable avenue.

For the sake of thorough practice, I intend to conduct a separate training session in a new notebook employing an alternative model. In this subsequent endeavor, I will delve into hyperparameter optimization, aiming to mitigate overfitting and enhance the model's generalization performance. This approach aligns with best practices for model development and will contribute to a more robust and reliable predictive model.