# 0. Import Libraries 

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold, GridSearchCV, RandomizedSearchCV,cross_val_score, cross_val_predict
import seaborn as sns
%matplotlib inline
plt.style.use('bmh')

from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler, StandardScaler, OneHotEncoder, OrdinalEncoder
from sklearn.linear_model import LinearRegression, Lasso, LogisticRegression
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix, mean_squared_error, precision_recall_curve, roc_curve, r2_score, ConfusionMatrixDisplay, auc, roc_auc_score, precision_score, recall_score
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.feature_selection import RFE
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn import metrics
from sklearn.base import BaseEstimator, TransformerMixin
from scipy import stats
from scipy.stats import expon, randint

import warnings
warnings.filterwarnings("ignore")


# Basics 

In [None]:
# address elements of a list:
# my_list[0], my_list[1]

# address a column in a numpy array:
# arr[:, 2]; to address several columns use arr[:, [1, 2]]

# 1. Import the data set 

### Import the dataset from CSV

In [23]:
# Read the marathon dataset from a CSV file located next to the notebook
df = pd.read_csv('./MarathonData.csv')

### Import from sklearn 

In [24]:
# Loader for the breast-cancer dataset bundled with scikit-learn
from sklearn.datasets import load_breast_cancer

# as_frame=True  -> pandas data frame
# as_frame=False -> no data frame
data = load_breast_cancer(as_frame=True, return_X_y=False)

# Feature matrix and target vector are attributes of the returned object
X, t = data.data, data.target


### Export to csv

In [25]:
# Write the data frame to 'data.csv'; index=False leaves the row index out of the file
df.to_csv('data.csv', index=False)

# 2. Data Preprocessing 

### Data Cleaning: 

In [26]:
# Remove the identifier-like columns, then discard every sample that has
# no 'Category' entry
df = (
    df
    .drop(columns=['Name', 'Marathon', 'id'])
    .dropna(subset=['Category'])
)

# Cast 'Wall21' to a numeric dtype
df['Wall21'] = pd.to_numeric(df['Wall21'])

# Missing cross-training entries become the explicit label 'None'
df['CrossTraining'] = df['CrossTraining'].fillna('None')

# Template for an engineered feature:
# df['NewFeature'] = df[] / df['']

### Pearson's Correlation Coefficient 

In [None]:
# If the data and the target are separate, join them first:
# df = pd.concat([data, target], axis=1)

# Pearson correlation coefficients between all numeric columns
corr_matrix = df.corr(method='pearson', numeric_only=True)

# Correlation of every feature with the target (the target itself removed)
target = 'MarathonTime'
target_corr = corr_matrix[target].drop(target)

# Rank by absolute value: a strongly negative correlation is just as
# predictive as a strongly positive one, so a signed idxmax() would miss it
best_feature = target_corr.abs().idxmax()
print(f"Feature with the largest predictive value for Target is: '{best_feature}'")

# Signed coefficient of that feature
highest_cor_feat = target_corr[best_feature]
print(f"The Pearson Coefficent Value is: {highest_cor_feat:.2f} ")

# Show the correlation values in descending order
corr_matrix[target].sort_values(ascending=False)

### Visualize the Correlation Matrix using a Heat Map

In [None]:
# Annotated heat map of the correlation matrix on a fixed [-1, 1] colour scale
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(corr_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1, ax=ax)
ax.set_title("Correlation Matrix Heatmap")
plt.show()

# 3. Splitting Data in Training and Testing 

In [None]:
# Overview of the target distribution: how often each unique value occurs in the pandas column
t.value_counts()

### Equal Distribution of the Target:   Simple 80/20 split:

In [31]:
# PIPELINE: if the data is already separated into features X and target t
X_train, X_test, t_train, t_test = train_test_split(X, t,
                                                    test_size=0.2,
                                                    random_state=42,
                                                    shuffle=True)


# PIPELINE: if the data is one data set and gets separated later.
# train_test_split returns the training part first and the test part second,
# so the names must be unpacked in that order (with test_size=0.2 the
# training set is the 80% part).
train_set, test_set = train_test_split(df,
                                       test_size=0.2,
                                       random_state=42,
                                       shuffle=True)

### Target is imbalanced and binary: Stratified 80/20 split based on target 

In [32]:
# PIPELINE: if the data is already separated into features X and target t.
# stratify=t makes the split stratified on the target.
X_train, X_test, t_train, t_test = train_test_split(X, t, 
                                                    test_size =0.2,
                                                    random_state=42,
                                                    shuffle=True, 
                                                    stratify=t)

In [33]:
# Just for testing: the same stratified 80/20 split of X and t,
# stored under separate *_c names
X_train_c, X_test_c, t_train_c, t_test_c = train_test_split(X, t, 
                                                    test_size =0.2,
                                                    random_state=42,
                                                    shuffle=True, 
                                                    stratify=t)

### Most predictive feature is imbalanced and continuous: categorize the feature into bins and perform a stratified split based on the bins

Stratified splitting is the process of partitioning the dataset into subsets while keeping the prior probabilities of the (categorical) classes the same as in the original dataset.

In [None]:
# Rough guide for choosing the bins: mean +/- 1.96 sample standard
# deviations, which covers about 95% of the samples
best_feature = 'Wall21'
feature_mean = df[best_feature].mean()
feature_std = df[best_feature].std(ddof=1)
a = feature_mean - 1.96*feature_std
b = feature_mean + 1.96*feature_std
print(f"About 95% of the samples lie in the interval [{a:.1f},{b:.1f}]")

In [None]:
# Summary statistics of the selected feature
df[best_feature].describe()

In [36]:
best_feature = 'Wall21'

# Discretise the continuous feature so that it can drive a stratified split
bin_edges = [0., 1.5, 3.0, 4.5, 6., np.inf]
bin_labels = [1, 2, 3, 4, 5]
cat = pd.cut(df[best_feature], bins=bin_edges, labels=bin_labels)

# Stratified 80/20 split of the full data frame, stratified on the bins
train_set, test_set, cat_train, cat_test = train_test_split(
    df, cat,
    test_size=0.2,
    shuffle=True,
    random_state=42,
    stratify=cat,
)

# Overlay the bin histograms of both subsets to compare their distributions
plt.hist(cat_train, alpha=0.3, label='Train Set', color='blue')
plt.hist(cat_test, alpha=0.3, label='Test Set', color='orange')
plt.legend();

# TRAINING: target vector and feature matrix
t_train = train_set['MarathonTime'].copy()
X_train = train_set.drop(labels='MarathonTime', axis=1)

# TESTING: target vector and feature matrix
t_test = test_set['MarathonTime'].copy()
X_test = test_set.drop(labels='MarathonTime', axis=1)

# 3. Preprocessing Pipelines 

### One pipeline for encoding numerical and categorical features:

In [37]:
# Columns handled by each branch of the transformer
num_attribs = ['km4week', 'sp4week','Wall21']
cat_attribs = ['Category', 'CrossTraining']

# Standardise the numeric columns and one-hot encode the categorical ones.
# handle_unknown='ignore' matches the separate-pipeline variant of this
# notebook: a category that was not seen during fit is encoded as all zeros
# instead of raising an error at transform time (e.g. on a CV fold or the
# test set).
preprocessing_pipeline = ColumnTransformer([('num', StandardScaler(), num_attribs),
                                           ('cat', OneHotEncoder(handle_unknown='ignore'), cat_attribs)])

### Separate pipelines for encoding numeric and categorical features

In [38]:
# Column groups handled by the two branches
num_attribs = ['km4week', 'sp4week','Wall21']
cat_attribs = ['Category', 'CrossTraining']

# Numeric branch: median imputation followed by standardisation
num_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: one-hot encoding that ignores unknown categories
cat_pipeline = Pipeline(steps=[('encoder', OneHotEncoder(handle_unknown='ignore'))])

# Route each column group through its branch
preprocessing_pipeline = ColumnTransformer(transformers=[
    ('num', num_pipeline, num_attribs),
    ('cat', cat_pipeline, cat_attribs),
])

### Run just the preprocessing pipeline

In [39]:
# RUN PIPELINES
# Fit the preprocessing on the training data only, then apply the already
# fitted transformation to the test data
X_train_prepared = preprocessing_pipeline.fit_transform(X_train)
X_test_prepared = preprocessing_pipeline.transform(X_test)

In [40]:
# JUST FOR VISUALIZATION:

# Stack the names of all numerical and categorical attributes together
attribute_labels = np.hstack((
    num_attribs,  # numerical attributes
    #*preprocessing_pipeline.named_transformers_['cat'].categories_  # use this when the encoder is passed to the ColumnTransformer directly
    *preprocessing_pipeline.named_transformers_['cat']['encoder'].categories_  # use this when the categorical branch is a separate Pipeline with an 'encoder' step
))

# Show the prepared training data as a data frame
# NOTE(review): assumes X_train_prepared is a dense array with exactly one column per label -- confirm
data_prepared = pd.DataFrame(X_train_prepared, 
                                   columns=attribute_labels,
                                   index=X_train.index)


# 4. Train model 

In [93]:
# Model templates: every assignment below rebinds `pipeline`, so only the
# last one is in effect when the whole cell runs -- keep the one you need.

# DECISION TREE CLASSIFIER
pipeline = Pipeline(steps=[('preprocessing', preprocessing_pipeline),
                           ('tree', DecisionTreeClassifier(random_state=0))])


# RANDOM FOREST REGRESSOR -- watch out: regressor vs. classifier!
pipeline = Pipeline(steps=[('preprocessing', preprocessing_pipeline),
                           ('tree', RandomForestRegressor())])

# LOGISTIC REGRESSION WITH LASSO REGULARIZER (L1 PENALTY)
pipeline = Pipeline(steps=[('preprocessing',preprocessing_pipeline),
                           ('log_reg', LogisticRegression(penalty='l1', solver='saga'))])


# LASSO MODEL WITH POLYNOMIAL FEATURES
pipeline = Pipeline(steps=[('preprocessing', preprocessing_pipeline),
                       ('poly', PolynomialFeatures()),
                       ('Lasso',Lasso(max_iter=1000))])

### Just use one pipeline for everything

In [6]:
# IF ALL INPUT FEATURES ARE NUMERIC:
# min-max scaling followed by a Lasso model (step name is lowercase 'lasso')
pipeline = Pipeline(steps=[
    ('scaler', MinMaxScaler()),
    ('lasso', Lasso(max_iter=1000))
])

In [None]:
# IF THE FEATURES ARE NUMERIC AND CATEGORICAL
num_attribs = ['km4week', 'sp4week', 'Wall21']
cat_attribs = ['Category', 'CrossTraining']

# Preprocessing, polynomial expansion and Lasso in one pipeline
# (the model step name is lowercase 'lasso').
# handle_unknown='ignore' matches the separate-pipeline variant of this
# notebook: a category missing from the fitting data is encoded as all zeros
# instead of raising an error at transform time.
pipeline = Pipeline(steps=[
    ('preprocessing', ColumnTransformer([
        ('num', StandardScaler(), num_attribs),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_attribs)
    ])),
    ('poly', PolynomialFeatures()),
    ('lasso', Lasso(max_iter=1000))
])

In [None]:
# Just for testing: L1-regularised logistic regression tuned by grid search
pipeline = Pipeline([('scaler', MinMaxScaler()),
                     ('log_reg', LogisticRegression(penalty='l1',max_iter=200, solver='saga'))])

# Candidate values for C (inverse regularisation strength, must be > 0).
# The original list contained "0,1" (a typo for 0.1), which added the
# invalid value 0 and a duplicate 1 to the grid.
param_grid = {
    'log_reg__C': [0.01, 0.1, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 20, 30, 40, 50, 100]
}


# NOTE(review): 'neg_mean_squared_error' is a regression metric applied to a
# classifier here; consider scoring='accuracy' or 'f1_weighted'.
grid_search = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    cv=KFold(n_splits=5, shuffle=True, random_state=0),
    scoring='neg_mean_squared_error',
    refit=True,
    n_jobs=-1
)

# Fit on the *_c test split and keep the refitted best estimator
grid_search.fit(X_train_c,t_train_c)
best_model_c = grid_search.best_estimator_
grid_search.best_params_


# Hyperparameter Tuning 

In [43]:
# pipeline.get_params()

## Define hyperparameters

In [94]:
## Use pipeline.get_params() first to list all tunable hyperparameters.

# SET OPTIMIZATION PARAMETERS (DICTIONARY)
# Every assignment below rebinds `param_grid`, so only the last one is in
# effect when the whole cell runs -- keep the one that matches the pipeline.
# Keys follow the '<step name>__<parameter>' pattern of the pipeline steps.

# LOGISTIC REGRESSION
param_grid = {
    'log_reg__C': np.linspace(0.01,10,100)
}

# LINEAR REGRESSION WITH LASSO REGULARIZER
param_grid = {'poly__degree': list(range(3,9)), # only for the polynomial Lasso model
              'Lasso__alpha': [1e-5, 1e-4, 1e-3, 0.01, 0.1, 1, 10]}

# DECISION TREE
param_grid = {
    'tree__criterion': ['gini', 'entropy'],
    'tree__max_depth': np.arange(1,50),
    'tree__min_samples_split': np.arange(2,10),
    'tree__min_samples_leaf': np.arange(1,10)
}

# RANDOM FOREST
# NOTE(review): 'gini'/'entropy' look like classifier criteria; presumably a
# RandomForestRegressor step needs regression criteria instead -- confirm
param_grid = {
    'tree__n_estimators': list(range(50,400)),
    'tree__criterion': ['gini', 'entropy'],
    'tree__max_depth': [5,6,7]
}

## Grid Search CV

In [None]:
# Exhaustive search over param_grid, scored with shuffled 5-fold CV
cv_scheme = KFold(n_splits=5, shuffle=True, random_state=0)

grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='neg_mean_squared_error',
    # alternatives: scoring='f1_weighted', scoring='accuracy', scoring='r2'
    cv=cv_scheme,
    n_jobs=-1,   # parallel processing
    refit=True,
)

# Run the search on the training data
grid_search.fit(X_train, t_train)

# Estimator refitted with the best parameter combination
best_model = grid_search.best_estimator_

# Display the winning parameter combination
grid_search.best_params_

## Randomized Search CV

In [None]:
# USE RANDOMIZED SEARCH OPTIMIZATION
# Samples parameter combinations from param_grid instead of trying all of them
random_search = RandomizedSearchCV(
    estimator=pipeline,
    param_distributions=param_grid,
    cv=KFold(n_splits=5, shuffle=True, random_state=0), # CV scheme
    scoring='neg_mean_squared_error',
    # scoring = 'accuracy'
    # scoring = 'r2'
    refit=True,
    n_jobs=-1  # Parallel processing
)

# FITTING THE MODEL
random_search.fit(X_train,t_train)

# USE THE BEST PARAMS FOR THE MODEL
# Must come from random_search: reading grid_search here would silently
# return the result of an earlier grid search (or fail if none was run)
best_model = random_search.best_estimator_

# PRINT THE BEST PARAMS
random_search.best_params_

# Evaluation Regression Models

## Model Prediction

In [96]:
# Model predictions (y_*) for the training and the test features;
# the true targets are kept in t_train / t_test
y_train = best_model.predict(X_train)
y_test = best_model.predict(X_test)

# Evaluation Classification Models

## 95% Confidence Interval

In [None]:
# General recipe for a confidence interval of cross-validated scores:
# weighted-F1 scores from shuffled 5-fold CV on the training data
fold_scheme = KFold(n_splits=5, shuffle=True, random_state=0)
scores_val = cross_val_score(best_model, X_train, t_train,
                             cv=fold_scheme,
                             scoring='f1_weighted')

# Student-t interval around the mean score; the scale is the standard error
confidence = 0.95
n_scores = len(scores_val)
standard_error = scores_val.std(ddof=1)/np.sqrt(n_scores)
stats.t.interval(confidence,
                 n_scores-1,
                 loc=scores_val.mean(),
                 scale=standard_error)

# R2 Score & 95CI

In [None]:
# R2 ON THE TRAIN AND TEST SET
r2_train = r2_score(t_train, y_train)
r2_test = r2_score(t_test, y_test)


# CROSS VALIDATION: one R2 score per fold
r2_scores = cross_val_score(best_model,
                            X_train,
                            t_train,
                            scoring='r2',
                            cv=10)


# STATISTICS OF THE FOLD SCORES
# ddof=1 gives the sample standard deviation, which is what the Student-t
# interval expects and what the other confidence-interval cells of this
# notebook use (the NumPy default ddof=0 makes the interval too narrow).
mean_r2 = r2_scores.mean()
std_r2 = r2_scores.std(ddof=1)
n = len(r2_scores)  # number of folds (should be > 5)

# 95% CONFIDENCE INTERVAL: t-distribution with n-1 degrees of freedom,
# centred on the mean, scaled by the standard error of the mean
confidence_interval = stats.t.interval(0.95, n-1, loc=mean_r2, scale=std_r2/np.sqrt(n))


# PRINT RESULT
print('model_name','\n---------------------------------')
print('R2 Train: ', r2_train)
print('R2 Test: ', r2_test)
print('Mean:', mean_r2)
print('Standard deviation:', std_r2)
print(f'95% confidence interval for R2: ({confidence_interval[0]:.3f}, {confidence_interval[1]:.3f})\n')

# PRINT THE INDIVIDUAL FOLD SCORES
print('\nScores:', r2_scores)

# RMSE & CI95

In [None]:
def _mean_t_interval(values, level):
    """Return the Student-t interval around the mean of `values`."""
    n_obs = len(values)
    return stats.t.interval(level,
                            n_obs-1,
                            loc=values.mean(),
                            scale=values.std(ddof=1)/np.sqrt(n_obs))


# Confidence level shared by both intervals
confidence = 0.95

# Interval of the mean squared error on the training set
squared_errors_train = (t_train - y_train)**2
a,b = _mean_t_interval(squared_errors_train, confidence)

# Interval of the mean squared error on the test set
squared_errors_test = (t_test - y_test)**2
c,d = _mean_t_interval(squared_errors_test, confidence)

# Report RMSE values; interval bounds are converted to RMSE scale by a
# square root, with the lower bound clipped at 0 first
print('model_name','\n---------------------------------')
print('RMSE Train: ', np.sqrt(mean_squared_error(t_train, y_train)))
print(confidence*100,'% CI Train = [',np.sqrt(np.max([0,a])),',',np.sqrt(b),']')

print('\nRMSE Test: ', np.sqrt(mean_squared_error(t_test, y_test)))
print(confidence*100,'% CI Test = [',np.sqrt(np.max([0,c])),',',np.sqrt(d),']')

In [1]:
# best_model.named_steps['Lasso'].coef_

In [None]:
# Weight vector: intercept followed by the coefficients of the 'Lasso' step
w = np.hstack((best_model.named_steps['Lasso'].intercept_,
              best_model.named_steps['Lasso'].coef_))
# Stem plot with one stem per weight
plt.figure(figsize=(15,5))
plt.stem(w)


# Optional axis labelling:
#plt.ylabel('Weight values', size=15)
#plt.xticks(np.arange(len(w)), ['$w_{'+str(i)+'}$' for i in range(len(w))],rotation=0);

## Visualize Prediction vs true Label

In [None]:
# Points for the red identity line: a perfect prediction lies exactly on it
xline = np.linspace(0,1300,1301)

plt.figure(figsize=(15,5))

# One panel per subset: true target on the x-axis, prediction on the y-axis
panels = (('Training Set', t_train, y_train),
          ('Test Set', t_test, y_test))
for position, (panel_title, true_values, predictions) in enumerate(panels, start=1):
    plt.subplot(1,2,position)
    plt.scatter(true_values, predictions)
    plt.plot(xline, xline, 'r')
    plt.title(panel_title)
    plt.xlabel('True Target')
    plt.ylabel('Predicted Target')

## Plot the ROC/AUC Curve

In [None]:
# For multiclass prediction: one ROC curve per class (class i vs. the rest).
# The number of classes is hard-coded to 7.
# NOTE(review): the second roc_curve argument is (y_test==i), i.e. a 0/1
# indicator of the predicted label rather than a continuous score -- confirm
# that this is intended.

AUCs_ROC = []
for i in range(7):
    fpr, tpr, thresholds = roc_curve(1*(t_test==i), 1*(y_test==i))
    AUCs_ROC += [auc(fpr, tpr)]  # area under the ROC curve of class i
    plt.plot(fpr, tpr, label='Class '+str(i))

plt.legend(fontsize=15); plt.xlabel('FPR', size=15); plt.ylabel('TPR', size=15);

In [None]:
# Plot the ROC curve and its AUC, using column 1 of predict_proba as the score
fpr, tpr, thresholds = metrics.roc_curve(t_test,best_model.predict_proba(X_test)[:,1])
roc_auc = metrics.auc(fpr, tpr)
# estimator_name is only the label passed to the display object
display = metrics.RocCurveDisplay(fpr=fpr, tpr=tpr, roc_auc=roc_auc,
                                 estimator_name='Logistics-basic')
display.plot()

plt.show()

## Precision Recall curve

In [None]:
# For multiclass predictions: one precision-recall curve per class.
# Column i of the decision_function output is used as the score of class i;
# the number of classes is hard-coded to 7.
y_scores = best_model.decision_function(X_test)

AUCs = []
for i in range(7):
    precisions, recalls, thresholds = precision_recall_curve(1*(t_test==i), y_scores[:,i])
    AUCs += [auc(recalls, precisions)]  # area under the PR curve of class i
    plt.plot(recalls, precisions, label='Class '+str(i))

plt.legend(fontsize=15); plt.xlabel('Recall', size=15); plt.ylabel('Precision', size=15);

In [None]:
# For single-class predictions using cross-validation:
# decision-function scores for the training samples from 10-fold CV.
# Note that this uses `pipeline`, not `best_model`.
z_train = cross_val_predict(pipeline, X_train, t_train, 
                             cv=10, method='decision_function')

# Precision/recall pairs over the thresholds derived from those scores
precision, recall, thresholds = precision_recall_curve(t_train, z_train)

plt.plot(recall, precision)
plt.xlabel('Recall', size=15)
plt.ylabel('Precision', size=15);

## Threshold for a Precision >= 90

In [None]:

# Find the operating point with the highest recall among all thresholds that
# reach a precision of at least 90%.
# precision and recall contain one more entry than thresholds: the final
# point (precision 1, recall 0) has no threshold. It is excluded with [:-1]
# so that every candidate index is a valid index into thresholds.
idx = np.where(precision[:-1] >= 0.90)[0]
if idx.size == 0:
    raise ValueError('No threshold reaches a precision of at least 0.90')
idx_optimal = np.argmax(recall[idx])
print('precision:',precision[idx[idx_optimal]])
print('recall:',recall[idx[idx_optimal]])
print('thresholds',thresholds[idx[idx_optimal]])

## Precision, Recall, F1, Accuracy and Confusion Matrix

In [None]:
# TRAINING SET
print('Training set performance:')

# Accuracy and classification report
accuracy = metrics.accuracy_score(t_train, y_train)
print("Accuracy: %.3f" % accuracy)
print(classification_report(t_train, y_train))

# Confusion matrix for the training set. It gets its own figure and is shown
# immediately; otherwise the test-set heat map below would be drawn onto the
# same axes. cm is transposed so that true labels run along the x-axis.
cm = metrics.confusion_matrix(t_train, y_train)
plt.figure()
sns.heatmap(cm.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('true label')
plt.ylabel('predicted label')
plt.title('Training set')
plt.show()

# TEST SET
print('Test set performance:')

# Accuracy and classification report
accuracy = metrics.accuracy_score(t_test, y_test)
print("Accuracy: %.3f" % accuracy)
print(classification_report(t_test, y_test))

# Confusion matrix for the test set, again on its own figure
cm = metrics.confusion_matrix(t_test, y_test)
plt.figure()
sns.heatmap(cm.T, square=True, annot=True, fmt='d', cbar=False)
plt.xlabel('true label')
plt.ylabel('predicted label')
plt.title('Test set')
plt.show()

## Visualize which coefficients are set to zero (just for Lasso)

In [None]:
# Simplest check: shape of the index array of non-zero coefficients per row
# of coef_ (indexing with [i, :] requires a 2-D coef_)
np.where(best_model.named_steps['Lasso'].coef_[0,:]!=0)[0].shape # first target feature
np.where(best_model.named_steps['Lasso'].coef_[1,:]!=0)[0].shape # second target feature

In [None]:
# Names of all preprocessed columns: numeric attributes first, then the
# categories found by the one-hot encoder
encoder_categories = preprocessing_pipeline.named_transformers_['cat']['encoder'].categories_
attribute_labels = np.hstack((num_attribs, *encoder_categories))

# Coefficients of the fitted Lasso step
lasso_coefs = best_model.named_steps['Lasso'].coef_

# A coefficient of exactly zero means Lasso dropped that feature
dropped_mask = lasso_coefs == 0

excluded_features = attribute_labels[dropped_mask]
print(f"Excluded features in Lasso:\n {excluded_features}")

included_features = attribute_labels[~dropped_mask]
print(f"\nIncluded features in Lasso:\n {included_features}")

In [None]:

# Stack the names of all numerical and categorical attributes together
attribute_labels = np.hstack((
    num_attribs,  # numerical attributes
    #*preprocessing_pipeline.named_transformers_['cat'].categories_  # use this when the encoder is passed to the ColumnTransformer directly
    *preprocessing_pipeline.named_transformers_['cat']['encoder'].categories_  # use this when the categorical branch is a separate Pipeline with an 'encoder' step
))

# Prepared training data as a labelled data frame
data_prepared = pd.DataFrame(X_train_prepared,
                             columns=attribute_labels,
                             index=X_train.index)

# Coefficients of the fitted Lasso step
lasso_coefs = best_model.named_steps['Lasso'].coef_

# Column names only need to be converted to an array once
prepared_columns = np.array(data_prepared.columns)

# Excluded features: coefficient is exactly zero
excluded_features = prepared_columns[lasso_coefs == 0]
print(f"Excluded features in Lasso:\n {excluded_features}")

# Included features: non-zero coefficient.
# (The original read `prepared.columns`, an undefined name -> NameError.)
included_features = prepared_columns[lasso_coefs != 0]
print(f"\nIncluded features in Lasso:\n {included_features}")

## Visualize the final weight coefficients (end of lecture 6)

In [None]:
# Access the fitted parameters of the 'Lasso' step of the best model
linreg_coefs = best_model.named_steps['Lasso'].coef_ # the coefficients
lin_reg_intersept = best_model.named_steps['Lasso'].intercept_ # the intercept; not used further in this cell

# Get the feature names after preprocessing
# NOTE(review): assumes there is exactly one coefficient per preprocessed feature -- confirm when a 'poly' step is part of the model
feature_names = preprocessing_pipeline.get_feature_names_out()

# Combine feature names with coefficients
coef_df = pd.DataFrame({
    'Feature': feature_names,
    'Coefficient': linreg_coefs
})

# Sort by the absolute value of the coefficients to see the most impactful features
coef_df['Abs_Coefficient'] = coef_df['Coefficient'].abs()
coef_df = coef_df.sort_values(by='Abs_Coefficient', ascending=False)
# Display only the name and the signed coefficient
coef_df[['Feature', 'Coefficient']]

In [5]:
# Concatenate all parameters into one column vector w (end of lecture 6).
# Note: this cell reads the lowercase step name 'lasso'.

w = np.vstack((best_model.named_steps['lasso'].intercept_, # the intercept is the w0 parameter (in y = ax + b, b is the intercept)
               best_model.named_steps['lasso'].coef_[:,np.newaxis])) # the coefficients w1 to wN as a column (a in y = ax + b)

# Stem plot with one stem per weight, tick labels w_0 ... w_N
plt.figure(figsize=(15,5))
plt.stem(w)
plt.ylabel('Weight values', size=15)
plt.xticks(np.arange(len(w)), ['$w_{'+str(i)+'}$' for i in range(len(w))],rotation=0);

NameError: name 'lasso_model' is not defined

Summary:
R² Score: Measures how well the model explains the variance of the target.
Precision: Measures the accuracy of positive predictions.
F1 Score: Balances precision and recall to provide a single metric.
The 95% Confidence Interval around each of these metrics provides a range where the true metric value is likely to lie, offering insight into the reliability of the metric estimate

1. R² Score (Coefficient of Determination)
Definition: The R² score measures the proportion of the variance in the dependent variable (target) that is predictable from the independent variables (features). Its maximum is 1, which indicates perfect prediction; 0 means the model does not explain any of the variance. It usually lies between 0 and 1 but can become negative when the model is worse than always predicting the mean.
Interpretation:
R² = 0: The model does not explain the variability of the data at all.
R² = 1: The model perfectly predicts the outcome.
Negative R²: The model performs worse than a horizontal line (i.e., the model is worse than predicting the mean of the data).
95% Confidence Interval (CI): The CI for the R² score gives a range of values within which the true R² score lies with 95% confidence. A narrow interval suggests the R² score is reliable, while a wide interval implies greater uncertainty about the model's performance.

2. Precision (Positive Predictive Value)
Definition: Precision is the ratio of true positives (correct positive predictions) to the total number of predicted positives (true positives + false positives). It measures the accuracy of the positive predictions.
Precision = TP/(TP + FP)

 
Interpretation:
High precision: Indicates that most of the predicted positives are indeed correct (few false positives).
Low precision: Many of the predicted positives are incorrect (high false positive rate).
95% Confidence Interval: The CI for precision estimates the range in which the true precision lies with 95% confidence. It reflects the model’s ability to avoid false positives, and a narrow interval suggests a reliable precision estimate

3. F1 Score
Definition: The F1 score is the harmonic mean of precision and recall, and it balances the two metrics. It’s particularly useful when you need a balance between precision and recall (especially in imbalanced datasets).
$F_1 = 2 \cdot \dfrac{\mathrm{Precision} \cdot \mathrm{Recall}}{\mathrm{Precision} + \mathrm{Recall}}$
Precision focuses on minimizing false positives, while recall focuses on minimizing false negatives, and F1 balances both.
Interpretation:
High F1 score: The model has both high precision and high recall, meaning it is good at both predicting positive cases and avoiding false negatives.
Low F1 score: The model may be either poor at precision or recall, meaning it either predicts too many false positives or misses a lot of true positives.
95% Confidence Interval: The CI for the F1 score provides a range in which the true F1 score is likely to fall, with 95% confidence. A narrow interval means the F1 score estimate is more reliable, while a wide interval indicates uncertainty.

Root Mean Squared Error (RMSE)
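Definition: RMSE is a standard way to measure the error of a model in predicting continuous outcomes. It represents the square root of the average of the squared differences between predicted and actual values. The formula is:
$\mathrm{RMSE} = \sqrt{\dfrac{1}{N}\sum_{i=1}^{N}\left(t_i - y_i\right)^2}$, where $t_i$ is the actual value, $y_i$ the predicted value and $N$ the number of samples.
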
Interpretation:
Low RMSE: Indicates that the predicted values are close to the actual values, meaning the model performs well.
High RMSE: Indicates that the model’s predictions are, on average, far from the actual values, meaning the model has poor performance.
RMSE has the same units as the dependent variable, making it easily interpretable as an average error in the same units as the predicted outcome.
95% Confidence Interval: The confidence interval for RMSE provides a range in which the true RMSE is likely to fall with 95% confidence. A narrow CI indicates the model's error estimate is reliable and consistent, while a wide CI suggests variability or uncertainty in the model’s performance. The CI can be calculated through bootstrapping or other statistical techniques.