In [2]:
import pandas as pd

In [3]:
cars= pd.read_csv("cars_preprocessed_data.csv")

In [4]:
cars.head(2)

Unnamed: 0,symboling,wheel_base,length,width,height,curb_weight,num_of_cylinders,engine_size,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,88.6,168.8,64.1,48.8,2548,4,130,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,88.6,168.8,64.1,48.8,2548,4,130,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0


In [5]:
# divide data into IVs and target
X= cars.iloc[:,:-1]
y= cars.iloc[:,-1]

In [6]:
len(X)

205

# 1. K-Fold Cross Validation

In [7]:
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

In [7]:
# decide the K value in K-Folds i,e K-1 one will be train and remaing one fold will be test
k=5

In [8]:
# K-Fold object initialization
kf= KFold(n_splits=k, shuffle= True, random_state=42)

In [9]:
# loop through each fold now

r2_scores= []
mse_scores= []

for fold_num, (train_idx, test_idx) in enumerate(kf.split(X, y), 1):
    
    X_train, X_test= X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test= y.iloc[train_idx], y.iloc[test_idx]
    
    # scale the train data
    scaler= StandardScaler()
    X_train_scaled= scaler.fit_transform(X_train)
    X_test_scaled= scaler.transform(X_test)
    
    # fit the linear regression model now
    lr= LinearRegression()
    lr.fit(X_train_scaled, y_train)
    
    # make predictions
    y_pred= lr.predict(X_test_scaled)
    
    # get the metrics
    r2= r2_score(y_test, y_pred)
    mse= mean_squared_error(y_test, y_pred)
    r2_scores.append(r2)
    mse_scores.append(mse)
    
    # Print fold-wise accuracy
    print(f"Fold {fold_num} - R-squared: {r2:.4f}, MSE: {mse:.4f}")    

Fold 1 - R-squared: 0.8091, MSE: 14973670.9176
Fold 2 - R-squared: 0.7179, MSE: 10570554.1392
Fold 3 - R-squared: 0.8035, MSE: 10140182.7215
Fold 4 - R-squared: 0.7944, MSE: 14056299.3449
Fold 5 - R-squared: 0.6330, MSE: 25957479.0173


In [10]:
# Print the mean and standard deviation of R-squared scores
print("Mean R-squared:", np.mean(r2_scores))
print("Standard Deviation of R-squared:", np.std(r2_scores))

Mean R-squared: 0.7515784880415255
Standard Deviation of R-squared: 0.06789398887821839


In [11]:
# Print the mean and standard deviation of MSE scores
print("Mean MSE:", np.mean(mse_scores))
print("Standard Deviation of MSE:", np.std(mse_scores))

Mean MSE: 15139637.228103694
Standard Deviation of MSE: 5728838.143957727


In [12]:
# Print average R-squared and MSE values
print(f"\nAverage R-squared across folds: {np.mean(r2_scores):.4f}")
print(f"\nAverage MSE across folds: {np.mean(mse_scores):.4f}")


Average R-squared across folds: 0.7516

Average MSE across folds: 15139637.2281


# 2. Stratified Cross Validation

In [13]:
from sklearn.model_selection import StratifiedKFold

In [14]:
# initialize stratified kfold cross validation
skf= StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [15]:
# look through each fold

scores= []

for fold_num, (train_idx, test_idx) in enumerate(skf.split(X,y), 1):
    X_train, X_test= X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test= y.iloc[train_idx], y.iloc[test_idx]
    
    # initialize the scaler object
    scaler= StandardScaler()
    
    # scale the data
    X_train_scaled= scaler.fit_transform(X_train)
    X_test_scaled= scaler.transform(X_test)
    
    # initialize the Linear Reg model
    lr= LinearRegression()
    
    # fit the model
    lr.fit(X_train_scaled, y_train)
    
    # prediction
    y_pred= lr.predict(X_test_scaled)
    
    # get the metrics
    r2= r2_score(y_test, y_pred)
    mse= mean_squared_error(y_test, y_pred)
    scores.append((r2, mse))
    
    # Print fold-wise accuracy
    print(f"Fold {fold_num} - R-squared: {r2:.4f}, MSE: {mse:.4f}")  

Fold 1 - R-squared: 0.7915, MSE: 15125283.4109
Fold 2 - R-squared: 0.6182, MSE: 30707415.5302
Fold 3 - R-squared: 0.8705, MSE: 8552031.3463
Fold 4 - R-squared: 0.4441, MSE: 20820857.6687
Fold 5 - R-squared: 0.8235, MSE: 8786795.0859




In [16]:
# Print the mean and standard deviation of R-squared and MSE scores
mean_r2, std_r2 = np.mean([score[0] for score in scores]), np.std([score[0] for score in scores])
mean_mse, std_mse = np.mean([score[1] for score in scores]), np.std([score[1] for score in scores])

In [17]:
# print mean R-Sqaured, Std and means MSE and Std
print("\nMean R-squared:", mean_r2)
print("\nMean MSE:", mean_mse)
print("\nStandard Deviation of R-squared:", std_r2)
print("\nStandard Deviation of MSE:", std_mse)


Mean R-squared: 0.7095585385344554

Mean MSE: 16798476.608379133

Standard Deviation of R-squared: 0.15773084761670123

Standard Deviation of MSE: 8302151.613638501


Note: stratified gave bad result than kfold as there could be a chance that only 1,2 or very very few values in at least one class. So that giving issue creating stratas.

# 3. Leave-P-Out Cross Validation(LPOCV)

In [8]:
from sklearn.model_selection import LeavePOut

In [9]:
# decide p value and initialize the leave p-out cv
p_value=
lpocv= LeavePOut(p=p_value)

In [None]:
# Perform leave-p-out cross-validation

scores= []

for fold_num, (train_idx, test_idx) in enumerate(lpocv.split(X), 1):
    # here i am giving only X bcuz leave p out happens on p rows those include of target columns too
    X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
    y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]
    
    # Initialize the scaler object
    scaler = StandardScaler()

    # Scale the data
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # Initialize the Linear Regression model
    lr = LinearRegression()

    # Fit the model
    lr.fit(X_train_scaled, y_train)

    # Prediction
    y_pred = lr.predict(X_test_scaled)

    # Get the metrics
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    scores.append((r2, mse))
    
    # Print fold-wise accuracy
    print(f"Fold {fold_num} - R-squared: {r2:.4f}, MSE: {mse:.4f}") 

Fold 1 - R-squared: -71.4500, MSE: 4478569570.6082
Fold 2 - R-squared: -1.4975, MSE: 154124952.3886
Fold 3 - R-squared: -3.9512, MSE: 306093096.7261
Fold 4 - R-squared: -3.7233, MSE: 292840729.7733
Fold 5 - R-squared: -2.1088, MSE: 193021077.5544
Fold 6 - R-squared: 0.7180, MSE: 17510925.8356
Fold 7 - R-squared: -1.4446, MSE: 150935354.3666
Fold 8 - R-squared: -2.1513, MSE: 194916499.9997
Fold 9 - R-squared: -1.9649, MSE: 183911901.6667
Fold 10 - R-squared: -3.5712, MSE: 283958871.7509
Fold 11 - R-squared: 0.6905, MSE: 19228770.4140
Fold 12 - R-squared: -1.6425, MSE: 163168374.0259
Fold 13 - R-squared: -1.2644, MSE: 140223163.4261
Fold 14 - R-squared: -1.0548, MSE: 127426443.8849
Fold 15 - R-squared: -0.4270, MSE: 88517842.9347
Fold 16 - R-squared: -5.2849, MSE: 389888621.1767
Fold 17 - R-squared: -3.3089, MSE: 267689281.3619
Fold 18 - R-squared: 0.6625, MSE: 20969629.9876
Fold 19 - R-squared: -2.3756, MSE: 210306251.7675
Fold 20 - R-squared: 0.6771, MSE: 20123234.8481
Fold 21 - R-squa

In [None]:
# Print the mean and standard deviation of R-squared and MSE scores
mean_r2, std_r2 = np.mean([score[0] for score in scores]), np.std([score[0] for score in scores])
mean_mse, std_mse = np.mean([score[1] for score in scores]), np.std([score[1] for score in scores])


In [None]:
# print mean R-Sqaured, Std and means MSE and Std
print("\nMean R-squared:", mean_r2)
print("\nMean MSE:", mean_mse)
print("\nStandard Deviation of R-squared:", std_r2)
print("\nStandard Deviation of MSE:", std_mse)

Note: LOOCV and LPOCV is computationally very expensive. So my laptop is not able to perform this task and taking too much time.But this how we need to perform the both cv techniques