In [6]:
import pandas as pd

In [11]:
# Load the preprocessed cars dataset (the CSV is read relative to the working directory).
cars = pd.read_csv("cars_preprocessed_data.csv")

In [12]:
# Preview the first two rows to check the columns that were loaded.
cars.head(n=2)

Unnamed: 0,symboling,wheel_base,length,width,height,curb_weight,num_of_cylinders,engine_size,bore,stroke,compression_ratio,horsepower,peak_rpm,city_mpg,highway_mpg,price
0,3,88.6,168.8,64.1,48.8,2548,4,130,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,88.6,168.8,64.1,48.8,2548,4,130,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0


In [13]:
# Split the frame into predictors (X) and the regression target (y).
# The target is the last column ("price" in the preview above).
# "Unnamed: 0" appears to be the row counter that was saved into the CSV
# (values 0, 1, ... in the preview); it is dropped so the model cannot learn
# from row order. errors="ignore" keeps the cell working if the column is absent.
X = cars.iloc[:, :-1].drop(columns=["Unnamed: 0"], errors="ignore")
y = cars.iloc[:, -1]

In [14]:
# Number of rows available for cross-validation.
X.shape[0]

205

# 1. K-Fold Cross Validation

In [15]:
from sklearn.model_selection import KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error
import numpy as np

In [16]:
# Number of folds K: each round trains on K-1 folds and tests on the remaining fold.
k = 5

In [17]:
# Shuffled K-Fold splitter; the fixed random_state makes the fold assignment repeatable.
kf = KFold(n_splits=k, random_state=42, shuffle=True)

In [18]:
# Manual K-Fold evaluation: fit one linear regression per fold and score it
# on the rows that were held out for that fold.

r2_scores = []
mse_scores = []

for fold_num, (train_idx, test_idx) in enumerate(kf.split(X, y), start=1):

    # slice out this fold's training and held-out rows
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    # learn the scaling statistics from the training rows only, then apply them to both parts
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # train on the scaled training rows and predict the held-out rows
    lr = LinearRegression().fit(X_train_scaled, y_train)
    y_pred = lr.predict(X_test_scaled)

    # score the fold and keep both metrics for the summary cells below
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    r2_scores.append(r2)
    mse_scores.append(mse)

    # per-fold report
    print(f"Fold {fold_num} - R-squared: {r2:.4f}, MSE: {mse:.4f}")

Fold 1 - R-squared: 0.8091, MSE: 14973670.9176
Fold 2 - R-squared: 0.7179, MSE: 10570554.1392
Fold 3 - R-squared: 0.8035, MSE: 10140182.7215
Fold 4 - R-squared: 0.7944, MSE: 14056299.3449
Fold 5 - R-squared: 0.6330, MSE: 25957479.0173


In [19]:
# Summarise the per-fold R-squared values: mean and standard deviation (NumPy default, ddof=0)
r2_array = np.asarray(r2_scores)
print("Mean R-squared:", r2_array.mean())
print("Standard Deviation of R-squared:", r2_array.std())

Mean R-squared: 0.7515784880415255
Standard Deviation of R-squared: 0.06789398887821839


In [20]:
# Summarise the per-fold MSE values: mean and standard deviation (NumPy default, ddof=0)
mse_array = np.asarray(mse_scores)
print("Mean MSE:", mse_array.mean())
print("Standard Deviation of MSE:", mse_array.std())

Mean MSE: 15139637.228103694
Standard Deviation of MSE: 5728838.143957727


In [21]:
# Report the fold averages again, rounded to four decimals
avg_r2 = np.mean(r2_scores)
avg_mse = np.mean(mse_scores)
print(f"\nAverage R-squared across folds: {avg_r2:.4f}")
print(f"\nAverage MSE across folds: {avg_mse:.4f}")


Average R-squared across folds: 0.7516

Average MSE across folds: 15139637.2281


# 2. Stratified Cross Validation

In [22]:
from sklearn.model_selection import StratifiedKFold

In [23]:
# Stratified 5-fold splitter, shuffled with a fixed seed so the folds are repeatable
skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)

In [24]:
# Evaluate a linear regression on every stratified fold.
# NOTE(review): y is the continuous price column; StratifiedKFold stratifies on
# class labels, so nearly every distinct price presumably acts as its own class
# here. Confirm this is intended, or bin y before stratifying.

scores = []

for fold_num, (train_idx, test_idx) in enumerate(skf.split(X, y), start=1):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    # fit the scaler on the training rows, then transform both partitions with it
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # train the linear model on the scaled training rows
    lr = LinearRegression()
    lr.fit(X_train_scaled, y_train)

    # predict the held-out rows
    y_pred = lr.predict(X_test_scaled)

    # store this fold's metrics as an (r2, mse) pair
    r2, mse = r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred)
    scores.append((r2, mse))

    # per-fold report
    print(f"Fold {fold_num} - R-squared: {r2:.4f}, MSE: {mse:.4f}")

Fold 1 - R-squared: 0.7915, MSE: 15125283.4109
Fold 2 - R-squared: 0.6182, MSE: 30707415.5302
Fold 3 - R-squared: 0.8705, MSE: 8552031.3463
Fold 4 - R-squared: 0.4441, MSE: 20820857.6687
Fold 5 - R-squared: 0.8235, MSE: 8786795.0859




In [25]:
# Separate the (r2, mse) pairs into two lists, then take the mean and standard deviation of each
r2_values = [fold_r2 for fold_r2, _ in scores]
mse_values = [fold_mse for _, fold_mse in scores]
mean_r2, std_r2 = np.mean(r2_values), np.std(r2_values)
mean_mse, std_mse = np.mean(mse_values), np.std(mse_values)

In [26]:
# Report the means first, then the standard deviations, for R-squared and MSE
for label, value in (
    ("\nMean R-squared:", mean_r2),
    ("\nMean MSE:", mean_mse),
    ("\nStandard Deviation of R-squared:", std_r2),
    ("\nStandard Deviation of MSE:", std_mse),
):
    print(label, value)


Mean R-squared: 0.7095585385344554

Mean MSE: 16798476.608379133

Standard Deviation of R-squared: 0.15773084761670123

Standard Deviation of MSE: 8302151.613638501


Note: stratified CV gave worse results than K-Fold here. A likely reason is that at least one class has only one, two, or very few values, which makes it difficult to create proper strata.

# Another way of performing K-Fold or Stratified Cross Validation (cross_val_score)

In [27]:
# Number of folds K: each round trains on K-1 folds and tests on the remaining fold.
k = 5

In [28]:
# Shuffled K-Fold splitter; the fixed random_state makes the fold assignment repeatable.
kf = KFold(n_splits=k, random_state=42, shuffle=True)

In [29]:
# cross_val_score variant: for every outer K-Fold split, run an inner K-Fold
# cross-validation on that split's training rows and collect the inner scores.
from sklearn.pipeline import make_pipeline

# Flat lists holding the inner-CV scores of all outer splits
mse_scores = []
r2_scores = []

# Only the training part of each outer split is scored; test_idx is not used here.
for train_idx, test_idx in kf.split(X, y):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]

    # Scaling lives inside the pipeline so that cross_val_score re-fits the
    # scaler on each inner training fold. Scaling X_train up front would let
    # the inner validation rows influence the scaler statistics.
    scaled_lin_reg = make_pipeline(StandardScaler(), LinearRegression())

    # Inner cross-validation; the MSE scorer returns negated values
    mse_scores_fold = cross_val_score(scaled_lin_reg, X_train, y_train, cv=kf, scoring='neg_mean_squared_error')
    r2_scores_fold = cross_val_score(scaled_lin_reg, X_train, y_train, cv=kf, scoring='r2')

    # Append this outer split's inner scores to the flat lists
    mse_scores.extend(mse_scores_fold)
    r2_scores.extend(r2_scores_fold)


In [30]:
# The collected scores are negative MSE values; flip the sign to get ordinary (positive) MSE
positive_mse_scores = np.negative(mse_scores)

In [31]:
# Show the raw (negative) scores, then the mean of the sign-flipped values;
# positive_mse_scores could be printed in the first line instead.
print("K-Fold Cross Validation Scores (Negative MSE):", mse_scores)
print("\n Mean of Mean Squared Error:", positive_mse_scores.mean())

K-Fold Cross Validation Scores (Negative MSE): [-23973444.872037984, -24976282.251442764, -9726855.517172629, -16906592.47772231, -17095930.702452388, -25132174.23098715, -10629511.493623907, -10139700.476482736, -9327816.687313203, -44917335.38403027, -10645559.290984062, -13615951.46593609, -26936509.123610046, -11789447.295334019, -26168776.945961624, -37240240.14517642, -8136305.541966556, -13014725.475869901, -8703940.462190865, -25905823.96908365, -8765997.645473344, -6838758.32190828, -4755992.573596993, -18113564.082114115, -16055401.387503799]

 Mean of Mean Squared Error: 17180505.512799002


In [32]:
# Show every collected R-squared score followed by their mean
print("K-Fold Cross Validation Scores (R-squared):", r2_scores)
print("\n Mean R-squared:", np.asarray(r2_scores).mean())

K-Fold Cross Validation Scores (R-squared): [0.5469872078311093, 0.5172798265959171, 0.7258372990219003, 0.7038422173949692, 0.8030871453735979, 0.7095838135468853, 0.7921603818834444, 0.6905626138051502, 0.8305515891962381, 0.5075283304787475, 0.8220940635583325, 0.7537092203095692, 0.36125145338061826, 0.6966612937005685, 0.7681136459200336, 0.6351848290480038, 0.6725952577315334, 0.6730141747927769, 0.7859085966122298, 0.6807688452405801, 0.8860884273377867, 0.7681561119380558, 0.7390055797709432, 0.7591694577498573, 0.8083817486962445]

 Mean R-squared: 0.7055009252366038


Note: The cross_val_score function performs cross-validation internally. Because it is called inside the outer K-Fold loop with KFold(n_splits=5), it runs a full 5-fold cross-validation on the training portion of each outer split. That is why 5 × 5 = 25 scores are reported for each metric instead of 5.

# 3. Leave-P-Out Cross Validation(LPOCV)

In [33]:
from sklearn.model_selection import LeavePOut

In [35]:
# Choose how many rows are held out per split (p) and build the Leave-P-Out splitter
p_value = 10
lpocv = LeavePOut(p_value)

In [None]:
# Leave-P-Out evaluation: fit and score one linear regression per split.
# NOTE(review): with p=10 and the 205 rows reported above, the splitter should
# yield C(205, 10) splits, so this loop is not expected to finish in practice.
# Confirm before running.

scores = []

# Only X is passed to split(): the splitter yields row positions, and the same
# positions are used to slice y below.
for fold_num, (train_idx, test_idx) in enumerate(lpocv.split(X), start=1):
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

    # learn the scaling statistics from the training rows, then apply them to both parts
    scaler = StandardScaler().fit(X_train)
    X_train_scaled = scaler.transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    # train the linear model and predict the p held-out rows
    lr = LinearRegression().fit(X_train_scaled, y_train)
    y_pred = lr.predict(X_test_scaled)

    # store this split's metrics as an (r2, mse) pair
    r2, mse = r2_score(y_test, y_pred), mean_squared_error(y_test, y_pred)
    scores.append((r2, mse))

    # per-split report
    print(f"Fold {fold_num} - R-squared: {r2:.4f}, MSE: {mse:.4f}")

Fold 1 - R-squared: -0.2799, MSE: 15115491.7994
Fold 2 - R-squared: -1.0280, MSE: 15642519.1108
Fold 3 - R-squared: -1.0244, MSE: 15543462.5788
Fold 4 - R-squared: -0.8279, MSE: 16530224.6735
Fold 5 - R-squared: -0.8135, MSE: 16575316.4735
Fold 6 - R-squared: -0.9271, MSE: 24529402.4371
Fold 7 - R-squared: 0.4092, MSE: 14496956.1782
Fold 8 - R-squared: 0.4314, MSE: 34439344.4813
Fold 9 - R-squared: 0.4378, MSE: 24168131.4399
Fold 10 - R-squared: 0.2791, MSE: 14753662.8117
Fold 11 - R-squared: 0.2291, MSE: 13976422.4844
Fold 12 - R-squared: 0.2056, MSE: 13975533.0118
Fold 13 - R-squared: 0.2860, MSE: 13979907.9249
Fold 14 - R-squared: 0.2216, MSE: 13988192.9896
Fold 15 - R-squared: 0.0757, MSE: 14006602.2724
Fold 16 - R-squared: 0.2326, MSE: 14010847.8031
Fold 17 - R-squared: 0.1955, MSE: 13977853.4744
Fold 18 - R-squared: 0.1077, MSE: 14041439.8583
Fold 19 - R-squared: 0.0161, MSE: 13972065.4798
Fold 20 - R-squared: -0.1395, MSE: 15560389.6421
Fold 21 - R-squared: -0.5960, MSE: 1467572

In [None]:
# Pull the R-squared and MSE columns out of the (r2, mse) pairs and summarise each
r2_values = [pair[0] for pair in scores]
mse_values = [pair[1] for pair in scores]
mean_r2 = np.mean(r2_values)
std_r2 = np.std(r2_values)
mean_mse = np.mean(mse_values)
std_mse = np.std(mse_values)


In [None]:
# Report the means first, then the standard deviations, for R-squared and MSE
for label, value in (
    ("\nMean R-squared:", mean_r2),
    ("\nMean MSE:", mean_mse),
    ("\nStandard Deviation of R-squared:", std_r2),
    ("\nStandard Deviation of MSE:", std_mse),
):
    print(label, value)

Note: LOOCV and LPOCV are computationally very expensive; with p = 10 and 205 rows, Leave-P-Out has to evaluate C(205, 10) train/test splits. My laptop could not complete this task in a reasonable time, but this is how both CV techniques are performed.