In [4]:
import pandas as pd
import numpy as np
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

# Function to calculate adjusted R2
def adjusted_r2_score(r2, n, p):
    """
    Calculate the adjusted R-squared score.

    Applies the correction ``1 - (1 - r2) * (n - 1) / (n - p - 1)``, which
    penalises R-squared for the number of features used.

    :param r2: The R-squared score.
    :param n: The number of samples.
    :param p: The number of features.
    :return: The adjusted R-squared score.
    :raises ValueError: If ``n - p - 1`` is not positive, i.e. there are too
        few samples relative to features for the correction to be defined.
    """
    residual_dof = n - p - 1
    if residual_dof <= 0:
        # A zero denominator would fail with a bare ZeroDivisionError and a
        # negative one would flip the sign of the penalty, silently producing
        # a meaningless score; fail loudly with the offending values instead.
        raise ValueError(
            f"adjusted R2 is undefined for n={n}, p={p}: requires n > p + 1"
        )
    return 1 - (1 - r2) * (n - 1) / residual_dof

# Load the data
data = pd.read_csv('filtered_data.csv')

# Separate the target variable ('Price') from the predictor columns
X = data.drop('Price', axis=1)
y = data['Price']

# Define categorical and numerical features by dtype.
# A column that lands in neither list is not handed to any transformer
# further down, so the selection is kept broad: every numeric dtype (not only
# int64/float64) counts as numerical, and pandas 'category' columns count as
# categorical alongside object and bool. Bool is not matched by 'number', so
# the two lists do not overlap.
categorical_features = X.select_dtypes(include=['object', 'bool', 'category']).columns.tolist()
numerical_features = X.select_dtypes(include='number').columns.tolist()

# Numerical columns are passed through a StandardScaler
numerical_transformer = Pipeline([('scaler', StandardScaler())])

# Categorical columns are one-hot encoded; handle_unknown='ignore' is set so
# that categories absent from the fitted data do not raise at transform time
categorical_transformer = Pipeline([
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Route each group of columns to its own transformer
preprocessor = ColumnTransformer([
    ('num', numerical_transformer, numerical_features),
    ('cat', categorical_transformer, categorical_features),
])

# Full modeling pipeline: preprocessing followed by ordinary least squares
model = Pipeline([
    ('preprocessor', preprocessor),
    ('regressor', LinearRegression()),
])

# Initialize KFold for cross-validation (5 shuffled folds with a fixed seed)
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Lists to store metrics for each fold
r2_scores = []
mse_scores = []
mae_scores = []
adjusted_r2_scores = []

for train_index, test_index in kf.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Refit the whole pipeline (preprocessing + regressor) on this fold's
    # training rows only
    model.fit(X_train, y_train)

    # Predict on the held-out rows
    y_pred = model.predict(X_test)

    # Calculate metrics on the held-out rows
    r2 = r2_score(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    mae = mean_absolute_error(y_test, y_pred)

    # Number of features after preprocessing. The regressor was just fitted
    # on the preprocessed training matrix, so its coefficient array has one
    # entry per transformed column; reading it avoids transforming X_test a
    # second time purely to count columns.
    p = model.named_steps['regressor'].coef_.shape[-1]
    n = len(y_test)

    # NOTE(review): n is the size of the held-out fold, not of the training
    # set the model was fitted on -- confirm this is the intended sample size
    # for the adjustment.
    adjusted_r2 = adjusted_r2_score(r2, n, p)

    # Append to lists
    r2_scores.append(r2)
    mse_scores.append(mse)
    mae_scores.append(mae)
    adjusted_r2_scores.append(adjusted_r2)

# Display the mean of each metric across the folds
print("Cross-validation scores:")
print(f"Mean R2: {np.mean(r2_scores)}")
print(f"Mean Adjusted R2: {np.mean(adjusted_r2_scores)}")
print(f"Mean MSE: {np.mean(mse_scores)}")
print(f"Mean MAE: {np.mean(mae_scores)}")

Cross-validation scores:
Mean R2: 0.24279482857447193
Mean Adjusted R2: -0.1543212347794339
Mean MSE: 225555.82537425464
Mean MAE: 314.0605892631509
