In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error

# Toy dataset in which every Y equals 2 * X; the (5, 10) pair is listed
# twice on purpose so the de-duplication step has something to remove.
data = {
    'X': [1, 2, 3, 4, 5, 5, 6, 7, 8, 9],
    'Y': [2, 4, 6, 8, 10, 10, 12, 14, 16, 18],
}

# Build the frame and drop repeated rows in a single chained expression.
# The surviving rows keep their original index labels (no reset is done).
df = pd.DataFrame(data).drop_duplicates()

# Feature matrix (double brackets keep it a one-column DataFrame) and the
# target column as a Series.
X, y = df[['X']], df['Y']

# Hold out 20% of the rows for testing; the fixed seed makes the split
# reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Baseline: plain linear regression on the raw feature. This is the simpler
# of the two models compared here (the high-bias / low-variance end).
linear_model = LinearRegression().fit(X_train, y_train)

# Score it on the held-out rows.
y_pred_linear = linear_model.predict(X_test)
linear_mse = mean_squared_error(y_test, y_pred_linear)

# More flexible model (the low-bias / high-variance end): linear regression
# on degree-3 polynomial features of X.
poly = PolynomialFeatures(degree=3)

# Learn the feature expansion on the training rows only, then apply the
# same expansion to the test rows.
X_train_poly = poly.fit_transform(X_train)
X_test_poly = poly.transform(X_test)

poly_model = LinearRegression().fit(X_train_poly, y_train)

# Score it on the held-out rows.
y_pred_poly = poly_model.predict(X_test_poly)
poly_mse = mean_squared_error(y_test, y_pred_poly)

# 5-fold cross-validation of the linear model over the full de-duplicated
# data (X, y), not just the training split.
cv_scores = cross_val_score(
    linear_model,
    X,
    y,
    cv=5,
    scoring='neg_mean_squared_error',
)

# The scorer reports negated MSE, so average the folds and flip the sign
# to get a positive error value.
cv_mse = -np.mean(cv_scores)

# Report the three error figures, one per line, in a fixed order:
# hold-out linear, hold-out polynomial, then cross-validated linear.
report_lines = (
    f"Linear Model MSE: {linear_mse}",
    f"Polynomial Model MSE: {poly_mse}",
    f"Cross-validation MSE: {cv_mse}",
)
for report_line in report_lines:
    print(report_line)

Linear Model MSE: 1.262177448353619e-29
Polynomial Model MSE: 1.6866240584077843e-26
Cross-validation MSE: 2.9187853493177438e-30
