In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

In [2]:
# Location of the liters-per-100km dataset (CSV)
DATA_URL = 'https://static.bc-edx.com/ai/ail-v-1-0/m12/lesson_1/datasets/liters-per-100km.csv'

# Load the CSV into a Pandas DataFrame and preview its first rows
Lp100km = pd.read_csv(DATA_URL)
Lp100km.head()

Unnamed: 0,L/100km,cylinders,displacement,horsepower,weight (kg),acceleration
0,13.07,8,307.0,130,1589.12,12.0
1,15.68,8,350.0,165,1674.83,11.5
2,13.07,8,318.0,150,1558.28,11.0
3,14.7,8,304.0,150,1556.92,12.0
4,13.84,8,302.0,140,1564.17,10.5


In [3]:
# Build the feature matrices and the target as two-dimensional arrays,
# which is the layout scikit-learn expects:
#   X1 -> one feature:  "weight (kg)"
#   X2 -> two features: "weight (kg)" and "cylinders"
#   y  -> target:       "L/100km"
# Selecting with a list of column names keeps each result two-dimensional,
# so no explicit reshape() call is required.

X1 = Lp100km[["weight (kg)"]].to_numpy()
X2 = Lp100km[["weight (kg)", "cylinders"]].to_numpy()
y = Lp100km[["L/100km"]].to_numpy()

In [4]:
# Use the Sklearn `train_test_split()` function to split the data into training and testing data.
# X1, X2 and y go through one call so that every array receives the same row
# partition, letting both models be compared on identical observations.
# random_state=42 pins the shuffle so the split is reproducible.
X1_train, X1_test, X2_train, X2_test, y_train, y_test = train_test_split(
    X1, X2, y, random_state=42
)

In [5]:
# Instantiate two independent, untrained linear regressions:
# lr1 is fitted on the single-feature data, lr2 on the two-feature data.
lr1, lr2 = LinearRegression(), LinearRegression()

In [6]:
# Train lr1 on the one-feature training matrix.
lr1.fit(X=X1_train, y=y_train)

# Train lr2 on the two-feature training matrix (same target values).
lr2.fit(X=X2_train, y=y_train)

In [7]:
# Calculate the mean squared error and the r-squared value
# for the testing data

# Use the fitted models to make predictions on the test features
predicted1 = lr1.predict(X1_test)
predicted2 = lr2.predict(X2_test)

# Score the predictions with mse and r2
mse1 = mean_squared_error(y_test, predicted1)
r21 = r2_score(y_test, predicted1)
mse2 = mean_squared_error(y_test, predicted2)
r22 = r2_score(y_test, predicted2)

# Report both models one after the other. Plain string literals are used
# wherever there is nothing to interpolate.
print("Single Feature:")
print(f"mean squared error (MSE): {mse1}")
print(f"R-squared (R2): {r21}")
print("---------------------")
print("Two Features:")
print(f"mean squared error (MSE): {mse2}")
print(f"R-squared (R2): {r22}")
print("---------------------")
# Single-feature R2 minus two-feature R2: a negative value means the
# two-feature model scored higher on the test data.
print(f"Difference: {r21-r22}")

Single Feature:
mean squared error (MSE): 2.766420712894053
R-squared (R2): 0.8201810491643611
---------------------
Two Features:
mean squared error (MSE): 2.7646092087575314
R-squared (R2): 0.82029879798389
---------------------
Difference: -0.00011774881952897331


In [8]:
# Provided code to create the adjusted r-squared function
def r2_adj(x, y, model):
    """Return the adjusted R-squared of ``model`` evaluated on ``x`` and ``y``.

    The plain score is penalised for the number of feature columns:

        adjusted = 1 - (1 - r2) * (n - 1) / (n - p - 1)

    where ``n`` is ``len(y)`` and ``p`` is ``x.shape[1]``.

    Parameters
    ----------
    x : two-dimensional array-like exposing ``.shape``
        Feature matrix; its second dimension is taken as the feature count.
    y : sized sequence or array
        Target values; its length is taken as the number of observations.
    model : object with a ``score(x, y)`` method
        Fitted estimator. ``score`` is expected to return R-squared, as
        scikit-learn regressors do.

    Returns
    -------
    float
        The adjusted R-squared value.

    Raises
    ------
    ValueError
        If ``len(y) - x.shape[1] - 1`` is not positive. The formula's
        denominator would then be zero or negative, so the statistic is
        undefined.
    """
    n_obs = len(y)
    n_cols = x.shape[1]
    dof = n_obs - n_cols - 1
    # Guard the denominator: without it a too-small sample either divides by
    # zero or silently yields a meaningless number.
    if dof <= 0:
        raise ValueError(
            "adjusted R-squared needs more observations than features + 1 "
            f"(got {n_obs} observations and {n_cols} features)"
        )
    r2 = model.score(x, y)
    return 1 - (1 - r2) * (n_obs - 1) / dof

In [9]:
# Adjusted r-squared on the test split for each fitted model
adj_score1 = r2_adj(x=X1_test, y=y_test, model=lr1)
adj_score2 = r2_adj(x=X2_test, y=y_test, model=lr2)

# Report both values and their gap (one-feature minus two-feature)
print(
    f"1 Feature Adjusted R2: {adj_score1}",
    f"2 Feature Adjusted R2: {adj_score2}",
    f"Difference: {adj_score1-adj_score2}",
    sep="\n",
)

1 Feature Adjusted R2: 0.8183461619109362
2 Feature Adjusted R2: 0.8165936185608775
Difference: 0.0017525433500586995


In [10]:
# Cross-validate a fresh linear regression on the single-feature training
# data, scoring every fold by r-squared (the fold count is left at its default).
cv_scores = cross_val_score(
    estimator=LinearRegression(), X=X1_train, y=y_train, scoring="r2"
)

# Per-fold scores, then their mean and standard deviation
print(
    f"All scores: {cv_scores}",
    f"Mean score: {cv_scores.mean()}",
    f"Standard Deviation: {cv_scores.std()}",
    sep="\n",
)

All scores: [0.79329972 0.75117114 0.78967385 0.77287265 0.70773285]
Mean score: 0.7629500430340798
Standard Deviation: 0.03137324863209967
