In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [2]:
# Import the data
car_data = pd.read_csv("https://static.bc-edx.com/ai/ail-v-1-0/m12/lesson_1/datasets/car-data-encoded.csv")
car_data.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price
0,3,,0,1,0,2.0,0,2,0,88.6,...,130,5,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0
1,3,,0,1,0,2.0,0,2,0,88.6,...,130,5,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0
2,1,,0,1,0,2.0,2,2,0,94.5,...,152,5,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0
3,2,164.0,1,1,0,4.0,3,1,0,99.8,...,109,5,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0
4,2,164.0,1,1,0,4.0,3,0,0,99.4,...,136,5,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0


In [3]:
# Drop rows with null values
car_data = car_data.dropna()

In [4]:
# Get the features (everything except the "price" column)
X = car_data.copy().drop(columns="price")
X.head()

Unnamed: 0,symboling,normalized-losses,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,...,num-of-cylinders,engine-size,fuel-system,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg
3,2,164.0,1,1,0,4.0,3,1,0,99.8,...,4,109,5,3.19,3.4,10.0,102.0,5500.0,24,30
4,2,164.0,1,1,0,4.0,3,0,0,99.4,...,5,136,5,3.19,3.4,8.0,115.0,5500.0,18,22
6,1,158.0,1,1,0,4.0,3,1,0,105.8,...,5,136,5,3.19,3.4,8.5,110.0,5500.0,19,25
8,1,158.0,1,1,1,4.0,3,1,0,105.8,...,5,131,5,3.13,3.4,8.3,140.0,5500.0,17,20
10,2,192.0,2,1,0,2.0,3,2,0,101.2,...,4,108,5,3.5,2.8,8.8,101.0,5800.0,23,29


In [5]:
# Get the target column
y = car_data["price"].values.reshape(-1,1)
y[0:5]

array([[13950.],
       [17450.],
       [17710.],
       [23875.],
       [16430.]])

In [6]:
# Use the Sklearn `train_test_split()` function to split the data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [7]:
import statsmodels.api as sm

# Use the statsmodels package to create and fit a linear regression
lr = sm.OLS(y_train, X_train).fit()

In [8]:
# Show the p-values of all columns sorted in ascending order
lr.pvalues.sort_values()

make                 0.000003
drive-wheels         0.000262
stroke               0.000480
curb-weight          0.001978
aspiration           0.013004
normalized-losses    0.040073
engine-size          0.058900
horsepower           0.128488
symboling            0.154754
height               0.182381
engine-type          0.186890
num-of-doors         0.191195
width                0.265105
fuel-system          0.282373
compression-ratio    0.369149
city-mpg             0.371712
engine-location      0.417107
fuel-type            0.427430
num-of-cylinders     0.510472
bore                 0.638141
length               0.717638
peak-rpm             0.717904
body-style           0.748983
highway-mpg          0.955024
wheel-base           0.980805
dtype: float64

In [9]:
# Create an X variable with all features and another with
# only features that meet the 0.05 threshold.

X_full = X
X_sel = X[['make', 'drive-wheels', 'stroke', 
           'curb-weight', 'aspiration', 'normalized-losses']]

In [10]:
# Split the data into training and testing sets
X_full_train, X_full_test, X_sel_train, X_sel_test, y_train, y_test = train_test_split(X_full, X_sel, y)

In [11]:
# Train two models using the different X variables

# Create the models
lr1 = LinearRegression()
lr2 = LinearRegression()

# Fit the models
lr1.fit(X_full_train, y_train)
lr2.fit(X_sel_train, y_train)

In [12]:
# Provided code to create the adjusted r-squared function
def r2_adj(x, y, model):
    r2 = model.score(x,y)
    n_cols = x.shape[1]
    return 1 - (1 - r2) * (len(y) - 1) / (len(y) - n_cols - 1)

In [13]:
# Compare the adjusted r-squared of the two models
adj_score1 = r2_adj(X_full_test, y_test, lr1)
adj_score2 = r2_adj(X_sel_test, y_test, lr2)
print(f"1 Feature Adjusted R2: {adj_score1}")
print(f"2 Feature Adjusted R2: {adj_score2}")

1 Feature Adjusted R2: 0.5438850484895039
2 Feature Adjusted R2: 0.8316818603410603
