In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [2]:
# Import the data
df = pd.read_csv("https://static.bc-edx.com/ai/ail-v-1-0/m12/lesson_2/datasets/rent-data-cleaned.csv").dropna()
df.head()

Unnamed: 0,category,bathrooms,bedrooms,fee,has_photo,pets_allowed,price,price_type,square_feet,cityname,...,Gated,TV,Hot Tub,Tennis,Wood Floors,View,Alarm,Doorman,Luxury,Golf
2,0,1.0,0.0,0,1,3,1390,0,107,52,...,0,0,0,0,0,0,0,0,0,0
3,0,1.0,0.0,0,1,3,925,0,116,1285,...,0,0,0,0,0,0,0,0,0,0
5,0,1.0,0.0,0,1,4,2475,0,130,821,...,0,0,0,0,0,0,0,0,0,0
8,0,1.0,0.0,0,1,3,1495,0,138,1247,...,0,0,0,0,0,0,0,0,0,0
14,0,1.0,0.0,0,1,3,1695,0,190,1247,...,0,0,0,0,0,0,0,0,0,0


In [3]:
# Get the features (everything except the "price" column)
X = df.copy().drop(columns="price")
X.head()

Unnamed: 0,category,bathrooms,bedrooms,fee,has_photo,pets_allowed,price_type,square_feet,cityname,state,...,Gated,TV,Hot Tub,Tennis,Wood Floors,View,Alarm,Doorman,Luxury,Golf
2,0,1.0,0.0,0,1,3,0,107,52,45,...,0,0,0,0,0,0,0,0,0,0
3,0,1.0,0.0,0,1,3,0,116,1285,47,...,0,0,0,0,0,0,0,0,0,0
5,0,1.0,0.0,0,1,4,0,130,821,34,...,0,0,0,0,0,0,0,0,0,0
8,0,1.0,0.0,0,1,3,0,138,1247,4,...,0,0,0,0,0,0,0,0,0,0
14,0,1.0,0.0,0,1,3,0,190,1247,4,...,0,0,0,0,0,0,0,0,0,0


In [4]:
# Get the target column
y = df["price"].values.reshape(-1,1)
y[0:5]

array([[1390],
       [ 925],
       [2475],
       [1495],
       [1695]])

In [5]:
# Use the Sklearn `train_test_split()` function to split the data into training and testing data
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

In [6]:
import statsmodels.api as sm

# Use the statsmodels package to create and fit a linear regression
lr = sm.OLS(y_train, X_train).fit()

In [7]:
# Create a variable to hold the p-values of all columns sorted in ascending order
p_values = lr.pvalues.sort_values()
p_values

square_feet           3.261951e-121
longitude              1.585841e-74
state                  7.250670e-64
bathrooms              7.174654e-16
bedrooms               2.742034e-14
Elevator               3.509969e-14
Garbage Disposal       2.991747e-08
time                   2.122291e-06
TV                     2.504534e-06
Playground             1.475447e-05
Wood Floors            9.486830e-05
AC                     8.106059e-04
cityname               1.239758e-03
pets_allowed           2.482536e-03
Fireplace              2.773143e-03
Gated                  3.970255e-02
Gym                    7.288411e-02
Clubhouse              8.699257e-02
Basketball             1.358878e-01
category               1.407839e-01
Internet Access        1.485159e-01
View                   1.590230e-01
Storage                1.887753e-01
latitude               1.947155e-01
Doorman                2.343941e-01
Refrigerator           2.404750e-01
Washer Dryer           3.246991e-01
Hot Tub                3.529

In [8]:
# Use loc to filter to columns with p-values below 0.05
select_cols = p_values.loc[p_values < 0.05]

# Show the index of the results
select_cols.index

Index(['square_feet', 'longitude', 'state', 'bathrooms', 'bedrooms',
       'Elevator', 'Garbage Disposal', 'time', 'TV', 'Playground',
       'Wood Floors', 'AC', 'cityname', 'pets_allowed', 'Fireplace', 'Gated'],
      dtype='object')

In [9]:
# Create an X variable with all features and another with
# only features that meet the 0.05 threshold.

# Hint: Use the index from the previous cell

X_full = X
X_sel = X[select_cols.index]

In [10]:
# Split the data into training and testing sets
X_full_train, X_full_test, X_sel_train, X_sel_test, y_train, y_test = train_test_split(X_full, X_sel, y)

In [11]:
# Train two models using the different X variables

# Create the models
lr1 = LinearRegression()
lr2 = LinearRegression()

# Fit the models
lr1.fit(X_full_train, y_train)
lr2.fit(X_sel_train, y_train)

In [12]:
# Provided code to create the adjusted r-squared function
def r2_adj(x, y, model):
    r2 = model.score(x,y)
    n_cols = x.shape[1]
    return 1 - (1 - r2) * (len(y) - 1) / (len(y) - n_cols - 1)

In [13]:
# Compare the adjusted r-squared of the two models
adj_score1 = r2_adj(X_full_test, y_test, lr1)
adj_score2 = r2_adj(X_sel_test, y_test, lr2)
print(f"1 Feature Adjusted R2: {adj_score1}")
print(f"2 Feature Adjusted R2: {adj_score2}")

1 Feature Adjusted R2: 0.35073394257907986
2 Feature Adjusted R2: 0.3580034150416014
