In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

### Load data

In [3]:
# Download the CSV behind the short link straight into a DataFrame (needs network access).
# NOTE(review): the link is plain http through a URL shortener -- consider pinning the
# resolved https URL or a local copy so a fresh run is reproducible.
sat_df = pd.read_csv('http://bit.ly/PMR-ch7')

In [None]:
# Peek at the first rows to confirm the file parsed as expected.
sat_df.head()

In [None]:
# Summary statistics (count, mean, std, quartiles, ...) for the columns of the raw frame.
sat_df.describe()

In [None]:
# Column dtypes as loaded, before any conversion is applied.
sat_df.dtypes

In [7]:
# Recast the weekend flag as an (unordered) pandas categorical; the categories
# are inferred from the values present in the column.
sat_df['is_weekend'] = sat_df['is_weekend'].astype('category')

In [None]:
# Re-check dtypes: is_weekend should now be reported as `category`.
sat_df.dtypes

In [9]:
# Replace the weekend flag with its integer category codes so it can be used
# directly as a numeric column (correlations, VIF design matrix, ...).
# Casting to 'category' first is a no-op when the column is already categorical,
# and it keeps this cell re-runnable: without it a second execution fails with
# AttributeError because the `.cat` accessor does not exist on an integer column.
sat_df['is_weekend'] = sat_df['is_weekend'].astype('category').cat.codes

In [None]:
# Re-check dtypes: is_weekend should now be an integer code column.
sat_df.dtypes

In [None]:
# Inspect the recoded weekend column itself.
sat_df['is_weekend']

In [None]:
# Summary statistics again (now including the recoded is_weekend), rounded to
# two decimals for readability.
sat_df.describe().round(2)

## Fitting linear models with ols()

In [None]:
# Histogram of the raw distance values, to judge whether a transform is needed.
sat_df['distance'].hist()

In [14]:
# Add a natural-log transformed copy of distance as a new column.
sat_df['log_dist'] = sat_df['distance'].apply(np.log)

In [None]:
# Histogram of the log-transformed distance, labelled so the figure stands alone.
sat_df['log_dist'].hist()
plt.xlabel(xlabel='log distance')
plt.ylabel(ylabel='Count')

### Linear model with a single predictor

In [None]:
import statsmodels.formula.api as smf

# Fit an ordinary-least-squares model of overall on rides and display its
# summary table without keeping the fitted result.
smf.ols(formula='overall ~ rides', data=sat_df).fit().summary()

### ols objects

In [17]:
# Same single-predictor model, this time kept as `m1` so its attributes
# (params, resid, fittedvalues, ...) can be inspected in the cells below.
m1 = smf.ols(
    formula='overall ~ rides',
    data=sat_df,
).fit()

In [None]:
# Estimated coefficients of m1 (intercept and the slope for rides).
m1.params

In [None]:
# Full regression summary table for m1.
m1.summary()

In [None]:
# Residuals of m1, one per observation.
m1.resid

In [None]:
# Fitted values of m1, one per observation.
m1.fittedvalues

In [None]:
# Distribution of the m1 residuals.
plt.hist(m1.resid)
plt.xlabel('m1 residual value')
plt.ylabel('Count')

In [None]:
# Residuals-vs-fitted diagnostic for m1: fitted values on x, residuals on y.
# The axis labels previously repeated the histogram's labels ("residual value"
# on x, "Count" on y), which mislabelled both axes of this scatter plot.
plt.scatter(m1.fittedvalues, m1.resid)
plt.xlabel('m1 fitted value')
plt.ylabel('m1 residual value')

## Fitting Linear Models with Multiple Predictors

In [None]:
# Pairwise correlations between the columns of sat_df; useful for spotting
# strongly related predictors before fitting multi-predictor models.
sat_df.corr()

In [25]:
 from statsmodels.stats.outliers_influence import variance_inflation_factor


In [26]:
# Candidate predictors whose collinearity is checked below.
X = sat_df[[
    'is_weekend',
    'num_child',
    'log_dist',
    'wait',
    'clean',
    'rides',
    'games',
]]


In [27]:
# Start the VIF table with one row per predictor column of X.
vif_data = pd.DataFrame({"feature": X.columns})

In [None]:
# variance_inflation_factor() regresses column i on the other columns exactly
# as given -- it does not add an intercept. Without a constant column the
# auxiliary regressions use an uncentered R^2 and the VIFs come out inflated,
# so append a constant here. It is added as the LAST column, which keeps
# indices 0..len(X.columns)-1 aligned with the rows of vif_data["feature"].
_vif_exog = X.assign(const=1.0).values
vif_data["VIF"] = [
    variance_inflation_factor(_vif_exog, i) for i in range(len(X.columns))
]

vif_data

In [None]:
# Scatter of overall satisfaction against satisfaction with rides.
sat_df.plot.scatter(x='rides', y='overall')
plt.xlabel(xlabel='Satisfaction with rides')
plt.ylabel(ylabel='Satisfaction overall')

In [None]:
# m2: overall satisfaction explained by the weekend indicator alone.
m2 = smf.ols(
    formula='overall ~ is_weekend',
    data=sat_df,
).fit()
m2.summary()

In [None]:
# m3: adds the number of children to the weekend indicator.
m3 = smf.ols(
    formula='overall ~ num_child + is_weekend',
    data=sat_df,
).fit()
m3.summary()

In [None]:
# m4: weekend indicator, number of children and log distance.
m4 = smf.ols(
    formula='overall ~ is_weekend + num_child + log_dist',
    data=sat_df,
).fit()
m4.summary()

In [None]:
# m5: extends m4's predictors with the wait rating.
m5 = smf.ols(
    formula='overall ~ is_weekend + num_child + log_dist + wait',
    data=sat_df,
).fit()
m5.summary()

In [None]:
# m6: weekend indicator, number of children, log distance and wait rating.
# NOTE(review): this formula is character-for-character the same as the one used
# for m5 above, so m6 is an identical fit -- confirm whether another predictor
# was meant to be added here.
m6 = smf.ols(
    formula='overall ~ is_weekend + num_child + log_dist + wait',
    data=sat_df,
).fit()
m6.summary()

In [41]:
# Reduced predictor set (the four terms used in the model above) for a second
# collinearity check. This rebinds X from the earlier seven-column selection.
X = sat_df[[
    'is_weekend',
    'num_child',
    'log_dist',
    'wait',
]]


In [42]:
# Fresh VIF table with one row per column of the current X.
vif_data = pd.DataFrame({"feature": X.columns})

In [None]:
# variance_inflation_factor() regresses column i on the other columns exactly
# as given -- it does not add an intercept. Without a constant column the
# auxiliary regressions use an uncentered R^2 and the VIFs come out inflated,
# so append a constant here. It is added as the LAST column, which keeps
# indices 0..len(X.columns)-1 aligned with the rows of vif_data["feature"].
_vif_exog = X.assign(const=1.0).values
vif_data["VIF"] = [
    variance_inflation_factor(_vif_exog, i) for i in range(len(X.columns))
]

vif_data

## Interaction Terms

In [None]:
# Additive model plus a wait x num_child interaction term.
# NOTE(review): this rebinds `m5`, replacing the additive model fitted under the
# same name earlier in the notebook -- consider a distinct name if both fits
# need to stay available.
m5 = smf.ols(
    formula='overall ~ is_weekend + num_child + log_dist + wait + wait:num_child',
    data=sat_df,
).fit()
m5.summary()

## Prediction

In [51]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [52]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
train_df, test_df = train_test_split(
    sat_df,
    test_size=0.3,
    random_state=42,
)

In [53]:
# Fit the four-predictor additive model on the training rows only.
model = smf.ols(
    formula='overall ~  is_weekend + num_child + log_dist + wait',
    data=train_df,
).fit()

In [None]:
# Regression summary for the model fitted on the training split.
model.summary()

In [55]:
# Attach the model's predictions for the held-out rows.
# test_df comes out of train_test_split as a selection of sat_df, so assigning
# a column into it in place raises pandas' SettingWithCopyWarning. `.assign`
# builds a new frame instead, which avoids the warning and keeps this cell
# safe to re-run.
test_df = test_df.assign(Predicted_overall=model.predict(test_df))

In [None]:
# Out-of-sample R^2: observed vs. predicted overall satisfaction on the test split.
r2 = r2_score(
    y_true=test_df['overall'],
    y_pred=test_df['Predicted_overall'],
)
print(f"R-squared (R²): {r2:.2f}")

In [None]:
# Out-of-sample mean squared error on the test split.
mse = mean_squared_error(
    y_true=test_df['overall'],
    y_pred=test_df['Predicted_overall'],
)
print(f"Mean Squared Error (MSE): {mse:.2f}")