# Practice Exercise: Multiple Linear Regression

## Scenario:
You are analyzing data to predict car prices based on horsepower, engine size, and age.
## Data:
```python
data = pd.DataFrame({
    'Horsepower': [100, 150, 200, 250, 300, 350, 400],
    'Engine Size': [1.6, 2.0, 2.4, 3.0, 3.6, 4.0, 4.5],
    'Age': [10, 8, 6, 5, 4, 3, 2],
    'Price': [15000, 20000, 25000, 30000, 35000, 40000, 45000]
})
```

In [1]:
from faker import Faker
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.api import OLS, add_constant
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [2]:
# Generate Data
# Synthetic data set: price is a linear function of the three predictors plus
# uniform integer noise. The legacy global NumPy seed makes every draw
# repeatable, so the four sampling calls must stay in this exact order.
fake = Faker()  # NOTE(review): not used by any cell visible here
np.random.seed(42)

n_rows = 100
horsepower = np.random.randint(100, 400, size=n_rows)
engine_size = np.random.uniform(0, 5, size=n_rows)
age = np.random.randint(1, 100, size=n_rows)
noise = np.random.randint(-10000, 10000, size=n_rows)

# price = 1000 * horsepower + 100 * engine_size - 500 * age + noise
price = (horsepower * 1000) + (engine_size * 100) - (age * 500) + noise

columns = dict(
    horsepower=horsepower,
    engine_size=engine_size,
    age=age,
    price=price,
)
df = pd.DataFrame(columns)

df

Unnamed: 0,horsepower,engine_size,age,price
0,202,0.597971,48,168628.797123
1,370,3.566224,12,359798.622394
2,206,3.803925,69,163775.392524
3,171,2.806386,37,161897.638599
4,288,3.854836,32,266248.483590
...,...,...,...,...
95,198,3.299920,67,162082.992023
96,271,4.086111,18,271536.611100
97,313,2.776004,25,300212.600406
98,134,2.648253,95,88074.825289


In [3]:
# Basic Statistics
# Per-column count, mean, std, min, quartiles and max of the generated frame.
summary_stats = df.describe()
summary_stats

Unnamed: 0,horsepower,engine_size,age,price
count,100.0,100.0,100.0,100.0
mean,252.2,2.389216,50.27,227526.451577
std,87.852378,1.438124,28.855563,89492.33133
min,101.0,0.034761,3.0,70465.651248
25%,163.5,1.195404,26.5,142132.826899
50%,260.5,2.47761,48.0,245728.342056
75%,327.0,3.584938,72.75,300932.97396
max,395.0,4.928252,99.0,384489.447436


In [4]:
# Correlation Matrix
# Pairwise Pearson correlation (the pandas default) between every pair of columns.
corr_matrix = df.corr(method='pearson')
corr_matrix

Unnamed: 0,horsepower,engine_size,age,price
horsepower,1.0,-0.003844,-0.069672,0.986931
engine_size,-0.003844,1.0,0.066332,-0.016168
age,-0.069672,0.066332,1.0,-0.217019
price,0.986931,-0.016168,-0.217019,1.0


In [5]:
# Define Dependent and Independent Variables
# X holds the three candidate predictors, Y the target to be explained.
feature_columns = ['horsepower', 'engine_size', 'age']
target_column = 'price'

X = df[feature_columns]
Y = df[target_column]

In [6]:
# Check for Multicollinearity
# The predictors are standardised before the VIF is computed.
# NOTE(review): variance_inflation_factor presumably does not add an intercept
# itself; centring the columns via StandardScaler appears to stand in for the
# constant term - confirm against the statsmodels documentation.
scaler = StandardScaler()
x_scaled = scaler.fit_transform(X)

# One VIF per predictor column, in the same order as X.columns.
n_features = x_scaled.shape[1]
vif_values = []
for col_idx in range(n_features):
    vif_values.append(variance_inflation_factor(x_scaled, col_idx))

vif_data = pd.DataFrame({'Feature': X.columns, 'VIF': vif_values})
print(f'Variance Inflation Factor\n{vif_data}')

Variance Inflation Factor
       Feature       VIF
0   horsepower  1.004878
1  engine_size  1.004420
2          age  1.009305


In [7]:
# Feature Selection using OLS
# add_constant supplies the intercept column that shows up as 'const' in the
# summary; the model is fitted on every row of X / Y (no split has happened yet).
x_constant = add_constant(X)
ols_spec = OLS(endog=Y, exog=x_constant)
ols_model = ols_spec.fit()
print(f'OLS Summary\n{ols_model.summary()}')

OLS Summary
                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.996
Model:                            OLS   Adj. R-squared:                  0.996
Method:                 Least Squares   F-statistic:                     8228.
Date:                Wed, 05 Feb 2025   Prob (F-statistic):          1.34e-115
Time:                        08:50:21   Log-Likelihood:                -1003.9
No. Observations:                 100   AIC:                             2016.
Df Residuals:                      96   BIC:                             2026.
Df Model:                           3                                         
Covariance Type:            nonrobust                                         
                  coef    std err          t      P>|t|      [0.025      0.975]
-------------------------------------------------------------------------------
const         221.1777   2231.960     

In [8]:
# Selecting Significant Features
# Keep every term whose OLS p-value is below 0.05. The intercept ('const') is
# dropped because it is not a column of X.
# NOTE(review): these p-values come from the OLS fit on the full data set, i.e.
# before the train/test split, so the held-out rows also influence the selection.
significant_features = [
    term
    for term, p_value in ols_model.pvalues.items()
    if p_value < 0.05 and term != 'const'
]

significant_features

['horsepower', 'age']

In [9]:
# Split into Training & Testing Data
# Only the predictors kept by the significance filter are used; 20% of the rows
# are held out, with a fixed random_state so the split is repeatable.
x_selected = X.loc[:, significant_features]
split_parts = train_test_split(x_selected, Y, test_size=0.2, random_state=42)
x_train, x_test, y_train, y_test = split_parts

In [10]:
# Multiple Linear Regression
# Ordinary least-squares fit on the training rows only.
model = LinearRegression()
model.fit(X=x_train, y=y_train)

In [11]:
# Display the Coefficients
# Pair each selected feature with its fitted slope; model.coef_ is in the same
# column order as the frame the model was trained on.
feature_names = x_selected.columns
fitted_slopes = model.coef_

coefficients = pd.DataFrame({'Feature': feature_names, 'Coefficients': fitted_slopes})

coefficients

Unnamed: 0,Feature,Coefficients
0,horsepower,989.698693
1,age,-479.683053


In [12]:
# Evaluate on the Held-out Test Set
predictions = model.predict(x_test)

# MSE is in squared price units; R2 is the share of test-set variance explained.
mse = mean_squared_error(y_true=y_test, y_pred=predictions)
r2 = r2_score(y_true=y_test, y_pred=predictions)

print(f'Mean Squared Error: {mse}\nR2: {r2}')

Mean Squared Error: 44466486.46302932
R2: 0.9918233174964829


In [13]:
# Visualize Predictions
# One row per test sample: the observed price next to the model's prediction.
results = pd.DataFrame({'Actual': y_test, 'Predicted': predictions.flatten()})
fig = px.scatter(results, x = 'Actual', y = 'Predicted', title = 'Actual vs Predicted', template = 'plotly_dark')

# Reference diagonal where Predicted == Actual. This is NOT a fitted regression
# line, so it is labelled as the perfect-prediction line; points close to it are
# well predicted. Two endpoints are enough to draw a straight line, which also
# avoids tracing back and forth through the unsorted test samples.
actual_bounds = [results['Actual'].min(), results['Actual'].max()]
fig.add_trace(go.Scatter(x = actual_bounds, y = actual_bounds, mode = 'lines', name = 'Perfect Prediction (y = x)'))

# Style only the scatter markers; the selector leaves the diagonal line untouched.
fig.update_traces(marker=dict(size=10, color='cyan'), selector=dict(mode='markers'))
fig.update_layout(xaxis_title = 'Actual Prices', yaxis_title = 'Predicted Prices')
fig.show()

In [15]:
# Check residuals
# Residual = observed price minus predicted price for each test sample.
residuals = y_test - predictions

axis_labels = {'x': 'Predicted', 'y': 'Residuals'}
fig_residuals = px.scatter(
    x=predictions,
    y=residuals,
    labels=axis_labels,
    title='Residuals vs Predicted',
    template = 'plotly_dark',
)
# Dashed zero line: residuals should scatter evenly above and below it.
fig_residuals.add_hline(y=0, line_dash='dash')
fig_residuals.show()

Randomly scattered residuals (no visible structure around zero) = good model fit  
Patterns in the residuals = model issues (non-linearity, heteroscedasticity, or missing variables)  
Normally distributed residuals = the assumptions behind statistical inference (p-values, confidence intervals) are better satisfied  