# Multiple Linear Regression

In [2]:
from faker import Faker
import pandas as pd
import numpy as np
import plotly.express as px
import plotly.graph_objects as go
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error, r2_score
from statsmodels.api import OLS, add_constant
from statsmodels.stats.outliers_influence import variance_inflation_factor

In [3]:
# Generate Data
# Synthetic housing data: price is a known linear function of the features
# plus uniform integer noise, so the fitted coefficients can be sanity-checked.
N_SAMPLES = 100

fake = Faker()
Faker.seed(42)       # seed Faker too, otherwise owners/addresses change on every run
np.random.seed(42)   # seeds the NumPy draws below (order of the draws matters)

owners = [fake.name() for _ in range(N_SAMPLES)]
addresses = [fake.address() for _ in range(N_SAMPLES)]

# randint's upper bound is exclusive: bedrooms is 1-5, location_score is 1-9.
square_footage = np.random.randint(500, 4000, N_SAMPLES)
bedrooms = np.random.randint(1, 6, N_SAMPLES)
age = np.random.randint(1, 100, N_SAMPLES)
location_score = np.random.randint(1, 10, N_SAMPLES)

# True coefficients: +150 per sq ft, +1000 per bedroom, -500 per year of age,
# +2000 per location point, with noise drawn from [-10000, 10000).
noise = np.random.randint(-10000, 10000, N_SAMPLES)
price = (
    (square_footage * 150)
    + (bedrooms * 1000)
    - (age * 500)
    + (location_score * 2000)
    + noise
)

df = pd.DataFrame({
    'owner': owners,
    'address': addresses,
    'square_footage': square_footage,
    'bedrooms': bedrooms,
    'age': age,
    'location_score': location_score,
    'price': price
})

df

Unnamed: 0,owner,address,square_footage,bedrooms,age,location_score,price
0,Mrs. April Barnes,Unit 8992 Box 5254\nDPO AP 21572,3674,1,62,9,548608
1,Sharon Gonzalez,"41341 Haynes Cliffs\nSavageton, PA 65484",1360,5,58,3,179051
2,Heidi Russell,"3752 Jeremy Meadow Apt. 548\nSouth Debra, NM 6...",1794,5,52,9,272079
3,Donald Welch,"98745 Mary Stream\nSouth Ashley, AS 92668",1630,2,12,2,250888
4,Darrell Huffman,"533 Miller Harbor Suite 833\nWest Martin, WA 4...",1595,5,39,2,219626
...,...,...,...,...,...,...,...
95,Eric Stokes,"791 Duffy Road\nEast Stevenshire, WA 22638",2817,2,72,6,399358
96,Jason Howard,"60404 Theodore Square Apt. 706\nJuanchester, O...",1315,4,36,6,188517
97,Robert Martin,"756 Cooper Manors Suite 126\nEast Robertfurt, ...",3842,3,38,5,577509
98,Christopher Mendoza,"6393 Duncan Plains Suite 639\nLake Kimmouth, O...",955,1,84,1,99995


In [4]:
# Basic Statistics
# describe() is evaluated into a named frame and left as the cell's last
# expression so the notebook renders it as a table.
print('Basic Statistics:')

summary_stats = df.describe()
summary_stats

Basic Statistics:


Unnamed: 0,square_footage,bedrooms,age,location_score,price
count,100.0,100.0,100.0,100.0,100.0
mean,2368.7,2.97,48.78,4.66,343040.52
std,1018.723617,1.480206,31.132684,2.644682,153956.188484
min,521.0,1.0,1.0,1.0,64527.0
25%,1566.75,1.0,20.0,2.0,217044.0
50%,2468.0,3.0,51.0,5.0,358264.0
75%,3288.75,4.0,73.5,7.0,468598.5
max,3999.0,5.0,99.0,9.0,622421.0


In [5]:
# Correlation Matrix
# `corr` holds the numeric subset of df; the pairwise correlation matrix itself
# is the cell's displayed result.
numeric_columns = ['square_footage', 'bedrooms', 'age', 'location_score', 'price']
corr = df[numeric_columns]
corr.corr()

Unnamed: 0,square_footage,bedrooms,age,location_score,price
square_footage,1.0,-0.141454,-0.03331,-0.047679,0.994578
bedrooms,-0.141454,1.0,-0.008036,0.056715,-0.131349
age,-0.03331,-0.008036,1.0,0.174638,-0.124962
location_score,-0.047679,0.056715,0.174638,1.0,-0.031832
price,0.994578,-0.131349,-0.124962,-0.031832,1.0


In [6]:
# Define Independent and Dependent Variables
# X: the four numeric predictors; Y: the house price to be explained.
feature_columns = ['square_footage', 'bedrooms', 'age', 'location_score']
target_column = 'price'

X = df[feature_columns]
Y = df[target_column]

In [7]:
# Check for Multicollinearity
# Standardize the features, then compute one variance inflation factor per
# column of the standardized matrix.
scaler = StandardScaler()
x_scaled = scaler.fit_transform(X)

n_features = x_scaled.shape[1]
vif_values = [variance_inflation_factor(x_scaled, col) for col in range(n_features)]

vif_data = pd.DataFrame({'Feature': X.columns, 'VIF': vif_values})
print(f'Variance Inflation Factor:\n{vif_data}')

Variance Inflation Factor:
          Feature       VIF
0  square_footage  1.022878
1        bedrooms  1.023526
2             age  1.032624
3  location_score  1.036303


In [10]:
# Feature Selection using OLS
# Fit price on all four features plus an added constant column, then print the
# full regression summary.
x_constant = add_constant(X)
ols_model = OLS(endog=Y, exog=x_constant).fit()

ols_summary = ols_model.summary()
print(f'OLS Summary:\n{ols_summary}')

OLS Summary:
                            OLS Regression Results                            
Dep. Variable:                  price   R-squared:                       0.999
Model:                            OLS   Adj. R-squared:                  0.999
Method:                 Least Squares   F-statistic:                 1.820e+04
Date:                Fri, 07 Feb 2025   Prob (F-statistic):          4.50e-136
Time:                        08:48:02   Log-Likelihood:                -1003.7
No. Observations:                 100   AIC:                             2017.
Df Residuals:                      95   BIC:                             2030.
Df Model:                           4                                         
Covariance Type:            nonrobust                                         
                     coef    std err          t      P>|t|      [0.025      0.975]
----------------------------------------------------------------------------------
const              9.1031   235

In [11]:
# Selecting Significant Features
# Keep every term whose p-value is below 0.05, leaving out the 'const' entry
# because it is not a column of X.
significant_features = [
    term
    for term, p_value in ols_model.pvalues.items()
    if p_value < 0.05 and term != 'const'
]

print(f'Significant Features:\n{significant_features}')

Significant Features:
['square_footage', 'age', 'location_score']


In [8]:
# Feature Selection Using Stepwise Function
import statsmodels.api as sm

def stepwise_selection(X, y, p_value_threshold=0.05):
    """
    Select features by backward elimination on OLS p-values.

    Starting from every column of X, an OLS model with an added constant is
    refitted repeatedly. On each pass the feature with the largest p-value is
    dropped if that p-value exceeds ``p_value_threshold``. Features are only
    ever removed, never re-added, so this is backward elimination rather than
    a bidirectional search.

    Parameters:
    X (pd.DataFrame): Feature matrix
    y (pd.Series): Target variable
    p_value_threshold (float): Maximum p-value to keep a variable in the model

    Returns:
    list: Final list of selected features; empty when X has no columns or
          when every feature was eliminated
    """
    selected_features = list(X.columns)

    # Loop only while there is something left to test. Without this guard an
    # empty feature list would reach max()/idxmax() on an empty Series.
    while selected_features:
        # Fit model with the currently selected features
        X_with_const = sm.add_constant(X[selected_features])
        model = sm.OLS(y, X_with_const).fit()

        # Pick the feature p-values by name instead of assuming the constant
        # occupies the first position of the result.
        p_values = model.pvalues[selected_features]

        worst_p_value = p_values.max()

        # Stop when every remaining p-value is within the threshold
        # (a NaN maximum also compares False and ends the loop).
        if not worst_p_value > p_value_threshold:
            break

        worst_feature = p_values.idxmax()
        selected_features.remove(worst_feature)
        print(f"Removing {worst_feature} (p-value: {worst_p_value:.4f})")

    return selected_features

# Example Usage with an Existing Dataset
# Run the selection on the house-price features with the default threshold.
selected_features = stepwise_selection(X=X, y=Y)
print(f"Final selected features: {selected_features}")

Removing bedrooms (p-value: 0.0805)
Final selected features: ['square_footage', 'age', 'location_score']


In [12]:
# Split into training and testing data (OLS)
# 80/20 split on the features chosen from the OLS p-values; fixed random_state
# keeps the partition reproducible.
x_selected = X[significant_features]
ols_split = train_test_split(x_selected, Y, test_size=0.2, random_state=42)
x_train, x_test, y_train, y_test = ols_split

In [13]:
# Split into training and testing (stepwise)
# Same 80/20 split and random_state as the OLS split, applied to the features
# returned by stepwise_selection.
X_selected = X[selected_features]
stepwise_split = train_test_split(X_selected, Y, test_size=0.2, random_state=42)
X_train, X_test, Y_train, Y_test = stepwise_split

In [None]:
# Multiple Linear Regression (OLS)
# NOTE(review): the stepwise cell that follows rebinds `model`, so later cells
# see that fit rather than this one.
model = LinearRegression().fit(x_train, y_train)
model

In [None]:
# Multiple Linear Regression (stepwise)
# NOTE(review): this rebinds `model`, replacing the estimator fitted on the
# OLS-selected split in the previous cell.
model = LinearRegression().fit(X_train, Y_train)
model

In [12]:
# Display the Coefficients
# Take the labels from the fitted estimator itself, so each coefficient is
# paired with the column it was actually trained on. `model` is rebound by
# more than one cell, and x_selected.columns only matches while both feature
# selections happen to agree. Fall back to x_selected.columns when the
# estimator did not record feature names.
feature_names = getattr(model, 'feature_names_in_', x_selected.columns)

coefficients = pd.DataFrame({
    'Feature': list(feature_names),
    'Coefficient': model.coef_
})

print(f'Model Coefficients\n{coefficients}')

Model Coefficients
          Feature  Coefficient
0  square_footage   149.747266
1             age  -495.162216
2  location_score  1800.406370


In [None]:
# Predictions and Evaluation (OLS)
# Score the current `model` on the held-out rows of the OLS-selected split.
predictions = model.predict(x_test)

mse, r2 = (score(y_test, predictions) for score in (mean_squared_error, r2_score))

print(f'Mean Squared Error: {mse}')
print(f'R2_score: {r2}')

Mean Squared Error: 35365669.29119181
R2_score: 0.9986507037531676


In [None]:
# Predictions and Evaluation (stepwise)
# Score the current `model` on the held-out rows of the stepwise-selected split.
predictions = model.predict(X_test)

mse = mean_squared_error(y_true=Y_test, y_pred=predictions)
r2 = r2_score(y_true=Y_test, y_pred=predictions)

print(f"Mean Squared Error: {mse}")
print(f"R² Score: {r2}")

In [14]:
# Visualize Predictions
# Scatter of actual vs predicted prices on the test rows.
results = pd.DataFrame({'Actual': y_test, 'Predicted': predictions.flatten()})
fig = px.scatter(results, x = 'Actual', y = 'Predicted', title = 'Actual vs Predicted House Prices', template = 'plotly_dark')

# Reference diagonal y = x: points on it are predicted exactly. It is not a
# fitted regression line, so it is labelled as the perfect-prediction line.
# Two endpoints are enough; passing every unsorted point would retrace the
# same segment back and forth.
diagonal = [results['Actual'].min(), results['Actual'].max()]
fig.add_trace(go.Scatter(x = diagonal, y = diagonal, mode = 'lines', name = 'Perfect Prediction (y = x)'))

fig.update_layout(xaxis_title = 'Actual Prices', yaxis_title = 'Predicted Prices')
fig.show()