In [1]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sns 
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split , cross_val_score 
from sklearn.linear_model import LinearRegression 
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [13]:
# Read the supply-side dataset from disk and preview its first five rows.
df_supply = pd.read_csv(filepath_or_buffer="supply.csv")
print(df_supply.head(n=5))

         DATE      TLRESCONS       PERMIT     COMPUTSA  EVACANTUSQ176N   
0  2003-01-01  421328.666667  1806.333333  1660.000000         14908.0  \
1  2003-04-01  429308.666667  1837.666667  1678.666667         15244.0   
2  2003-07-01  458890.000000  1937.333333  1656.333333         15614.0   
3  2003-10-01  491437.333333  1972.333333  1712.000000         15654.0   
4  2004-01-01  506856.333333  1994.666667  1740.333333         15895.0   

     MSACSR   CSUSHPISA  
0  4.200000  129.320667  
1  3.833333  131.755667  
2  3.633333  135.013000  
3  3.966667  138.834667  
4  3.700000  143.298667  


In [14]:
# Read the demand-side dataset from disk and preview its first five rows.
df_demand = pd.read_csv(filepath_or_buffer="Demand.csv")
print(df_demand.head(n=5))

         DATE         POPTHM        GDP  InterestRateRealEstate  MORTGAGE15US   
0  2003-01-01  289609.000000  11174.129                155242.0      5.204615  \
1  2003-04-01  290252.666667  11312.766                153368.0      4.867692   
2  2003-07-01  290974.000000  11566.669                149239.0      5.356923   
3  2003-10-01  291669.333333  11772.234                151454.0      5.248571   
4  2004-01-01  292236.666667  11923.447                160762.0      4.897500   

     UMCSENT     MSPUS   CSUSHPISA  
0  79.966667  186000.0  129.320667  
1  89.266667  191800.0  131.755667  
2  89.300000  191900.0  135.013000  
3  91.966667  198800.0  138.834667  
4  98.000000  212700.0  143.298667  


In [19]:
# Separate the label from the predictors for both datasets. "CSUSHPISA" alone
# becomes the target series; the feature frames keep every column except
# "DATE" and "CSUSHPISA".
df_supply_target = df_supply["CSUSHPISA"]
df_demand_target = df_demand["CSUSHPISA"]
df_supply_features = df_supply.drop(columns=["DATE", "CSUSHPISA"])
df_demand_features = df_demand.drop(columns=["DATE", "CSUSHPISA"])

In [17]:
# Min-max scale every supply feature column.
# fit_transform learns the per-column min/max and returns the scaled values in
# a single call, so no separate transform pass over the same data is needed.
# NOTE(review): the scaler is fit on every row, before the train/test split
# further down, so held-out rows contribute to the min/max. Consider fitting
# on the training rows only.
scaler = MinMaxScaler()
supply_scaled = scaler.fit_transform(df_supply_features)

# Wrap the scaled values in a DataFrame that carries the original column labels.
df_supply_scaled = pd.DataFrame(supply_scaled, columns=df_supply_features.columns)
print(df_supply_scaled.head())

   TLRESCONS    PERMIT  COMPUTSA  EVACANTUSQ176N    MSACSR
0   0.240093  0.750247  0.710017        0.196160  0.103734
1   0.251080  0.768791  0.722055        0.260027  0.058091
2   0.291810  0.827777  0.707653        0.330355  0.033195
3   0.336623  0.848491  0.743551        0.337959  0.074689
4   0.357853  0.861708  0.761823        0.383767  0.041494


In [18]:
# Min-max scale every demand feature column.
# `scaler` is the MinMaxScaler instance created in an earlier cell; calling
# fit_transform here refits it, so from this point on it holds the demand
# min/max rather than whatever it had learned before.
# fit_transform already returns the scaled values, so a second transform call
# on the same data is unnecessary.
demand_scaled = scaler.fit_transform(df_demand_features)

# Wrap the scaled values in a DataFrame that carries the original column labels.
df_demand_scaled = pd.DataFrame(demand_scaled, columns=df_demand_features.columns)
print(df_demand_scaled.head())

     POPTHM       GDP  InterestRateRealEstate  MORTGAGE15US   UMCSENT   
0  0.000000  0.000000                0.028940      0.747637  0.557198  \
1  0.014294  0.009028                0.019906      0.664590  0.774319   
2  0.030312  0.025563                0.000000      0.785178  0.775097   
3  0.045753  0.038950                0.010678      0.758471  0.837354   
4  0.058351  0.048798                0.055552      0.671937  0.978210   

      MSPUS  
0  0.000000  
1  0.019761  
2  0.020102  
3  0.043612  
4  0.090971  


In [20]:
# First model: predict the CSUSHPISA target from the scaled supply features.
# The label must be df_supply_target; passing the unscaled feature frame as
# the label would make the model learn to reproduce its own inputs.
# 20% of the rows are held out for testing; the split is seeded so it is
# reproducible across runs.
x_train_supply, x_test_supply, y_train_supply, y_test_supply = train_test_split(
    df_supply_scaled,
    df_supply_target,
    test_size=0.2,
    random_state=42,
)

In [None]:
# Compare candidate regressors on the supply training split with 5-fold
# cross-validation, then evaluate the winner on the held-out supply test split.
# The tree-based estimators are seeded so repeated runs give the same ranking.
models = {
    'Linear Regression': LinearRegression(),
    'Decision Tree': DecisionTreeRegressor(random_state=42),
    'Random Forest': RandomForestRegressor(random_state=42),
}

# Average cross-validated MSE per model name.
results = {}
for model_name, model in models.items():
    scores = cross_val_score(
        model,
        x_train_supply,
        y_train_supply,
        cv=5,
        scoring='neg_mean_squared_error',
    )
    # The scorer reports negated MSE (higher is better); flip the sign back.
    mse_scores = -scores
    avg_mse = mse_scores.mean()
    results[model_name] = avg_mse

# Lowest average MSE wins.
best_model = min(results, key=results.get)
best_model_instance = models[best_model]

# cross_val_score fits clones of the estimator, so the selected instance still
# has to be fit on the training split before it can predict.
best_model_instance.fit(x_train_supply, y_train_supply)

# Score the refit winner on the rows that were held out of training.
predictions = best_model_instance.predict(x_test_supply)
mse = mean_squared_error(y_test_supply, predictions)

print("Model Selection Results:")
for name, mse_score in results.items():
    print(f"{name}: MSE={mse_score}")

print(f"\nBest Model: {best_model}")
print(f"Best Model MSE on Testing Set: {mse}")