### import the necessary libraries

In [16]:
import pandas as pd 
import numpy as np 
import seaborn as sns 


## Import machine learninhg libraries 
# import the regression algorithm 

from sklearn.linear_model import LinearRegression, Lasso, Ridge, ElasticNet,HuberRegressor,SGDRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR 
from sklearn.preprocessing import PolynomialFeatures
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
import lightgbm as lgb
import xgboost as xgb
from sklearn.neural_network import MLPRegressor
from sklearn.pipeline import Pipeline


# import the pickle to dump these file into the pkl file 
import pickle as pkl



### Read the datset 

In [3]:
df = pd.read_csv(r"D:\NIT Course\Practice\ML Projects\House Price Prediction\USA_Housing.csv")
df.head()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price,Address
0,79545.458574,5.682861,7.009188,4.09,23086.800503,1059034.0,"208 Michael Ferry Apt. 674\nLaurabury, NE 3701..."
1,79248.642455,6.0029,6.730821,3.09,40173.072174,1505891.0,"188 Johnson Views Suite 079\nLake Kathleen, CA..."
2,61287.067179,5.86589,8.512727,5.13,36882.1594,1058988.0,"9127 Elizabeth Stravenue\nDanieltown, WI 06482..."
3,63345.240046,7.188236,5.586729,3.26,34310.242831,1260617.0,USS Barnett\nFPO AP 44820
4,59982.197226,5.040555,7.839388,4.23,26354.109472,630943.5,USNS Raymond\nFPO AE 09386


### Explore the dataset 

In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 7 columns):
 #   Column                        Non-Null Count  Dtype  
---  ------                        --------------  -----  
 0   Avg. Area Income              5000 non-null   float64
 1   Avg. Area House Age           5000 non-null   float64
 2   Avg. Area Number of Rooms     5000 non-null   float64
 3   Avg. Area Number of Bedrooms  5000 non-null   float64
 4   Area Population               5000 non-null   float64
 5   Price                         5000 non-null   float64
 6   Address                       5000 non-null   object 
dtypes: float64(6), object(1)
memory usage: 273.6+ KB


In [5]:
df.describe()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0
mean,68583.108984,5.977222,6.987792,3.98133,36163.516039,1232073.0
std,10657.991214,0.991456,1.005833,1.234137,9925.650114,353117.6
min,17796.63119,2.644304,3.236194,2.0,172.610686,15938.66
25%,61480.562388,5.322283,6.29925,3.14,29403.928702,997577.1
50%,68804.286404,5.970429,7.002902,4.05,36199.406689,1232669.0
75%,75783.338666,6.650808,7.665871,4.49,42861.290769,1471210.0
max,107701.748378,9.519088,10.759588,6.5,69621.713378,2469066.0


In [7]:
## Drop the unnecessary columns
df.drop(["Address"], axis = 1, inplace = True )

In [9]:
df.corr()

Unnamed: 0,Avg. Area Income,Avg. Area House Age,Avg. Area Number of Rooms,Avg. Area Number of Bedrooms,Area Population,Price
Avg. Area Income,1.0,-0.002007,-0.011032,0.019788,-0.016234,0.639734
Avg. Area House Age,-0.002007,1.0,-0.009428,0.006149,-0.018743,0.452543
Avg. Area Number of Rooms,-0.011032,-0.009428,1.0,0.462695,0.00204,0.335664
Avg. Area Number of Bedrooms,0.019788,0.006149,0.462695,1.0,-0.022168,0.171071
Area Population,-0.016234,-0.018743,0.00204,-0.022168,1.0,0.408556
Price,0.639734,0.452543,0.335664,0.171071,0.408556,1.0


In [10]:
## check if the datset have any null values
df.isnull().sum()

Avg. Area Income                0
Avg. Area House Age             0
Avg. Area Number of Rooms       0
Avg. Area Number of Bedrooms    0
Area Population                 0
Price                           0
dtype: int64

### Seperate the Dependent and independent variable 

In [11]:
X = df.drop(["Price"], axis = 1)
y = df["Price"]

### seperate the data into training and testing sets 

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state=0)

### Fit the data into the models 

In [17]:
models = {
    "Linear Regression" : LinearRegression(),
    "Lasso" : Lasso(),
    "Ridge" : Ridge(),
    "KNeighbors Regressor" : KNeighborsRegressor(),
    "Random Forest" : RandomForestRegressor(),
    "ANN Regression" : MLPRegressor(hidden_layer_sizes=(100,), max_iter=1000),
    'LGBM': lgb.LGBMRegressor(),
    'XGBoost': xgb.XGBRegressor(),
    "PolynomialRegression" : Pipeline([
               ("poly", PolynomialFeatures(degree=4)),
             ("Linear Regression",LinearRegression())
         ]),
    "SVR Regression" : SVR(),
    "HuberRegressor" : HuberRegressor(),
    "Elastic Net Regression" : ElasticNet()
    
}

In [None]:
result = []

for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    
    # find the metrics 
    MSE = mean_squared_error(y_test, y_pred)
    MAE = mean_absolute_error(y_test, y_pred)
    r2score = r2_score(y_test, y_pred)
    
    # append the MSE , MAE and r2 Score
    result.append(
        {
            "Name" : name,
            "MSE" : MSE,
            "MAE" : MAE,
            "R2 Score" : r2score
        }
    )
    
    # pickle the file 
    

[{'Name': 'Linear Regression', 'MSE': 10549721686.160313, 'MAE': 82657.94605892441, 'R2 Score': 0.9146454505137985}]
[{'Name': 'Linear Regression', 'MSE': 10549721686.160313, 'MAE': 82657.94605892441, 'R2 Score': 0.9146454505137985}, {'Name': 'Lasso', 'MSE': 10549717660.356379, 'MAE': 82657.94662172231, 'R2 Score': 0.9146454830853384}]
[{'Name': 'Linear Regression', 'MSE': 10549721686.160313, 'MAE': 82657.94605892441, 'R2 Score': 0.9146454505137985}, {'Name': 'Lasso', 'MSE': 10549717660.356379, 'MAE': 82657.94662172231, 'R2 Score': 0.9146454830853384}, {'Name': 'Ridge', 'MSE': 10549745186.670168, 'MAE': 82659.67244409773, 'R2 Score': 0.9146452603784101}]
[{'Name': 'Linear Regression', 'MSE': 10549721686.160313, 'MAE': 82657.94605892441, 'R2 Score': 0.9146454505137985}, {'Name': 'Lasso', 'MSE': 10549717660.356379, 'MAE': 82657.94662172231, 'R2 Score': 0.9146454830853384}, {'Name': 'Ridge', 'MSE': 10549745186.670168, 'MAE': 82659.67244409773, 'R2 Score': 0.9146452603784101}, {'Name': 'KN