In [1]:
# Mount Google Drive into the Colab runtime at /content/drive so that files
# stored in Drive can be read through ordinary filesystem paths.
from google.colab import drive

drive.mount(mountpoint="/content/drive")

Mounted at /content/drive


In [7]:
# Importing the required libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error

In [3]:
# Importing the Dataset

# Read the housing CSV from the mounted Drive folder into a DataFrame.
housing_data = pd.read_csv("/content/drive/MyDrive/Datasets/USA_Housing.csv")
print(housing_data.head())
print("Column names : ",housing_data.columns)

# DataFrame.info() writes its report straight to stdout and returns None, so
# passing it to print() shows the report first and then the label followed by
# "None". Print the label, then let info() emit the report itself.
print("Dataset info : ")
housing_data.info()

   Avg. Area Income  Avg. Area House Age  Avg. Area Number of Rooms  \
0      79545.458574             5.682861                   7.009188   
1      79248.642455             6.002900                   6.730821   
2      61287.067179             5.865890                   8.512727   
3      63345.240046             7.188236                   5.586729   
4      59982.197226             5.040555                   7.839388   

   Avg. Area Number of Bedrooms  Area Population         Price  \
0                          4.09     23086.800503  1.059034e+06   
1                          3.09     40173.072174  1.505891e+06   
2                          5.13     36882.159400  1.058988e+06   
3                          3.26     34310.242831  1.260617e+06   
4                          4.23     26354.109472  6.309435e+05   

                                             Address  
0  208 Michael Ferry Apt. 674\nLaurabury, NE 3701...  
1  188 Johnson Views Suite 079\nLake Kathleen, CA...  
2  9127 Eli

In [4]:
# Data preprocessing

# The "Address" column cannot be processed by the steps that follow, so it is
# left out. drop() returns a new frame; housing_data itself is not modified.
data = housing_data.drop(columns=["Address"])

print(data.head())
print(data.columns)

   Avg. Area Income  Avg. Area House Age  Avg. Area Number of Rooms  \
0      79545.458574             5.682861                   7.009188   
1      79248.642455             6.002900                   6.730821   
2      61287.067179             5.865890                   8.512727   
3      63345.240046             7.188236                   5.586729   
4      59982.197226             5.040555                   7.839388   

   Avg. Area Number of Bedrooms  Area Population         Price  
0                          4.09     23086.800503  1.059034e+06  
1                          3.09     40173.072174  1.505891e+06  
2                          5.13     36882.159400  1.058988e+06  
3                          3.26     34310.242831  1.260617e+06  
4                          4.23     26354.109472  6.309435e+05  
Index(['Avg. Area Income', 'Avg. Area House Age', 'Avg. Area Number of Rooms',
       'Avg. Area Number of Bedrooms', 'Area Population', 'Price'],
      dtype='object')


In [8]:
# Independent variables: every column of `data` except "Price".
x = data.drop(columns=["Price"])
print(x.head())

# Dependent variable: the "Price" column on its own.
y = data["Price"]
print(y)

   Avg. Area Income  Avg. Area House Age  Avg. Area Number of Rooms  \
0      79545.458574             5.682861                   7.009188   
1      79248.642455             6.002900                   6.730821   
2      61287.067179             5.865890                   8.512727   
3      63345.240046             7.188236                   5.586729   
4      59982.197226             5.040555                   7.839388   

   Avg. Area Number of Bedrooms  Area Population  
0                          4.09     23086.800503  
1                          3.09     40173.072174  
2                          5.13     36882.159400  
3                          3.26     34310.242831  
4                          4.23     26354.109472  
0       1.059034e+06
1       1.505891e+06
2       1.058988e+06
3       1.260617e+06
4       6.309435e+05
            ...     
4995    1.060194e+06
4996    1.482618e+06
4997    1.030730e+06
4998    1.198657e+06
4999    1.298950e+06
Name: Price, Length: 5000, dtype: fl

In [9]:
# Splitting data

# Hold out 30% of the rows for testing. random_state pins the shuffle so that
# re-running the notebook produces the same split, and therefore the same
# scores in the evaluation step, instead of a different result on every run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.3, random_state=42
)
print("X  Train : ",x_train)
print("X Test : ",x_test)
print("y Train : ",y_train)
print("y Test : ",y_test)

X  Train :        Avg. Area Income  Avg. Area House Age  Avg. Area Number of Rooms  \
1071      51519.618537             6.944747                   5.543897   
4661      64168.858836             6.360502                   7.536445   
4236      56315.351575             5.064975                   8.708239   
537       79550.641384             6.637634                   7.531228   
409       67710.112979             5.854378                   8.482523   
...                ...                  ...                        ...   
198       67371.420857             5.333821                   6.624951   
236       60640.931881             5.165922                   5.355114   
2875      66537.252717             6.346080                   9.268967   
2495      70213.646002             5.913557                   5.829801   
1686      57869.268480             5.625299                   7.601622   

      Avg. Area Number of Bedrooms  Area Population  
1071                          4.13     43258.

In [17]:
# Training the model

def train_model(model, x_train, y_train):
    """Fit ``model`` on the given training data and return it.

    Calls ``model.fit(x_train, y_train)`` and hands back the same ``model``
    object that was passed in; whatever ``fit`` itself returns is discarded.
    """
    model.fit(x_train, y_train)
    return model

# Evaluating model
def evaluate_model(model, x_test, y_test):
    """Print test-set diagnostics for a fitted model.

    Runs ``model.predict(x_test)``, prints the ``.shape`` of ``y_test`` and of
    the predictions, then prints the R-squared score and the mean squared
    error of the predictions against ``y_test``. Nothing is returned.
    """
    predictions = model.predict(x_test)

    print("y test : ", y_test.shape)
    print("y pred : ", predictions.shape)

    # Each metric is computed and printed in turn: R2 first, then MSE.
    for label, metric in (
        ("R-squared (R2) Score:", r2_score),
        ("Mean Squared Error:", mean_squared_error),
    ):
        print(label, metric(y_test, predictions))



# Fit a fresh LinearRegression on the training split, then print its metrics
# on the held-out test split.
trained_model = train_model(
    model=LinearRegression(),
    x_train=x_train,
    y_train=y_train,
)
evaluate_model(model=trained_model, x_test=x_test, y_test=y_test)


y test :  (1500,)
y pred :  (1500,)
R-squared (R2) Score: 0.9167604756599034
Mean Squared Error: 10192648984.217157
