In [37]:
# Import necessary packages
import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression

from sklearn.preprocessing import MinMaxScaler, OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [38]:
# Load the training data
train_df = pd.read_csv("../Dataset/data.csv")
train_df.head()

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-05-02 00:00:00,313000.0,3.0,1.5,1340,7912,1.5,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,2014-05-02 00:00:00,2384000.0,5.0,2.5,3650,9050,2.0,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA
2,2014-05-02 00:00:00,342000.0,3.0,2.0,1930,11947,1.0,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,2014-05-02 00:00:00,420000.0,3.0,2.25,2000,8030,1.0,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,2014-05-02 00:00:00,550000.0,4.0,2.5,1940,10500,1.0,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA


In [39]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   date           4600 non-null   object 
 1   price          4600 non-null   float64
 2   bedrooms       4600 non-null   float64
 3   bathrooms      4600 non-null   float64
 4   sqft_living    4600 non-null   int64  
 5   sqft_lot       4600 non-null   int64  
 6   floors         4600 non-null   float64
 7   waterfront     4600 non-null   int64  
 8   view           4600 non-null   int64  
 9   condition      4600 non-null   int64  
 10  sqft_above     4600 non-null   int64  
 11  sqft_basement  4600 non-null   int64  
 12  yr_built       4600 non-null   int64  
 13  yr_renovated   4600 non-null   int64  
 14  street         4600 non-null   object 
 15  city           4600 non-null   object 
 16  statezip       4600 non-null   object 
 17  country        4600 non-null   object 
dtypes: float

In [40]:
train_df.shape

(4600, 18)

In [41]:
train_df['price']     = train_df['price'].astype('int64')
train_df['bedrooms']  = train_df['bedrooms'].astype('int64')
train_df['bathrooms'] = train_df['bathrooms'].astype('int64')
train_df['floors']    = train_df['floors'].astype('int64')
train_df['street']    = train_df['street'].astype('string')
train_df['city']      = train_df['city'].astype('string')
train_df['statezip']  = train_df['statezip'].astype('string')
train_df['country']   = train_df['country'].astype('string')

In [42]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4600 entries, 0 to 4599
Data columns (total 18 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   date           4600 non-null   object
 1   price          4600 non-null   int64 
 2   bedrooms       4600 non-null   int64 
 3   bathrooms      4600 non-null   int64 
 4   sqft_living    4600 non-null   int64 
 5   sqft_lot       4600 non-null   int64 
 6   floors         4600 non-null   int64 
 7   waterfront     4600 non-null   int64 
 8   view           4600 non-null   int64 
 9   condition      4600 non-null   int64 
 10  sqft_above     4600 non-null   int64 
 11  sqft_basement  4600 non-null   int64 
 12  yr_built       4600 non-null   int64 
 13  yr_renovated   4600 non-null   int64 
 14  street         4600 non-null   string
 15  city           4600 non-null   string
 16  statezip       4600 non-null   string
 17  country        4600 non-null   string
dtypes: int64(13), object(1), str

In [43]:
train_df.drop_duplicates(inplace=True)

In [44]:
train_df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
price,4600.0,551962.975435,563834.706028,0.0,322875.0,460943.0,654962.5,26590000.0
bedrooms,4600.0,3.40087,0.908848,0.0,3.0,3.0,4.0,9.0
bathrooms,4600.0,1.788913,0.752185,0.0,1.0,2.0,2.0,8.0
sqft_living,4600.0,2139.346957,963.206916,370.0,1460.0,1980.0,2620.0,13540.0
sqft_lot,4600.0,14852.516087,35884.436145,638.0,5000.75,7683.0,11001.25,1074218.0
floors,4600.0,1.45913,0.552194,1.0,1.0,1.0,2.0,3.0
waterfront,4600.0,0.007174,0.084404,0.0,0.0,0.0,0.0,1.0
view,4600.0,0.240652,0.778405,0.0,0.0,0.0,0.0,4.0
condition,4600.0,3.451739,0.67723,1.0,3.0,3.0,4.0,5.0
sqft_above,4600.0,1827.265435,862.168977,370.0,1190.0,1590.0,2300.0,9410.0


In [45]:
(train_df.price == 0).sum()

49

In [46]:
train_df['price'].replace(0, np.nan, inplace = True)

In [47]:
(train_df.price == 0).sum()

0

In [48]:
train_df.dropna(inplace=True)
(train_df.price == 0).sum()

0

In [49]:
train_df['date'] = pd.to_datetime(train_df['date'])
train_df

Unnamed: 0,date,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-05-02,313000.0,3,1,1340,7912,1,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,2014-05-02,2384000.0,5,2,3650,9050,2,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA
2,2014-05-02,342000.0,3,2,1930,11947,1,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,2014-05-02,420000.0,3,2,2000,8030,1,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,2014-05-02,550000.0,4,2,1940,10500,1,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4595,2014-07-09,308166.0,3,1,1510,6360,1,0,0,4,1510,0,1954,1979,501 N 143rd St,Seattle,WA 98133,USA
4596,2014-07-09,534333.0,3,2,1460,7573,2,0,0,3,1460,0,1983,2009,14855 SE 10th Pl,Bellevue,WA 98007,USA
4597,2014-07-09,416904.0,3,2,3010,7014,2,0,0,3,3010,0,2009,0,759 Ilwaco Pl NE,Renton,WA 98059,USA
4598,2014-07-10,203400.0,4,2,2090,6630,1,0,0,3,1070,1020,1974,0,5148 S Creston St,Seattle,WA 98178,USA


In [50]:
train_df.insert(1, "year", train_df.date.dt.year)
train_df.head()

Unnamed: 0,date,year,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,street,city,statezip,country
0,2014-05-02,2014,313000.0,3,1,1340,7912,1,0,0,3,1340,0,1955,2005,18810 Densmore Ave N,Shoreline,WA 98133,USA
1,2014-05-02,2014,2384000.0,5,2,3650,9050,2,0,4,5,3370,280,1921,0,709 W Blaine St,Seattle,WA 98119,USA
2,2014-05-02,2014,342000.0,3,2,1930,11947,1,0,0,4,1930,0,1966,0,26206-26214 143rd Ave SE,Kent,WA 98042,USA
3,2014-05-02,2014,420000.0,3,2,2000,8030,1,0,0,4,1000,1000,1963,0,857 170th Pl NE,Bellevue,WA 98008,USA
4,2014-05-02,2014,550000.0,4,2,1940,10500,1,0,0,4,1140,800,1976,1992,9105 170th Ave NE,Redmond,WA 98052,USA


In [51]:
train_df = train_df.drop(['date', 'street', 'statezip', 'country','city'], axis = 1)
train_df

Unnamed: 0,year,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated
0,2014,313000.0,3,1,1340,7912,1,0,0,3,1340,0,1955,2005
1,2014,2384000.0,5,2,3650,9050,2,0,4,5,3370,280,1921,0
2,2014,342000.0,3,2,1930,11947,1,0,0,4,1930,0,1966,0
3,2014,420000.0,3,2,2000,8030,1,0,0,4,1000,1000,1963,0
4,2014,550000.0,4,2,1940,10500,1,0,0,4,1140,800,1976,1992
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4595,2014,308166.0,3,1,1510,6360,1,0,0,4,1510,0,1954,1979
4596,2014,534333.0,3,2,1460,7573,2,0,0,3,1460,0,1983,2009
4597,2014,416904.0,3,2,3010,7014,2,0,0,3,3010,0,2009,0
4598,2014,203400.0,4,2,2090,6630,1,0,0,3,1070,1020,1974,0


In [52]:
train_df['Age'] = 2023 - train_df['yr_built']
train_df['AgeRenovated'] = 2023 - train_df['yr_renovated']
train_df.head()

Unnamed: 0,year,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,yr_built,yr_renovated,Age,AgeRenovated
0,2014,313000.0,3,1,1340,7912,1,0,0,3,1340,0,1955,2005,68,18
1,2014,2384000.0,5,2,3650,9050,2,0,4,5,3370,280,1921,0,102,2023
2,2014,342000.0,3,2,1930,11947,1,0,0,4,1930,0,1966,0,57,2023
3,2014,420000.0,3,2,2000,8030,1,0,0,4,1000,1000,1963,0,60,2023
4,2014,550000.0,4,2,1940,10500,1,0,0,4,1140,800,1976,1992,47,31


In [53]:
train_df = train_df.drop(['yr_built','yr_renovated'], axis = 1)
train_df

Unnamed: 0,year,price,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,Age,AgeRenovated
0,2014,313000.0,3,1,1340,7912,1,0,0,3,1340,0,68,18
1,2014,2384000.0,5,2,3650,9050,2,0,4,5,3370,280,102,2023
2,2014,342000.0,3,2,1930,11947,1,0,0,4,1930,0,57,2023
3,2014,420000.0,3,2,2000,8030,1,0,0,4,1000,1000,60,2023
4,2014,550000.0,4,2,1940,10500,1,0,0,4,1140,800,47,31
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4595,2014,308166.0,3,1,1510,6360,1,0,0,4,1510,0,69,44
4596,2014,534333.0,3,2,1460,7573,2,0,0,3,1460,0,40,14
4597,2014,416904.0,3,2,3010,7014,2,0,0,3,3010,0,14,2023
4598,2014,203400.0,4,2,2090,6630,1,0,0,3,1070,1020,49,2023


In [54]:
columns = train_df.columns

In [55]:
#scaler = MinMaxScaler(feature_range = (0, 1))
#normal = pd.DataFrame(scaler.fit_transform(train_df), columns = columns)
#normal.head(20)

In [56]:
#train_df = normal

In [57]:
y = train_df["price"]
train_df = train_df.drop('price', axis=1)

In [58]:
train_df.tail(20)

Unnamed: 0,year,bedrooms,bathrooms,sqft_living,sqft_lot,floors,waterfront,view,condition,sqft_above,sqft_basement,Age,AgeRenovated
4579,2014,2,1,1990,4000,1,0,0,5,1090,900,71,25
4580,2014,3,1,1560,7552,1,0,0,4,910,650,75,2023
4581,2014,3,1,1740,4200,1,0,0,4,1640,100,103,2023
4582,2014,2,1,1290,4650,1,0,0,4,1290,0,117,33
4583,2014,4,2,1700,8640,1,0,0,3,850,850,68,13
4584,2014,3,1,1880,10032,1,0,0,4,1880,0,39,2023
4585,2014,3,1,1890,3330,1,0,0,4,1390,500,122,2023
4586,2014,3,2,1620,1075,3,0,0,3,1540,80,14,2023
4587,2014,2,1,770,8149,1,0,0,3,770,0,75,29
4589,2014,3,1,1040,8892,1,0,0,4,800,240,65,51


In [59]:
X_train, X_test, y_train, y_test = train_test_split(train_df, y, test_size=0.3, random_state=42)

In [60]:
X_train.shape

(3185, 13)

In [61]:
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 3185 entries, 1356 to 860
Data columns (total 13 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   year           3185 non-null   int32
 1   bedrooms       3185 non-null   int64
 2   bathrooms      3185 non-null   int64
 3   sqft_living    3185 non-null   int64
 4   sqft_lot       3185 non-null   int64
 5   floors         3185 non-null   int64
 6   waterfront     3185 non-null   int64
 7   view           3185 non-null   int64
 8   condition      3185 non-null   int64
 9   sqft_above     3185 non-null   int64
 10  sqft_basement  3185 non-null   int64
 11  Age            3185 non-null   int64
 12  AgeRenovated   3185 non-null   int64
dtypes: int32(1), int64(12)
memory usage: 335.9 KB


In [62]:
model = LinearRegression()
model.fit(X_train, y_train)

Y_pred = model.predict(X_test)

In [63]:
print("R2 Score                : ", r2_score(y_test, Y_pred))

R2 Score                :  0.5820636710088838


In [64]:
y_train_pred = model.predict(X_train)
r2_score(y_train,y_train_pred )

0.18821891031220372

In [65]:
# Initialize the Linear Regression model
linear_model = LinearRegression()

# Train the model
linear_model.fit(X_train, y_train)

# Make predictions on the training set
y_train_pred = linear_model.predict(X_train)

# Make predictions on the test set
y_test_pred = linear_model.predict(X_test)

# Evaluate the model
mse_train = mean_squared_error(y_train, y_train_pred)
print(f'Mean Squared Error (MSE) for Training Set: {mse_train}')

r2_train = r2_score(y_train, y_train_pred)
print(f'R-squared Score for Training Set: {r2_train}')

mse_test = mean_squared_error(y_test, y_test_pred)
print(f'Mean Squared Error (MSE) for Test Set: {mse_test}')

r2_test = r2_score(y_test, y_test_pred)
print(f'R-squared Score for Test Set: {r2_test}')

Mean Squared Error (MSE) for Training Set: 323783591834.0823
R-squared Score for Training Set: 0.18821891031220372
Mean Squared Error (MSE) for Test Set: 53971786621.21683
R-squared Score for Test Set: 0.5820636710088838


In [72]:
#import numpy as np
# Your input data
input_data = np.array([[2014,	2,	1,	1990,	4000,	1,	0,	0,	5,	1090,	900,	71,	25]])

# Use the trained linear model to make predictions for the single input
prediction = linear_model.predict(input_data)

# Print the prediction
print(prediction[0])


619734.5828384324


