In [1]:
#importing required modules
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_log_error

In [2]:
# Read the training and test CSVs from the current working directory.
train, test = (pd.read_csv(csv_name) for csv_name in ("Train.csv", "Test.csv"))

In [3]:
# Label every row with the split it came from; the combined frame is later
# separated again by filtering on this "source" column.
train = train.assign(source="train")
test = test.assign(source="test")

In [4]:
# Feature-only copy of the training frame (target column removed) used for combining with test.
train_concat = train.drop("TARGET(PRICE_IN_LACS)", axis=1)

In [5]:
# Stack train features on top of test rows so encodings are applied to both at once.
# Row labels of each input are kept as-is, so index values repeat across the two splits.
df = pd.concat([train_concat, test], axis=0, ignore_index=False)

In [6]:
# (rows, columns) of the combined train + test frame.
df.shape

(98171, 12)

In [8]:
# Preview the first rows of the combined frame before any encoding.
df.head()

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,ADDRESS,LONGITUDE,LATITUDE,source
0,Owner,0,0,2,BHK,1300.236407,1,1,"Ksfc Layout,Bangalore",12.96991,77.59796,train
1,Dealer,0,0,2,BHK,1275.0,1,1,"Vishweshwara Nagar,Mysore",12.274538,76.644605,train
2,Owner,0,0,2,BHK,933.159722,1,1,"Jigani,Bangalore",12.778033,77.632191,train
3,Owner,0,1,2,BHK,929.921143,1,1,"Sector-1 Vaishali,Ghaziabad",28.6423,77.3445,train
4,Dealer,1,0,2,BHK,999.009247,0,1,"New Town,Kolkata",22.5922,88.484911,train


In [13]:
# Integer codes for the POSTED_BY categories.
replace_POSTED_BY = {'Dealer': 0, 'Owner': 1, 'Builder': 2}

# Fail fast on labels the mapping does not know about instead of letting them
# silently become NaN. Already-encoded integers are accepted so that running
# this cell a second time is a no-op (a plain dict .map() would wipe the
# column to NaN on re-run). Missing values are left as missing.
_posted_by_allowed = set(replace_POSTED_BY) | set(replace_POSTED_BY.values())
_posted_by_unknown = set(df['POSTED_BY'].dropna().unique()) - _posted_by_allowed
if _posted_by_unknown:
    raise ValueError(
        "Unexpected POSTED_BY values: {}".format(sorted(_posted_by_unknown, key=str))
    )

# Map raw labels to their code; pass anything else (existing codes, NaN) through unchanged.
df['POSTED_BY'] = df['POSTED_BY'].map(lambda label: replace_POSTED_BY.get(label, label))
df['POSTED_BY'].unique()

array([1, 0, 2], dtype=int64)

In [9]:
# Integer codes for the BHK_OR_RK categories.
replace_BHK_OR_RK = {'BHK': 0, 'RK': 1}

# Fail fast on labels the mapping does not know about instead of letting them
# silently become NaN. Already-encoded integers are accepted so that running
# this cell a second time is a no-op (a plain dict .map() would wipe the
# column to NaN on re-run). Missing values are left as missing.
_bhk_or_rk_allowed = set(replace_BHK_OR_RK) | set(replace_BHK_OR_RK.values())
_bhk_or_rk_unknown = set(df["BHK_OR_RK"].dropna().unique()) - _bhk_or_rk_allowed
if _bhk_or_rk_unknown:
    raise ValueError(
        "Unexpected BHK_OR_RK values: {}".format(sorted(_bhk_or_rk_unknown, key=str))
    )

# Map raw labels to their code; pass anything else (existing codes, NaN) through unchanged.
df["BHK_OR_RK"] = df["BHK_OR_RK"].map(lambda label: replace_BHK_OR_RK.get(label, label))

In [12]:
# Distinct values left in BHK_OR_RK after encoding (expect only the integer codes).
df["BHK_OR_RK"].unique()

array([0, 1], dtype=int64)

In [14]:
# Separate the combined frame back into its two splits using the "source" tag.
source_tag = df["source"]
train_preprocessed = df.loc[source_tag.eq("train")]
test_preprocessed = df.loc[source_tag.eq("test")]

In [15]:
# (rows, columns) of the training split recovered from the combined frame.
train_preprocessed.shape

(29451, 12)

In [16]:
# (rows, columns) of the test split recovered from the combined frame.
test_preprocessed.shape

(68720, 12)

In [17]:
# Column labels of the combined frame whose dtype is not numeric.
non_numeric_part = df.select_dtypes(exclude="number")
df_category_columns = non_numeric_part.columns
df_category_columns

Index(['ADDRESS', 'source'], dtype='object')

In [18]:
# Remove the helper "source" tag from the training split.
# drop() returns a new frame instead of mutating a frame produced by filtering
# `df`, and errors="ignore" makes the cell safe to re-run (a second `del`
# would raise KeyError because the column is already gone).
train_preprocessed = train_preprocessed.drop(columns=["source"], errors="ignore")
train_preprocessed

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,ADDRESS,LONGITUDE,LATITUDE
0,1,0,0,2,0,1300.236407,1,1,"Ksfc Layout,Bangalore",12.969910,77.597960
1,0,0,0,2,0,1275.000000,1,1,"Vishweshwara Nagar,Mysore",12.274538,76.644605
2,1,0,0,2,0,933.159722,1,1,"Jigani,Bangalore",12.778033,77.632191
3,1,0,1,2,0,929.921143,1,1,"Sector-1 Vaishali,Ghaziabad",28.642300,77.344500
4,0,1,0,2,0,999.009247,0,1,"New Town,Kolkata",22.592200,88.484911
...,...,...,...,...,...,...,...,...,...,...,...
29446,1,0,0,3,0,2500.000000,1,1,"Shamshabad Road,Agra",27.140626,78.043277
29447,1,0,0,2,0,769.230769,1,1,"E3-108, Lake View Recidency,,Vapi",39.945409,-86.150721
29448,0,0,0,2,0,1022.641509,1,1,"Ajmer Road,Jaipur",26.928785,75.828002
29449,1,0,0,2,0,927.079009,1,1,"Sholinganallur,Chennai",12.900150,80.227910


In [19]:
# Remove the helper "source" tag from the test split.
# drop() returns a new frame instead of mutating a frame produced by filtering
# `df`, and errors="ignore" makes the cell safe to re-run (a second `del`
# would raise KeyError because the column is already gone).
test_preprocessed = test_preprocessed.drop(columns=["source"], errors="ignore")
test_preprocessed

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,ADDRESS,LONGITUDE,LATITUDE
0,1,0,0,1,0,545.171340,1,1,"Kamrej,Surat",21.262000,73.047700
1,0,1,1,2,0,800.000000,0,0,"Panvel,Lalitpur",18.966114,73.148278
2,0,0,0,2,0,1257.096513,1,1,"New Town,Kolkata",22.592200,88.484911
3,0,0,0,3,0,1400.329489,1,1,"Kalwar Road,Jaipur",26.988300,75.584600
4,1,0,0,1,0,430.477830,1,1,"Mai Mandir,Nadiad",22.700000,72.870000
...,...,...,...,...,...,...,...,...,...,...,...
68715,0,0,1,2,0,856.555505,1,1,"Thane West,Maharashtra",19.180000,72.963330
68716,0,0,1,3,0,2304.147465,1,1,"Sector-66A Mohali,Mohali",30.661104,76.746082
68717,0,1,1,1,0,33362.792750,0,0,"Balkum,Maharashtra",19.222101,72.988231
68718,0,0,0,2,0,1173.708920,1,1,"Hadapsar,Pune",18.496670,73.941670


In [20]:
# Training feature matrix: everything except the free-text ADDRESS column.
train_x = train_preprocessed.drop("ADDRESS", axis=1)

In [21]:
# Test feature matrix: everything except the free-text ADDRESS column.
test_x = test_preprocessed.drop("ADDRESS", axis=1)

In [22]:
# Regression target, taken from the original (un-concatenated) training frame.
train_y = train.loc[:, "TARGET(PRICE_IN_LACS)"]

In [23]:
# Sanity check: train_x and test_x should have the same number of columns,
# and train_x and train_y the same number of rows.
train_x.shape, test_x.shape, train_y.shape

((29451, 10), (68720, 10), (29451,))

In [24]:
# Ordinary least-squares linear regression with scikit-learn's default settings.
model = LinearRegression()

In [25]:
# Fit on the full training feature matrix; no rows are held out for validation.
model.fit(train_x, train_y)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [46]:
# Predict for both splits and take the absolute value of every prediction.
# NOTE(review): presumably done because mean_squared_log_error rejects negative
# values; abs() mirrors a negative prediction to an equally large positive
# price rather than flooring it -- confirm this is the intended treatment.
pred_train = np.abs(model.predict(train_x))
pred_test = np.abs(model.predict(test_x))

In [42]:
# Load the submission template from the working directory.
sample = pd.read_csv("sample_submission.csv")

In [43]:
# Preview the template's layout (a single target column of placeholder values).
sample.head()

Unnamed: 0,TARGET(PRICE_IN_LACS)
0,0.0
1,0.0
2,0.0
3,0.0
4,0.0


In [44]:
# Overwrite the template's target column with the test predictions and save it
# without the row index.
# NOTE(review): the execution counts show this cell ran as In [44], before the
# prediction/abs() cell (In [46]); re-run the notebook top to bottom so the
# saved file reflects the current pred_test.
target_column = "TARGET(PRICE_IN_LACS)"
sample[target_column] = pred_test
sample.to_csv("HousePricePrediction_Submission.csv", index=False)

In [47]:
# Mean squared log error on the training rows themselves. The model was fit on
# these same rows, so this is an in-sample score, not a held-out estimate.
mean_squared_log_error(train_y, pred_train) 

1.1605873720731001

In [37]:
# Preview the final all-numeric feature matrix that was passed to the model.
train_x.head()

Unnamed: 0,POSTED_BY,UNDER_CONSTRUCTION,RERA,BHK_NO.,BHK_OR_RK,SQUARE_FT,READY_TO_MOVE,RESALE,LONGITUDE,LATITUDE
0,1,0,0,2,0,1300.236407,1,1,12.96991,77.59796
1,0,0,0,2,0,1275.0,1,1,12.274538,76.644605
2,1,0,0,2,0,933.159722,1,1,12.778033,77.632191
3,1,0,1,2,0,929.921143,1,1,28.6423,77.3445
4,0,1,0,2,0,999.009247,0,1,22.5922,88.484911
