In [26]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression

# Read the training and test CSV splits from the working directory, then
# preview the first rows of each as plain text.
train_data, test_data = (
    pd.read_csv(csv_name)
    for csv_name in ('kc_house_train_data.csv', 'kc_house_test_data.csv')
)
print(train_data.head(), test_data.head(), sep='\n')

In [9]:
# DataFrame.info() writes its summary to stdout itself and returns None, so it
# is called directly; wrapping it in print() only appended a stray "None" line.
train_data.info()
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17384 entries, 0 to 17383
Data columns (total 21 columns):
id               17384 non-null int64
date             17384 non-null object
price            17384 non-null float64
bedrooms         17384 non-null int64
bathrooms        17384 non-null float64
sqft_living      17384 non-null int64
sqft_lot         17384 non-null int64
floors           17384 non-null float64
waterfront       17384 non-null int64
view             17384 non-null int64
condition        17384 non-null int64
grade            17384 non-null int64
sqft_above       17384 non-null int64
sqft_basement    17384 non-null int64
yr_built         17384 non-null int64
yr_renovated     17384 non-null int64
zipcode          17384 non-null int64
lat              17384 non-null float64
long             17384 non-null float64
sqft_living15    17384 non-null int64
sqft_lot15       17384 non-null int64
dtypes: float64(5), int64(15), object(1)
memory usage: 2.8+ MB
None
<class 'pandas

Change column types

In [13]:
# Target dtype for each of the 21 raw CSV columns. id, zipcode, date and floors
# are cast to str; every other column is cast to int or float as listed.
dtype_dict = {
    'bathrooms': float, 'waterfront': int, 'sqft_above': int,
    'sqft_living15': float, 'grade': int, 'yr_renovated': int,
    'price': float, 'bedrooms': float, 'zipcode': str, 'long': float,
    'sqft_lot15': float, 'sqft_living': float, 'floors': str,
    'condition': int, 'lat': float, 'date': str, 'sqft_basement': int,
    'yr_built': int, 'id': str, 'sqft_lot': int, 'view': int,
}

# Cast each frame with a single astype(mapping) call. The mapping is keyed by
# the raw column names only, so columns added to the frames later are left
# untouched and this cell can be re-run safely. (Looping over
# train_data.columns raised KeyError as soon as a derived column existed, and
# silently assumed test_data had exactly the same columns.)
train_data = train_data.astype(dtype_dict)
test_data = test_data.astype(dtype_dict)

In [15]:
# Confirm the dtype changes. DataFrame.info() prints its own report and returns
# None, so calling it without print() avoids a trailing "None" in the output.
train_data.info()
test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17384 entries, 0 to 17383
Data columns (total 21 columns):
id               17384 non-null object
date             17384 non-null object
price            17384 non-null float64
bedrooms         17384 non-null float64
bathrooms        17384 non-null float64
sqft_living      17384 non-null float64
sqft_lot         17384 non-null int64
floors           17384 non-null object
waterfront       17384 non-null int64
view             17384 non-null int64
condition        17384 non-null int64
grade            17384 non-null int64
sqft_above       17384 non-null int64
sqft_basement    17384 non-null int64
yr_built         17384 non-null int64
yr_renovated     17384 non-null int64
zipcode          17384 non-null object
lat              17384 non-null float64
long             17384 non-null float64
sqft_living15    17384 non-null float64
sqft_lot15       17384 non-null float64
dtypes: float64(8), int64(9), object(4)
memory usage: 2.8+ MB
None
<class

Add new columns to the train and test datasets

In [17]:
# Derived features for the training split.
# bedrooms_squared: bedrooms multiplied by itself.
train_data['bedrooms_squared'] = train_data['bedrooms'] * train_data['bedrooms']
# bed_bath_rooms: interaction term between bedrooms and bathrooms.
train_data['bed_bath_rooms'] = train_data['bedrooms'] * train_data['bathrooms']
# log_sqft_living: natural log of living area. np.log is a vectorised ufunc, so
# it is applied to the whole column at once instead of once per row via apply().
train_data['log_sqft_living'] = np.log(train_data['sqft_living'])
# lat_plus_long: plain sum of the two coordinates.
train_data['lat_plus_long'] = train_data['lat'] + train_data['long']

In [18]:
# Derived features for the test split, built the same way as on the training split.
# bedrooms_squared: bedrooms multiplied by itself.
test_data['bedrooms_squared'] = test_data['bedrooms'] * test_data['bedrooms']
# bed_bath_rooms: interaction term between bedrooms and bathrooms.
test_data['bed_bath_rooms'] = test_data['bedrooms'] * test_data['bathrooms']
# log_sqft_living: natural log of living area. np.log is a vectorised ufunc, so
# it is applied to the whole column at once instead of once per row via apply().
test_data['log_sqft_living'] = np.log(test_data['sqft_living'])
# lat_plus_long: plain sum of the two coordinates.
test_data['lat_plus_long'] = test_data['lat'] + test_data['long']

In [19]:
# Preview both frames (now including the derived columns) as plain text.
print(train_data.head(), test_data.head(), sep='\n')

           id             date     price  bedrooms  bathrooms  sqft_living  \
0  7129300520  20141013T000000  221900.0       3.0       1.00       1180.0   
1  6414100192  20141209T000000  538000.0       3.0       2.25       2570.0   
2  5631500400  20150225T000000  180000.0       2.0       1.00        770.0   
3  2487200875  20141209T000000  604000.0       4.0       3.00       1960.0   
4  1954400510  20150218T000000  510000.0       3.0       2.00       1680.0   

   sqft_lot floors  waterfront  view      ...        yr_renovated  zipcode  \
0      5650    1.0           0     0      ...                   0    98178   
1      7242    2.0           0     0      ...                1991    98125   
2     10000    1.0           0     0      ...                   0    98028   
3      5000    1.0           0     0      ...                   0    98136   
4      8080    1.0           0     0      ...                   0    98074   

       lat     long  sqft_living15  sqft_lot15 bedrooms_square

Qn1: mean of bedrooms_squared on the test data

In [20]:
# Average of the squared bedroom count over the test split.
test_data['bedrooms_squared'].mean()

12.4466777015843

Qn2: mean of bed_bath_rooms on the test data

In [22]:
# Average of the bedrooms * bathrooms interaction term over the test split.
test_data['bed_bath_rooms'].mean()

7.5039016315913925

Qn3: mean of log_sqft_living on the test data

In [23]:
# Average of the natural log of sqft_living over the test split.
test_data['log_sqft_living'].mean()

7.550274679645921

Qn4: mean of lat_plus_long on the test data

In [24]:
# Average of lat + long over the test split.
test_data['lat_plus_long'].mean()

-74.65333355403185

Linear Regression

In [52]:
# One shared estimator: it is re-fit for each feature set further down, so
# coef_ and predict() always reflect whichever fit() call ran most recently.
model = LinearRegression()

Model1

In [53]:
# Model1: the five raw features. The target is kept as a one-column DataFrame
# (selected with a list) rather than a Series.
model1_features = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long']
target_column = ['price']
X_train1, X_test1 = train_data[model1_features], test_data[model1_features]
y_train, y_test = train_data[target_column], test_data[target_column]

In [54]:
# Sanity-check the Model1 design matrices and the targets.
print(X_train1.head(), X_test1.head(), y_train.head(), y_test.head(), sep='\n')

   sqft_living  bedrooms  bathrooms      lat     long
0       1180.0       3.0       1.00  47.5112 -122.257
1       2570.0       3.0       2.25  47.7210 -122.319
2        770.0       2.0       1.00  47.7379 -122.233
3       1960.0       4.0       3.00  47.5208 -122.393
4       1680.0       3.0       2.00  47.6168 -122.045
   sqft_living  bedrooms  bathrooms      lat     long
0       1430.0       3.0        1.0  47.7558 -122.229
1       2950.0       4.0        3.0  47.5714 -122.375
2       1710.0       3.0        2.0  47.3048 -122.218
3       2320.0       3.0        2.5  47.5391 -122.070
4       1090.0       3.0        1.0  47.6889 -122.375
      price
0  221900.0
1  538000.0
2  180000.0
3  604000.0
4  510000.0
      price
0  310000.0
1  650000.0
2  233000.0
3  580500.0
4  535000.0


Training and Predicting Using Model1

In [55]:
# Fit the shared estimator on the Model1 features.
model.fit(X_train1, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [56]:
# Coefficients follow the column order of X_train1:
# sqft_living, bedrooms, bathrooms, lat, long.
# For Model1 the bathrooms coefficient (third entry) is positive.
model.coef_

array([[  3.12258646e+02,  -5.95865332e+04,   1.57067421e+04,
          6.58619264e+05,  -3.09374351e+05]])

Calculate Training and Test RSS

In [73]:
# Model1 predictions on the training features. `model` is shared and re-fit for
# the other feature sets, so this is only valid while its latest fit used X_train1.
train_pred1 = model.predict(X_train1)

In [74]:
# Training residual sum of squares for Model1; the result is a one-entry
# Series keyed by 'price' because y_train is a one-column DataFrame.
rss_train1 = ((y_train - train_pred1) ** 2).sum()
rss_train1

price    9.678800e+14
dtype: float64

In [75]:
# Model1 predictions on the test features. `model` is shared and re-fit for the
# other feature sets, so this is only valid while its latest fit used X_train1.
test_pred1 = model.predict(X_test1)

In [76]:
# Test residual sum of squares for Model1 (one-entry Series keyed by 'price').
rss_test1 = ((y_test - test_pred1) ** 2).sum()
rss_test1

price    2.255005e+14
dtype: float64

Model2

In [77]:
# Model2: the Model1 features plus the bedrooms * bathrooms interaction term.
model2_features = ['sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long', 'bed_bath_rooms']
X_train2, X_test2 = train_data[model2_features], test_data[model2_features]

In [78]:
# Sanity-check the Model2 design matrices.
print(X_train2.head(), X_test2.head(), sep='\n')

   sqft_living  bedrooms  bathrooms      lat     long  bed_bath_rooms
0       1180.0       3.0       1.00  47.5112 -122.257            3.00
1       2570.0       3.0       2.25  47.7210 -122.319            6.75
2        770.0       2.0       1.00  47.7379 -122.233            2.00
3       1960.0       4.0       3.00  47.5208 -122.393           12.00
4       1680.0       3.0       2.00  47.6168 -122.045            6.00
   sqft_living  bedrooms  bathrooms      lat     long  bed_bath_rooms
0       1430.0       3.0        1.0  47.7558 -122.229             3.0
1       2950.0       4.0        3.0  47.5714 -122.375            12.0
2       1710.0       3.0        2.0  47.3048 -122.218             6.0
3       2320.0       3.0        2.5  47.5391 -122.070             7.5
4       1090.0       3.0        1.0  47.6889 -122.375             3.0


Training and Predicting Using Model2

In [79]:
# Re-fit the shared estimator on the Model2 features.
model.fit(X_train2, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [80]:
# Coefficients follow the column order of X_train2:
# sqft_living, bedrooms, bathrooms, lat, long, bed_bath_rooms.
# With the interaction term included, the bathrooms coefficient (third entry)
# is negative for Model2.
model.coef_

array([[  3.06610053e+02,  -1.13446368e+05,  -7.14613083e+04,
          6.54844630e+05,  -2.94298969e+05,   2.55796520e+04]])

In [81]:
# Model2 predictions on the training features. `model` is shared across feature
# sets, so this is only valid while its latest fit used X_train2.
train_pred2 = model.predict(X_train2)

In [82]:
# Training residual sum of squares for Model2 (one-entry Series keyed by 'price').
rss_train2 = ((y_train - train_pred2) ** 2).sum()
rss_train2

price    9.584196e+14
dtype: float64

In [83]:
# Model2 predictions on the test features. `model` is shared across feature
# sets, so this is only valid while its latest fit used X_train2.
test_pred2 = model.predict(X_test2)

In [84]:
# Test residual sum of squares for Model2 (one-entry Series keyed by 'price').
rss_test2 = ((y_test - test_pred2) ** 2).sum()
rss_test2

price    2.233775e+14
dtype: float64

Model3

In [85]:
# Model3: the Model2 features plus the three remaining derived columns.
model3_features = [
    'sqft_living', 'bedrooms', 'bathrooms', 'lat', 'long',
    'bed_bath_rooms', 'bedrooms_squared', 'log_sqft_living', 'lat_plus_long',
]
X_train3, X_test3 = train_data[model3_features], test_data[model3_features]

In [86]:
# Sanity-check the Model3 design matrices.
print(X_train3.head(), X_test3.head(), sep='\n')

   sqft_living  bedrooms  bathrooms      lat     long  bed_bath_rooms  \
0       1180.0       3.0       1.00  47.5112 -122.257            3.00   
1       2570.0       3.0       2.25  47.7210 -122.319            6.75   
2        770.0       2.0       1.00  47.7379 -122.233            2.00   
3       1960.0       4.0       3.00  47.5208 -122.393           12.00   
4       1680.0       3.0       2.00  47.6168 -122.045            6.00   

   bedrooms_squared  log_sqft_living  lat_plus_long  
0               9.0         7.073270       -74.7458  
1               9.0         7.851661       -74.5980  
2               4.0         6.646391       -74.4951  
3              16.0         7.580700       -74.8722  
4               9.0         7.426549       -74.4282  
   sqft_living  bedrooms  bathrooms      lat     long  bed_bath_rooms  \
0       1430.0       3.0        1.0  47.7558 -122.229             3.0   
1       2950.0       4.0        3.0  47.5714 -122.375            12.0   
2       1710.0    

Training and Predicting Using Model3

In [87]:
# Re-fit the shared estimator on the Model3 features.
model.fit(X_train3, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [88]:
# Model3 predictions on the training features. `model` is shared across feature
# sets, so this is only valid while its latest fit used X_train3.
train_pred3 = model.predict(X_train3)

In [89]:
# Training residual sum of squares for Model3 (one-entry Series keyed by 'price').
rss_train3 = ((y_train - train_pred3) ** 2).sum()
rss_train3

price    9.034365e+14
dtype: float64

In [91]:
# Model3 predictions on the test features. `model` is shared across feature
# sets, so this is only valid while its latest fit used X_train3.
test_pred3 = model.predict(X_test3)

In [92]:
# Test residual sum of squares for Model3 (one-entry Series keyed by 'price').
rss_test3 = ((y_test - test_pred3) ** 2).sum()
rss_test3

price    2.592363e+14
dtype: float64