# Import

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns

# Modeling imports
from sklearn.linear_model import LinearRegression, RANSACRegressor, HuberRegressor,TheilSenRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error
from sklearn.tree import DecisionTreeRegressor
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder

In [3]:
train_df = pd.read_csv('../Data/trian_cleaned.csv')
test_df = pd.read_json('../Data/test.json')

# Preprocessing

In [5]:
# Filling null value with medium value of amount of bedrooms, baths and floor_level by property type
def replace_bedrooms(df):
    if not np.isnan(df['bedrooms']):
        return df['bedrooms']
    if df['property_type'] == 'Condo':
        return 1
    if df['property_type'] == 'Townhouse':
        return 3
    if df['property_type'] == 'Detached House':
        return 3

def replace_baths(df):
    if not np.isnan(df['baths']):
        return df['baths']
    if df['property_type'] == 'Condo':
        return 1
    if df['property_type'] == 'Townhouse':
        return 3
    if df['property_type'] == 'Detached House':
        return 3

def replace_floor(df):
    if not np.isnan(df['floor_level']):
        return df['floor_level']
    if df['property_type'] == 'Condo':
        return 18
    if df['property_type'] == 'Townhouse':
        return 2
    if df['property_type'] == 'Detached House':
        return 2

train_df['baths'] = train_df.apply(replace_baths,axis=1)
train_df['bedrooms'] = train_df.apply(replace_bedrooms,axis=1)
train_df['floor_level'] = train_df.apply(replace_floor,axis=1)

In [6]:
train_df.head()

Unnamed: 0,id,province,district,subdistrict,address,property_type,total_units,bedrooms,baths,floor_area,...,nearby_stations,nearby_station_distance,nearby_bus_stops,nearby_supermarkets,nearby_shops,year_built,month_built,facilities,price,price_sqm
0,8448321,Bangkok,Watthana,Phra Khanong Nuea,"36 Soi Sukhumvit 63, Ekamai Road",Condo,273.0,2.0,2.0,66,...,2,"[['E7 Ekkamai BTS', 270], ['E6 Thong Lo BTS', ...",,16.0,20,2011,June,"['Car Park', 'Community Garden', 'CCTV', 'Fitn...",8500000,128787.878788
1,10936325,Bangkok,Watthana,Khlong Toei Nuea,31 สุขุมวิท,Condo,74.0,1.0,1.0,49,...,3,"[['BL22 Sukhumvit MRT', 720], ['BL21 Phetchabu...",,11.0,20,2012,September,"['CCTV', 'Fitness corner', '24 hours security'...",5900000,120408.163265
2,10927931,Bangkok,Khlong Toei,Khlong Tan,"68 Sukhumvit 24 Alley, Khong Tan",Condo,940.0,1.0,1.0,34,...,2,"[['E5 Phrom Phong BTS', 650], ['BL23 Queen Sir...",,20.0,20,2017,January,"['Car Park', 'Clubhouse', 'Community Garden', ...",6290000,185000.0
3,11004792,Nonthaburi,Bang Kruai,Bang Khun Kong,Bang Khun Kong,Detached House,,3.0,3.0,170,...,0,,,2.0,4,0,,"['Covered car park', 'Playground', '24 hours s...",8900000,
4,10757452,Nonthaburi,Mueang Nonthaburi,Bang Phai,พระราม5-นครอินทร์,Townhouse,,3.0,2.0,120,...,1,"[['PP09 Yaek Nonthaburi 1 MRT', 10]]",,6.0,15,0,,"['Covered car park', '24 hours security']",2390000,


In [7]:
# Dummies
train_df = pd.get_dummies(train_df, columns=['province','property_type','district'
                                             ,'nearby_stations','address'], drop_first=True)

# List of train new column names
train_new_columns = list(set(test_df.columns) - set(train_df.columns))
# Add the new columns with 'False' as default values
for col in train_new_columns:
    train_df[col] = False

# Drop unused rows
train_df = train_df.drop(columns=['subdistrict','nearby_station_distance', 
                          'nearby_bus_stops', 'nearby_supermarkets','nearby_shops', 'year_built', 
                          'month_built','total_units','facilities','price_sqm','land_area'])

# Poly six columns
poly_cols = train_df[['bedrooms', 'baths', 'floor_area', 'floor_level','latitude','longitude']]
poly = PolynomialFeatures(include_bias=False)
X_poly = poly.fit_transform(poly_cols)
pd.concat([train_df,pd.DataFrame(X_poly)],axis=1)



Unnamed: 0,id,bedrooms,baths,floor_area,floor_level,latitude,longitude,price,province_Nonthaburi,province_Samut Prakan,...,17,18,19,20,21,22,23,24,25,26
0,8448321,2.0,2.0,66,10.0,13.721944,100.584850,8500000,False,False,...,4356.0,660.0,905.648331,6638.600119,100.0,137.219444,1005.848503,188.291758,1380.219724,10117.312108
1,10936325,1.0,1.0,49,8.0,13.741904,100.566949,5900000,False,False,...,2401.0,392.0,673.353296,4927.780501,64.0,109.935232,804.535592,188.839926,1381.981359,10113.711231
2,10927931,1.0,1.0,34,4.0,13.725395,100.565660,6290000,False,False,...,1156.0,136.0,466.663440,3419.232425,16.0,54.901581,402.262638,188.386476,1380.303429,10113.451882
3,11004792,3.0,3.0,170,2.0,13.821687,100.428438,8900000,True,False,...,28900.0,340.0,2349.686875,17072.834375,4.0,27.643375,200.856875,191.039045,1388.090479,10085.871059
4,10757452,3.0,2.0,120,2.0,13.865849,100.494129,2390000,True,False,...,14400.0,240.0,1663.901904,12059.295420,4.0,27.731698,200.988257,192.261774,1393.436431,10099.069863
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
14085,10738911,2.0,1.0,58,18.0,13.862024,100.504931,4000000,True,False,...,3364.0,1044.0,803.997392,5829.285998,324.0,249.516432,1809.088758,192.155709,1393.201766,10101.241155
14086,11031178,3.0,2.0,85,1.0,13.934188,100.359562,1790000,True,False,...,7225.0,85.0,1184.405937,8530.562813,1.0,13.934188,100.359562,194.161581,1398.428961,10072.041785
14087,10945909,3.0,3.0,170,2.0,13.791162,100.712196,4550000,False,False,...,28900.0,340.0,2344.497540,17121.073320,4.0,27.582324,201.424392,190.196149,1388.938210,10142.946423
14088,10768246,2.0,2.0,150,2.0,13.917938,100.573063,1950000,False,False,...,22500.0,300.0,2087.690625,15085.959375,4.0,27.835875,201.146125,193.708984,1399.769598,10114.940901


# Modeling

In [9]:
X = train_df.drop(columns= 'price')
y = train_df['price']

In [10]:
X_train, X_dev, y_train, y_dev = train_test_split(X,y,random_state = 42)

In [11]:
lr = LinearRegression()
lr.fit(X_train, y_train)
pred_train = lr.predict(X_train)
pred_dev = lr.predict(X_dev)

In [12]:
print("R2 Score of train :",  r2_score(y_train, pred_train))
print("R2 Score of dev   :",  r2_score(y_dev, pred_dev))

print("RMSE of train :", root_mean_squared_error(y_train, pred_train))
print("RMSE of test  :", root_mean_squared_error(y_dev, pred_dev))

R2 Score of train : 0.8397914066051332
R2 Score of dev   : 0.7497555179346485
RMSE of train : 869685.6531088555
RMSE of test  : 1098341.6073382136


# Preprocessing

In [14]:
test_df['baths'] = test_df.apply(replace_baths,axis=1)
test_df['bedrooms'] = test_df.apply(replace_bedrooms,axis=1)
test_df['floor_level'] = test_df.apply(replace_floor,axis=1)

In [15]:
# Dummies
test_df = pd.get_dummies(test_df, columns=['province','property_type','district',
                                             'nearby_stations','address'], drop_first=True)

# List of test new column names
test_new_columns = list(set(train_df.columns) - set(test_df.columns))
# Add the new columns with 'False' as default values
for col in test_new_columns:
    test_df[col] = False

  test_df[col] = False
  test_df[col] = False
  test_df[col] = False
  test_df[col] = False
  test_df[col] = False
  test_df[col] = False
  test_df[col] = False
  test_df[col] = False
  test_df[col] = False
  test_df[col] = False
  test_df[col] = False
  test_df[col] = False
  test_df[col] = False
  test_df[col] = False
  test_df[col] = False
  test_df[col] = False
  test_df[col] = False
  test_df[col] = False
  test_df[col] = False
  test_df[col] = False
  test_df[col] = False
  test_df[col] = False
  test_df[col] = False
  test_df[col] = False
  test_df[col] = False
  test_df[col] = False
  test_df[col] = False
  test_df[col] = False
  test_df[col] = False
  test_df[col] = False
  test_df[col] = False
  test_df[col] = False
  test_df[col] = False
  test_df[col] = False
  test_df[col] = False
  test_df[col] = False
  test_df[col] = False
  test_df[col] = False
  test_df[col] = False
  test_df[col] = False
  test_df[col] = False
  test_df[col] = False
  test_df[col] = False
  test_df[c

In [16]:
# Drop unused rows
test_df = test_df.drop(columns=['subdistrict','nearby_station_distance', 
                          'nearby_bus_stops', 'nearby_supermarkets','nearby_shops', 'year_built', 
                          'month_built','total_units','facilities','land_area'])

# Poly six columns
poly_cols = test_df[['bedrooms', 'baths', 'floor_area', 'floor_level','latitude','longitude']]
poly = PolynomialFeatures(include_bias=False)
X_poly = poly.fit_transform(poly_cols)
pd.concat([test_df,pd.DataFrame(X_poly)],axis=1)

Unnamed: 0,id,bedrooms,baths,floor_area,floor_level,latitude,longitude,province_Nonthaburi,province_Samut Prakan,property_type_Detached House,...,17,18,19,20,21,22,23,24,25,26
0,10317868,4.0,3.0,120,2.0,13.614196,100.680610,False,True,True,...,14400.0,240.0,1633.703520,12081.673200,4.0,27.228392,201.361220,185.346333,1370.685558,10136.585230
1,10885829,4.0,3.0,188,2.0,13.702222,100.341833,False,False,True,...,35344.0,376.0,2576.017736,18864.264604,4.0,27.404444,200.683666,187.750888,1374.906072,10068.483450
2,10765951,1.0,1.0,22,23.0,13.755713,100.566385,False,False,False,...,484.0,506.0,302.625694,2212.460475,529.0,316.381407,2313.026861,189.219650,1383.362367,10113.597842
3,10003549,1.0,1.0,41,5.0,13.724295,100.577202,False,False,False,...,1681.0,205.0,562.696080,4123.665294,25.0,68.621473,502.886011,188.356263,1380.351158,10115.773620
4,10663026,1.0,1.0,29,25.0,13.855437,100.547923,False,False,False,...,841.0,725.0,401.807676,2915.889759,625.0,346.385928,2513.698068,191.973137,1393.135419,10109.884766
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2495,10889363,1.0,1.0,33,12.0,13.733237,100.526269,False,False,False,...,1089.0,396.0,453.196821,3317.366877,144.0,164.798844,1206.315228,188.601798,1380.551077,10105.530759
2496,10975839,1.0,1.0,65,33.0,13.719377,100.531092,False,False,False,...,4225.0,2145.0,891.759474,6534.521011,1089.0,452.739425,3317.526052,188.221292,1379.223910,10106.500554
2497,10986832,1.0,1.0,42,10.0,13.772913,100.492732,False,False,False,...,1764.0,420.0,578.462360,4220.694726,100.0,137.729133,1004.927316,189.693142,1384.077682,10098.789098
2498,10687627,1.0,1.0,28,8.0,13.653037,100.592954,False,True,False,...,784.0,224.0,382.285026,2816.602705,64.0,109.224293,804.743630,186.405410,1373.399284,10118.942341


In [17]:
# prediction
pred_test = lr.predict(test_df)
pred_test[:10]

ValueError: The feature names should match those that were passed during fit.
Feature names unseen at fit time:
- address_1 ทวีวัฒนา กาญจนาภิเษก
- address_1/15 พหลโยธิน
- address_103/2 สุขุมวิท
- address_1035/1-2 Ratchadaphisek Soi 17, Ratchadaphisek Road
- address_107 Sukhuvit 107
- ...


In [None]:
# save as csv file
test_df[['id','price']].to_csv('submission.csv', index=False)