# Import

In [67]:
import pandas as pd
import numpy as np
import seaborn as sns

# Modeling imports
from sklearn.linear_model import LinearRegression, RANSACRegressor, HuberRegressor,TheilSenRegressor
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score, root_mean_squared_error

In [69]:
train_df = pd.read_csv('../Data/trian_cleaned.csv')
test_df = pd.read_json('../Data/test.json')

# Preprocessing

In [108]:
# Filling null value with medium value of amount of bedrooms, baths and floor_level by property type
def replace_bedrooms(df):
    if not np.isnan(df['bedrooms']):
        return df['bedrooms']
    if df['property_type'] == 'Condo':
        return 1
    if df['property_type'] == 'Townhouse':
        return 3
    if df['property_type'] == 'Detached House':
        return 3

def replace_baths(df):
    if not np.isnan(df['baths']):
        return df['baths']
    if df['property_type'] == 'Condo':
        return 1
    if df['property_type'] == 'Townhouse':
        return 3
    if df['property_type'] == 'Detached House':
        return 3

def replace_floor(df):
    if not np.isnan(df['floor_level']):
        return df['floor_level']
    if df['property_type'] == 'Condo':
        return 18
    if df['property_type'] == 'Townhouse':
        return 2
    if df['property_type'] == 'Detached House':
        return 2

In [110]:
test_df['baths'] = test_df.apply(replace_baths,axis=1)
test_df['bedrooms'] = test_df.apply(replace_bedrooms,axis=1)
test_df['floor_level'] = test_df.apply(replace_floor,axis=1)

# Modeling

In [113]:
train_df.columns

Index(['id', 'province', 'district', 'subdistrict', 'address', 'property_type',
       'total_units', 'bedrooms', 'baths', 'floor_area', 'floor_level',
       'land_area', 'latitude', 'longitude', 'nearby_stations',
       'nearby_station_distance', 'nearby_bus_stops', 'nearby_supermarkets',
       'nearby_shops', 'year_built', 'month_built', 'facilities', 'price'],
      dtype='object')

In [115]:
train_df.shape

(14271, 23)

In [117]:
X = pd.get_dummies(train_df, columns=['province','property_type','district','nearby_stations'], drop_first=True)
y = train_df['price']

# exclude unneed column
used_features = ['bedrooms','baths','floor_level','district','latitude', 'longitude','floor_area']
features = [x for x in X.columns if x in used_features or 'property_type' in x or 'province_' in x or 'district_' in x or 'nearby_stations' in x]
y = X[(~X['bedrooms'].isnull()) & (~X['baths'].isnull())]['price']
X = X[(~X['bedrooms'].isnull()) & (~X['baths'].isnull())][features]

X_train, X_dev, y_train, y_dev = train_test_split(X, y, train_size=0.8, random_state=42)


In [119]:
model = LinearRegression()
model.fit(X_train, y_train)


pred_train = model.predict(X_train)
pred_dev = model.predict(X_dev)

print("R2 Score of train :",  r2_score(y_train, pred_train))
print("R2 Score of dev   :",  r2_score(y_dev, pred_dev))
print()
print("RMSE of train :", root_mean_squared_error(y_train, pred_train))
print("RMSE of test  :", root_mean_squared_error(y_dev, pred_dev))

ValueError: Input X contains NaN.
LinearRegression does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values

# How to submit

- preprocess test data in the same way as train data
- after that, predict `price` and create `submission.csv`
- `submission.csv` must have 2 columns: `id` and `price`
- **NEVER** drop any row in the test data

In [None]:
# preprocessing
test_df['bedrooms'] = test_df.apply(replace_bedrooms,axis=1)
test_df['baths'] = test_df.apply(replace_baths,axis=1)
test_df['floor_level'] = test_df.apply(replace_floor,axis=1)

X_test = pd.get_dummies(test_df, columns=['province','property_type','district','nearby_stations'], drop_first=True)
used_features = ['bedrooms','baths','floor_level','district','latitude', 'longitude','floor_area']
features = [x for x in X.columns if x in used_features or 'property_type' in x or 'province_' in x or 'district_' in x or 'nearby_stations' in x]
X_test = X_test[features]


In [85]:
# prediction
pred_test = model.predict(X_test)
pred_test[:10]

NameError: name 'X_test' is not defined

In [None]:
# save as csv file
test_df['price'] = pred_test
test_df[['id','price']].to_csv('submission.csv', index=False)