In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Show every column when a DataFrame is displayed instead of truncating wide frames.
pd.set_option('display.max_columns', None)

In [None]:
# Auxiliary location tables (paths are relative to the notebook's working directory).
# Their "lat"/"lng" columns are used later to compute nearest-amenity distances.
mrt_stations = pd.read_csv("auxiliary-data/sg-mrt-stations.csv")
primary_schools = pd.read_csv("auxiliary-data/sg-primary-schools.csv")
shopping_malls = pd.read_csv("auxiliary-data/sg-shopping-malls.csv")

In [None]:
# Raw listings: df_train carries the "price" target, df_test is the set to predict.
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")
# Preview the first training rows.
df_train.head()

### EDA

In [None]:
# Remove training listings whose price is 0 (the same filter is applied again in the
# training clean-up cell further down).
df_train = df_train[df_train["price"] != 0]

In [None]:
# Fill na with the nearest data or data from the same subzone
def min_dist_feature_in_same_subzone (fill_in_feature, subzone, lat, lng, df):
    """Return ``fill_in_feature`` of the record in ``df`` nearest to (lat, lng).

    Only records of the given ``subzone`` are considered. If ``df`` has no
    record in that subzone (including when ``subzone`` is NaN), the search
    falls back to every record in ``df``.

    Parameters
    ----------
    fill_in_feature : str
        Column whose value is returned.
    subzone : object
        Subzone of the record being filled.
    lat, lng : float
        Coordinates of the record being filled.
    df : pandas.DataFrame
        Donor records; must have "subzone", "lat", "lng" and ``fill_in_feature``.

    Returns
    -------
    The ``fill_in_feature`` value of the closest donor (plain Euclidean
    distance on the raw lat/lng values).
    """
    candidates = df[df['subzone'] == subzone]
    if candidates.empty:
        # No donor shares the subzone: use the nearest record overall.
        candidates = df
    # Reset the index so that position and label agree for the argmin lookup.
    candidates = candidates.reset_index(drop=True)
    distance = np.sqrt((candidates["lat"]-lat)**2+(candidates["lng"]-lng)**2)
    return candidates.iloc[np.argmin(distance)][fill_in_feature]

def fill_NA_with_nearest_record(df, empty_feature):
    """Return the rows of ``df`` missing ``empty_feature``, with that column filled.

    Each missing value is taken from the nearest row that has a value, as chosen
    by ``min_dist_feature_in_same_subzone``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must have "subzone", "lat", "lng" and ``empty_feature`` columns.
    empty_feature : str
        Column to fill.

    Returns
    -------
    pandas.DataFrame
        A copy of the rows that were missing ``empty_feature`` (original index
        kept), now filled. Empty when nothing was missing. ``df`` itself is not
        modified.
    """
    is_missing = df[empty_feature].isna()
    # Copy so the assignment below never writes into a view of the caller's frame.
    df_empty = df[is_missing].copy()
    if df_empty.empty:
        # Nothing to fill; a row-wise apply on an empty frame would not return a column.
        return df_empty
    # The donor set is the same for every row, so build it once.
    donors = df[~is_missing]
    df_empty[empty_feature] = df_empty.apply(lambda row: min_dist_feature_in_same_subzone(empty_feature, row['subzone'], row['lat'], row['lng'], donors), axis=1)
    return df_empty

In [None]:
# property_type: lower-case every value so differently-cased spellings collapse
df_test['property_type'] = df_test['property_type'].str.lower()

# property_type: (hdb 2 rooms, hdb 3 rooms, hdb 4 rooms, hdb 5 rooms) -> hdb
temp = df_test['property_type'].str.startswith(('hdb 2', 'hdb 3', 'hdb 4', 'hdb 5'))
df_test['property_type'] = np.where((temp == True), 'hdb', df_test['property_type'])

# tenure: give HDB rows without a tenure the tenure value seen on the other HDB rows
hdb_tenure = df_test[(df_test['property_type'].str.startswith('hdb')) & df_test['tenure'].notna() ]['tenure'].unique()[0]
df_test['tenure'] = np.where((df_test['property_type'].str.startswith('hdb')) & (df_test['tenure'].isna()), hdb_tenure, df_test['tenure'])

# tenure / built_year: fill ONLY the missing values, first from a row with the same
# property_name, then from a row with the same address, and finally from the nearest
# record in the same subzone.
# fillna() is used on purpose: assigning transform('first') directly would overwrite
# values that are already known, and would blank rows whose grouping key is NaN
# (groupby drops NaN keys).
for feature in ('tenure', 'built_year'):
    for key in ('property_name', 'address'):
        first_in_group = df_test.groupby(key)[feature].transform('first')
        df_test[feature] = df_test[feature].fillna(first_in_group)

    if df_test[feature].isna().any():
        nearest_filled = fill_NA_with_nearest_record(df_test, feature)
        # nearest_filled keeps df_test's index, so this writes back row by row.
        df_test.loc[nearest_filled.index, feature] = nearest_filled[feature]

df_test.head()

In [None]:
# property_type: lower-case every value (same normalisation as applied to df_test)
df_train['property_type'] = df_train['property_type'].str.lower()

# property_type: (hdb 2 rooms, hdb 3 rooms, hdb 4 rooms, hdb 5 rooms) -> hdb
# `temp` is NaN where property_type is missing; `temp == True` treats those rows as False.
temp = df_train['property_type'].str.startswith(('hdb 2', 'hdb 3', 'hdb 4', 'hdb 5'))
df_train['property_type'] = np.where((temp == True), 'hdb', df_train['property_type'])

In [None]:
# num_beds: listings whose title starts with "studio " and have no num_beds get 1 bed.
# NOTE(review): the match is case-sensitive and title is not lower-cased in this
# notebook - confirm that titles are already lower-case in the raw data.
df_train['num_beds'] = np.where((df_train['num_beds'].isna() & df_train['title'].str.startswith('studio ')), 
                                1, df_train['num_beds'])
df_test['num_beds'] = np.where((df_test['num_beds'].isna() & df_test['title'].str.startswith('studio ')), 
                                1, df_test['num_beds'])
# Display the test rows that still have no num_beds.
df_test[df_test['num_beds'].isna()]

In [None]:
# price: delete rows with price value 0
df_train = df_train[df_train['price'] != 0]

# subzone & planning_area: delete rows with subzone and planning_area values NaN
# (.copy() so the column assignments below act on an independent frame)
df_train = df_train[(df_train['subzone'].notna() & df_train['planning_area'].notna())].copy()

# tenure: give HDB rows without a tenure the tenure value seen on the other HDB rows
hdb_tenure = df_train[(df_train['property_type'].str.startswith('hdb')) & df_train['tenure'].notna() ]['tenure'].unique()[0]
df_train['tenure'] = np.where((df_train['property_type'].str.startswith('hdb')) & (df_train['tenure'].isna()), hdb_tenure, df_train['tenure'])

# tenure: fill ONLY missing values from a row with the same property_name.
# fillna() keeps values that are already known; assigning transform('first') directly
# would blank every row whose property_name is NaN (groupby drops NaN keys) and those
# rows would then be dropped below.
df_train['tenure'] = df_train['tenure'].fillna(df_train.groupby('property_name')['tenure'].transform('first'))

# tenure: delete rows with tenure values NaN as no useful records can be used to fill in NaN values
df_train = df_train[df_train['tenure'].notna()].copy()

# built_year: fill ONLY missing values from a row with the same property_name, otherwise drop
df_train['built_year'] = df_train['built_year'].fillna(df_train.groupby('property_name')['built_year'].transform('first'))

df_train = df_train[df_train['built_year'].notna()].copy()

# lease_end_year: built_year + lease length parsed from the tenure text; 9999 for freehold.
# Freehold has no digits, so its parsed lease length is NaN and is replaced by the sentinel.
lease_years = pd.to_numeric(df_train['tenure'].str.extract(r'(\d+)', expand=False), errors='coerce')
df_train['lease_end_year'] = np.where((df_train['tenure'] == 'freehold'), 9999, lease_years + df_train['built_year'])

In [None]:
# Test rows whose subzone is missing, or never appears in the training set, take the
# subzone of the geographically nearest training record.
known_subzones = set(df_train["subzone"].dropna())
needs_subzone = df_test["subzone"].isna() | ~df_test["subzone"].isin(known_subzones)

# Work on plain arrays: np.argmin returns a POSITION, and df_train's index labels no
# longer match positions after the earlier row filters, so a label lookup with .loc
# would pick the wrong row (or raise KeyError).
train_lat = df_train["lat"].to_numpy()
train_lng = df_train["lng"].to_numpy()
train_subzone = df_train["subzone"].to_numpy()

for i in df_test.index[needs_subzone]:
    # Squared distance is enough for an argmin; NaN coordinates in train are skipped.
    squared_dist = (train_lat - df_test.at[i, "lat"])**2 + (train_lng - df_test.at[i, "lng"])**2
    df_test.at[i, "subzone"] = train_subzone[np.nanargmin(squared_dist)]

In [None]:
# Delete listings whose bedroom/bathroom ratio is an outlier (DBSCAN noise points)
from sklearn.cluster import DBSCAN

# Remove NaN in num_beds and num_baths, then renumber the rows 0..n-1
df_train = df_train.dropna(subset=['num_beds', 'num_baths']).reset_index(drop=True)

# DBSCAN using beds to baths ratio
df_train['beds_to_baths'] = df_train['num_beds'] / df_train['num_baths']
sk_clustering_iris = DBSCAN(eps=0.5, min_samples=5).fit(df_train[['beds_to_baths']])
# labels_ has one entry per fitted row; DBSCAN marks noise with a negative label.
# Filtering with one boolean mask removes exactly those rows. Dropping them one at a
# time and resetting the index after each drop shifts the remaining positions, so
# later drops would hit the wrong rows.
df_train = df_train[sk_clustering_iris.labels_ >= 0].reset_index(drop=True)

# DBSCAN using baths to beds ratio
df_train['baths_to_beds'] = df_train['num_baths'] / df_train['num_beds']
sk_clustering_iris = DBSCAN(eps=0.5, min_samples=5).fit(df_train[['baths_to_beds']])
df_train = df_train[sk_clustering_iris.labels_ >= 0].reset_index(drop=True)

In [None]:
# Delete listings whose floor area per room is an outlier (DBSCAN noise points).
# Relies on DBSCAN having been imported by the bed/bath outlier cell above.

# DBSCAN using size to rooms ratio
df_train['sqft_to_rooms'] = df_train['size_sqft'] / (df_train['num_beds'] + df_train['num_baths'])
sk_clustering_iris = DBSCAN(eps=50, min_samples=5).fit(df_train[['sqft_to_rooms']])
# labels_ has one entry per fitted row; DBSCAN marks noise with a negative label.
# One boolean mask removes exactly those rows; dropping and re-indexing row by row
# would shift positions and remove the wrong rows.
df_train = df_train[sk_clustering_iris.labels_ >= 0].reset_index(drop=True)

# DBSCAN using rooms to size ratio
df_train['rooms_to_sqft'] = (df_train['num_beds'] + df_train['num_baths']) / df_train['size_sqft']
sk_clustering_iris = DBSCAN(eps=0.0005, min_samples=5).fit(df_train[['rooms_to_sqft']])
df_train = df_train[sk_clustering_iris.labels_ >= 0].reset_index(drop=True)

# The ratio columns were only needed for outlier detection.
df_train = df_train.drop(columns=['rooms_to_sqft', 'sqft_to_rooms', 'baths_to_beds', 'beds_to_baths'])

In [None]:
# Helper feature price_per_sqft = price / size_sqft, used below to detect unreasonable
# prices (it is dropped again before modelling).
df_train["price_per_sqft"] = df_train["price"]/df_train["size_sqft"]

In [None]:
import seaborn as sns
# Summary statistics and box plot of price_per_sqft before outlier removal.
print(df_train["price_per_sqft"].describe())
sns.boxplot(y = df_train["price_per_sqft"])

In [None]:
# Iterative 3-sigma filter on price_per_sqft: recompute mean/std on the remaining rows
# and drop everything outside (mean - 3*std, mean + 3*std) until no row falls outside.
while True:
    mean = np.mean(df_train["price_per_sqft"])
    std = np.std(df_train["price_per_sqft"])
    high = mean + 3*std
    low = mean - 3*std
    # Stop once every remaining value lies strictly inside the band.
    if ((df_train["price_per_sqft"]>low).all() and (df_train["price_per_sqft"]<high).all()) == True:
        break
    else:
        df_train = df_train[df_train["price_per_sqft"] > low]
        df_train = df_train[df_train["price_per_sqft"] < high]

In [None]:
# Same summary and box plot after the 3-sigma filter, for comparison.
print(df_train["price_per_sqft"].describe())
sns.boxplot(y = df_train["price_per_sqft"])

In [None]:
# Find a lower boundary for price_per_sqft: plot its distribution for all rows and for
# the rows below 1000 and below 400.
# NOTE(review): sns.distplot is deprecated in recent seaborn releases; sns.histplot /
# sns.displot are the documented replacements - confirm the installed version.
fig, ax =plt.subplots(1,3,constrained_layout=True, figsize=(12, 3))
s1=sns.distplot(df_train["price_per_sqft"], ax=ax[0])
s1.set_title("all data")
s2=sns.distplot(df_train[df_train["price_per_sqft"]<1000]["price_per_sqft"], ax=ax[1])
s2.set_title("<1000")
s3=sns.distplot(df_train[df_train["price_per_sqft"]<400]["price_per_sqft"], ax=ax[2])
s3.set_title("<400")

In [None]:
# Unreasonably cheap listings remain after the 3-sigma filter; keep only rows with
# price_per_sqft above 300 (threshold chosen from the distribution plots above).
df_train = df_train[df_train["price_per_sqft"] > 300]

In [None]:
# Create new feature num_rooms = bedrooms + bathrooms
df_train["num_rooms"] = df_train["num_beds"] + df_train["num_baths"]
df_test["num_rooms"] = df_test["num_beds"] + df_test["num_baths"]

# Fill missing num_rooms in the test set with a linear fit num_rooms ~ size_sqft
# learned on the training rows, rounded down to a whole number of rooms.
from sklearn.linear_model import LinearRegression
from math import floor
train_has_rooms = df_train["num_rooms"].notnull()
regressor0 = LinearRegression()
regressor0 = regressor0.fit(np.array(df_train.loc[train_has_rooms, "size_sqft"]).reshape(-1, 1), np.array(df_train.loc[train_has_rooms, "num_rooms"]).reshape(-1, 1))

# Predict all missing rows in one call and write them back by column name, instead of
# relying on num_rooms being the last column and on floor() of a 2-D array.
test_missing_rooms = df_test["num_rooms"].isnull()
if test_missing_rooms.any():
    predicted_rooms = regressor0.predict(np.array(df_test.loc[test_missing_rooms, "size_sqft"]).reshape(-1, 1))
    df_test.loc[test_missing_rooms, "num_rooms"] = np.floor(predicted_rooms.ravel())

In [None]:
# Renumber rows 0..n-1 after all the row filtering; later cells look rows up by
# integer label (e.g. train["lat"][i]).
df_train = df_train.reset_index(drop = True)
df_test = df_test.reset_index(drop = True)

### Feature Engineering

In [None]:
import copy
# Independent copies for feature engineering, so df_train / df_test keep the cleaned
# but un-encoded data (df_test and df_train are read again after the XGBoost model).
train = copy.deepcopy(df_train)
test = copy.deepcopy(df_test)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns
# Mean price per property_type, sorted ascending, gives the bar order of the right-hand
# plot. Only the price column is averaged: a mean over every column fails on the text
# columns in pandas >= 2.0.
temp = train.groupby("property_type")[["price"]].mean()
temp = temp.sort_values(by="price",ascending=True)
ax_order = temp.index.tolist()
fig, ax =plt.subplots(1,2,constrained_layout=True, figsize=(12, 5))
# Left: default category order; right: ordered by mean price.
s1 = sns.barplot(y="price", x="property_type", data=train, ax=ax[0])
s1.set_xticklabels(s1.get_xticklabels(),rotation = 80)
s2 = sns.barplot(y="price", x="property_type", data=train, order=ax_order, ax=ax[1])
s2.set_xticklabels(s2.get_xticklabels(),rotation = 80)

In [None]:
# Encoding property_type into four ordinal price tiers, based on the visualization above.
# The mapping is applied to the property_type column only (a whole-frame replace could
# also rewrite equal strings in other columns), and the same mapping is used for train
# and test so both sets are encoded consistently.
property_type_tiers = {
    "hdb": 0, "hdb executive": 0, "walk-up": 0, "executive condo": 0, "shophouse": 0,
    "condo": 1, "apartment": 1, "landed": 1, "terraced house": 1, "cluster house": 1,
    "townhouse": 2, "corner terrace": 2, "good class bungalow": 2, "semi-detached house": 2,
    "bungalow": 3, "conservation house": 3,
}
train["property_type"] = train["property_type"].replace(property_type_tiers)
test["property_type"] = test["property_type"].replace(property_type_tiers)

In [None]:
# Encoding the tenure: 0 = lease of roughly 100 years, 1 = lease of roughly 1000 years,
# 2 = freehold. The mapping is applied to the tenure column only (a whole-frame replace
# could also rewrite equal strings in other columns).
tenure_codes = {
    "99-year leasehold": 0, "110-year leasehold": 0, "103-year leasehold": 0,
    "102-year leasehold": 0, "100-year leasehold": 0,
    "999-year leasehold": 1, "946-year leasehold": 1, "956-year leasehold": 1,
    "929-year leasehold": 1, "947-year leasehold": 1,
    "freehold": 2,
}
train["tenure"] = train["tenure"].replace(tenure_codes)
test["tenure"] = test["tenure"].replace(tenure_codes)

In [None]:
# Encoding the built_year into discrete construction periods:
#   0: up to 1970
#   1: 1971-1990
#   2: 1991-2005
#   3: 2006-2020
#   4: 2021 onwards
# The column is addressed by name rather than by position 6, and the encoding is applied
# to `test` (the frame the models use), not to `df_test`.
built_year_edges = [-np.inf, 1970, 1990, 2005, 2020, np.inf]
# Intervals are right-closed, so 1970 -> 0 and 1971 -> 1; labels=False returns the bin
# number. Missing years stay NaN.
train["built_year"] = pd.cut(train["built_year"], bins=built_year_edges, labels=False).astype(float)
test["built_year"] = pd.cut(test["built_year"], bins=built_year_edges, labels=False).astype(float)

In [None]:
# Distance helper for the auxiliary location tables.
def min_dist(lat, lng, df):
    """Return the smallest straight-line distance from (lat, lng) to any row of ``df``.

    ``df`` must have "lat" and "lng" columns. The distance is the Euclidean norm of the
    raw coordinate differences.
    """
    lat_offset = df["lat"] - lat
    lng_offset = df["lng"] - lng
    distances = np.sqrt(lat_offset**2 + lng_offset**2)
    return min(distances)

In [None]:
# For every training row, find the distance to the nearest MRT station, primary school
# and shopping mall. train["lat"][i] is a lookup by integer label, so this relies on
# the index having been reset to 0..n-1 earlier.
min_dist_mrt = []
min_dist_pri = []
min_dist_mall = []
for i in range(train.shape[0]):
    min_dist_mrt.append(min_dist(train["lat"][i],train["lng"][i],mrt_stations))
    min_dist_pri.append(min_dist(train["lat"][i],train["lng"][i],primary_schools))
    min_dist_mall.append(min_dist(train["lat"][i],train["lng"][i],shopping_malls))

In [None]:
# Min-max normalise the three training distance lists to [0, 1] and store them as
# feature columns.
train["dist_mrt"] = (np.array(min_dist_mrt)-min(min_dist_mrt))/(max(min_dist_mrt)-min(min_dist_mrt))
train["dist_pri"] = (np.array(min_dist_pri)-min(min_dist_pri))/(max(min_dist_pri)-min(min_dist_pri))
train["dist_mall"] = (np.array(min_dist_mall)-min(min_dist_mall))/(max(min_dist_mall)-min(min_dist_mall))

In [None]:
# Same nearest-amenity distances for every test row. The three lists are re-created
# here, so the training distances computed above are no longer available afterwards.
min_dist_mrt = []
min_dist_pri = []
min_dist_mall = []
for i in range(test.shape[0]):
    min_dist_mrt.append(min_dist(test["lat"][i],test["lng"][i],mrt_stations))
    min_dist_pri.append(min_dist(test["lat"][i],test["lng"][i],primary_schools))
    min_dist_mall.append(min_dist(test["lat"][i],test["lng"][i],shopping_malls))

In [None]:
# Min-max normalise the test distances to [0, 1].
# NOTE(review): the min/max come from the test distances themselves, not from the
# training distances, so the same raw distance maps to different feature values in
# train and test - consider reusing the training min/max.
test["dist_mrt"] = (np.array(min_dist_mrt)-min(min_dist_mrt))/(max(min_dist_mrt)-min(min_dist_mrt))
test["dist_pri"] = (np.array(min_dist_pri)-min(min_dist_pri))/(max(min_dist_pri)-min(min_dist_pri))
test["dist_mall"] = (np.array(min_dist_mall)-min(min_dist_mall))/(max(min_dist_mall)-min(min_dist_mall))

In [None]:
# Drop the columns that are not used as model features. train additionally loses the
# helper columns lease_end_year and price_per_sqft, which were only created on the
# training frame.
train = train.drop(columns = ["listing_id", "title", "address", "property_name", "num_beds", "num_baths", "floor_level", "available_unit_types", "total_num_units", "property_details_url", "elevation", "planning_area", "lease_end_year", "price_per_sqft", "furnishing"])
test = test.drop(columns = ["listing_id", "title", "address", "property_name", "num_beds", "num_baths", "floor_level", "furnishing", "available_unit_types", "total_num_units", "property_details_url", "elevation", "planning_area"])

In [None]:
# Mean price per subzone, sorted ascending, gives the bar order of the lower plot.
# Only the price column is averaged; a mean over every column is unnecessary and fails
# if any remaining column is non-numeric.
temp = train.groupby("subzone")[["price"]].mean()
temp = temp.sort_values(by="price",ascending=True)
ax_order = temp.index.tolist()
fig, ax =plt.subplots(2, 1,constrained_layout=True, figsize=(60, 40))
# Top: default subzone order; bottom: ordered by mean price.
s1 = sns.barplot(y="price", x="subzone", data=train, ax=ax[0])
s1.set_xticklabels(s1.get_xticklabels(),rotation = 80)
s2 = sns.barplot(y="price", x="subzone", data=train, order=ax_order, ax=ax[1])
s2.set_xticklabels(s2.get_xticklabels(),rotation = 80)
# Label both panels through their own Axes objects.
for panel in ax:
    panel.set_xlabel("Subzone", fontsize = 30)
    panel.set_ylabel("Average Price", fontsize = 30)

In [None]:
# Use the average training price of all the houses in a subzone to encode the feature
# subzone. The mapping is built once and applied to the subzone column only, instead of
# one whole-frame replace per subzone.
subzone_mean_price = train.groupby("subzone")["price"].mean()
overall_mean_price = train["price"].mean()
train["subzone"] = train["subzone"].map(subzone_mean_price)
# A test subzone with no training row left after outlier removal would otherwise stay
# un-encoded; it falls back to the overall mean training price.
test["subzone"] = test["subzone"].map(subzone_mean_price).fillna(overall_mean_price)

In [None]:
train

In [None]:
test

In [None]:
# Split the input data and labels: everything except "price" is a feature.
X_train = train.drop(columns = ["price"])
y_train = train["price"]
# X_test is the same object as `test` (no copy is made).
X_test = test

In [None]:
# Min-max normalise every feature column.
# Both matrices are scaled with the TRAINING minimum and range, so a given raw value
# maps to the same normalised value in train and test. (Scaling the test set with its
# own min/max would put the two sets on different scales.)
feature_min = X_train.min()
feature_range = X_train.max() - feature_min
X_train_N = (X_train - feature_min) / feature_range
X_test_N = (X_test - feature_min) / feature_range

In [None]:
#pd.DataFrame(X_train).to_csv("X_train.csv")
#pd.DataFrame(X_test).to_csv("X_test.csv")
#pd.DataFrame(y_train).to_csv("y_train.csv")
#pd.DataFrame(X_train_N).to_csv("X_train_N.csv")
#pd.DataFrame(X_test_N).to_csv("X_test_N.csv")

### Models

#### Regression Models

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import *
import time

##### Linear Regressor

In [None]:
regressor = LinearRegression()

# 10-fold cross-validation; the scorer returns negated MSE, so the sign is flipped to
# report the mean MSE over the folds (no extra scaling, consistent with the other models).
start = time.time()
score = cross_val_score(regressor, X_train, y_train, cv=10, scoring='neg_mean_squared_error')
end = time.time()
print("The score for Linear Regression: {}".format(-np.mean(score)))
print("Time for 1 fold: {}".format((end - start)/10))

# Refit on the full training set and write the test predictions.
regressor = regressor.fit(X_train, y_train)
y_predict = regressor.predict(X_test)
pd.DataFrame(y_predict).to_csv("predictions/LinearRegression_prediction.csv")

##### Lasso Regressor

In [None]:
# Search 50 evenly spaced alpha values between 5 and 600 with 10-fold CV on negated MSE.
param_grid = [{'alpha': np.linspace(5, 600, 50)}]

regressor = Lasso()
grid_search = GridSearchCV(regressor, param_grid, cv=10, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

In [None]:
# Report the best estimator and the mean CV score of every candidate. Scores are
# negated MSE (the scoring used for the search), so values closer to 0 are better.
print("The best model is: {} \n".format(grid_search.best_estimator_))

means = grid_search.cv_results_['mean_test_score']
params = grid_search.cv_results_['params']
for mean,param in zip(means,params):
    print("%f  with:   %r" % (mean,param))

In [None]:
# alpha is hard-coded to one of the grid values searched above.
regressor = Lasso(alpha=442.1428571428571)

# 10-fold CV; the scorer returns negated MSE, so the sign is flipped for reporting.
start = time.time()
score = cross_val_score(regressor, X_train, y_train, cv=10, scoring='neg_mean_squared_error')
print("The score for Lasso Regression: {}".format(-np.mean(score)))
end = time.time()
print("Time for 1 fold: {}".format((end - start)/10))

# Refit on the full training set and write the test predictions.
regressor = regressor.fit(X_train, y_train)
y_predict = regressor.predict(X_test)
pd.DataFrame(y_predict).to_csv("predictions/Lasso_prediction.csv")

##### Ridge Regression

In [None]:
# Search 50 evenly spaced alpha values between 1e-10 and 10 with 10-fold CV on negated MSE.
param_grid = [{'alpha': np.linspace(0.0000000001, 10, 50)}]

regressor = Ridge()
grid_search = GridSearchCV(regressor, param_grid, cv=10, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

In [None]:
# Report the best estimator and the mean CV score (negated MSE) of every candidate.
print("The best model is: {} \n".format(grid_search.best_estimator_))

means = grid_search.cv_results_['mean_test_score']
params = grid_search.cv_results_['params']
for mean,param in zip(means,params):
    print("%f  with:   %r" % (mean,param))

In [None]:
# alpha is hard-coded to one of the grid values searched above.
regressor = Ridge(alpha=1.8367346939591838) 

# 10-fold CV; the scorer returns negated MSE, so the sign is flipped for reporting.
start = time.time()
score = cross_val_score(regressor, X_train, y_train, cv=10, scoring='neg_mean_squared_error')
print("The score for Ridge Regression: {}".format(-np.mean(score)))
end = time.time()
print("Time for 1 fold: {}".format((end - start)/10))

# Refit on the full training set and write the test predictions.
regressor = regressor.fit(X_train, y_train)
y_predict = regressor.predict(X_test)
pd.DataFrame(y_predict).to_csv("predictions/Ridge_prediction.csv")

#### Tree Based Models

In [None]:
#import packages
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor

##### Random Forest

In [None]:
# Grid Search (change the param_grid to tune the parameters).
# The active grid varies n_estimators only, with max_depth fixed at 230; the
# commented-out grids are earlier searches over other parameters.

#param_grid = [{'n_estimators': [50, 100, 150, 200, 250, 300], 'max_depth': [50, 100, 150, 200, 250, 300], 'min_samples_split': [2, 4, 6, 8, 10]}]
#param_grid = [{'n_estimators': [70, 80, 90, 100, 110, 120, 130], 'max_depth': [30, 40, 50, 60, 70, 80], 'min_samples_split': [2]}]
param_grid = [{'n_estimators': [10, 30, 50, 70, 90, 110, 130, 150, 170, 190, 210, 230, 250], 'max_depth': [230], 'min_samples_split': [2]}]

#param_grid = [{'n_estimators': [100], 'max_depth': [10, 30, 50, 70, 90, 110, 130, 150, 170, 190, 210, 230, 250], 'min_samples_split': [2]}]


# No random_state is set, so results vary between runs.
regressor = RandomForestRegressor()
grid_search = GridSearchCV(regressor, param_grid, cv=10, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

In [None]:
# Grid search report: print the best estimator and every candidate's mean CV score
# (negated MSE), and keep the scores in RF_score for the tuning plots.
print("The best model is: {} \n".format(grid_search.best_estimator_))

RF_score = []

means = grid_search.cv_results_['mean_test_score']
params = grid_search.cv_results_['params']
for mean,param in zip(means,params):
    print("%f  with:   %r" % (mean,param))
    RF_score.append(mean)

In [None]:
# 10-fold cross validation with hard-coded hyper-parameters.
regressor = RandomForestRegressor(max_depth = 230, n_estimators = 100)

# The scorer returns negated MSE, so the sign is flipped for reporting.
start = time.time()
score = cross_val_score(regressor, X_train, y_train, cv=10, scoring='neg_mean_squared_error')
print("The score for RandomForest: {}".format(-np.mean(score)))
end = time.time()
print("Time for 1 fold: {}".format((end - start)/10))

# Refit on the full training set and write the test predictions.
regressor = regressor.fit(X_train, y_train)
y_predict = regressor.predict(X_test)
pd.DataFrame(y_predict).to_csv("predictions/RandomForest_prediction.csv")

##### Decision Tree

In [None]:
# Grid search (change the param_grid to tune the parameters).
# The active grid varies max_depth with min_samples_split fixed at 6.

#param_grid = [{'max_depth':[10, 30, 50, 70, 90, 110, 130, 150, 170, 190, 210, 230, 250], 'min_samples_split': [2, 4, 6, 8, 10]}]
param_grid = [{'max_depth': [10, 30, 50, 70, 90, 110, 130, 150, 170, 190, 210, 230, 250], 'min_samples_split': [6]}]

regressor = DecisionTreeRegressor(splitter = "best")
grid_search = GridSearchCV(regressor, param_grid, cv=10, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

In [None]:
# Grid search report: print the best estimator and every candidate's mean CV score
# (negated MSE), and keep the scores in DT_score for the tuning plot.
print("The best model is: {} \n".format(grid_search.best_estimator_))

DT_score = []

means = grid_search.cv_results_['mean_test_score']
params = grid_search.cv_results_['params']
for mean,param in zip(means,params):
    print("%f  with:   %r" % (mean,param))
    DT_score.append(mean)

In [None]:
# 10-fold cross validation with hard-coded hyper-parameters.
regressor = DecisionTreeRegressor(max_depth = 110, min_samples_split = 6)

# The scorer returns negated MSE, so the sign is flipped for reporting.
start = time.time()
score = cross_val_score(regressor, X_train, y_train, cv=10, scoring='neg_mean_squared_error')
end = time.time()
# The message names the model that was actually evaluated (it used to say RandomForest).
print("The score for DecisionTree: {}".format(-np.mean(score)))
print("Time for 1 fold: {}".format((end - start)/10))

# Refit on the full training set and write the test predictions.
regressor = regressor.fit(X_train, y_train)
y_predict = regressor.predict(X_test)
pd.DataFrame(y_predict).to_csv("predictions/DecistionTree_prediction.csv")

##### Gradient Boosting Tree

In [None]:
# Grid search (change the param_grid to tune the parameters).
# The active grid varies max_depth with learning_rate fixed at 0.1 and 100 estimators.

#param_grid = [{'leaning_rate':[0.1, 0.01, 0.001, 0.0001], 'max_depth': [50, 100, 150, 200, 250, 300], "n_estimators": [50, 100, 150, 200, 250, 300]}]
param_grid = [{'learning_rate': [0.1], 'max_depth': [10, 30, 50, 70, 90, 110, 130, 150, 170, 190, 210, 230, 250]}]

regressor = GradientBoostingRegressor(n_estimators = 100)
grid_search = GridSearchCV(regressor, param_grid, cv=10, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

In [None]:
# Grid search report: print the best estimator and every candidate's mean CV score
# (negated MSE), and keep the scores in GBT_score for the tuning plot.
print("The best model is: {} \n".format(grid_search.best_estimator_))

GBT_score = []

means = grid_search.cv_results_['mean_test_score']
params = grid_search.cv_results_['params']
for mean,param in zip(means,params):
    print("%f  with:   %r" % (mean,param))
    GBT_score.append(mean)

In [None]:
# 10-fold cross validation with hard-coded hyper-parameters.
regressor = GradientBoostingRegressor(max_depth = 10, n_estimators = 100, learning_rate = 0.1)

# The scorer returns negated MSE, so the sign is flipped for reporting.
start = time.time()
score = cross_val_score(regressor, X_train, y_train, cv=10, scoring='neg_mean_squared_error')
end = time.time()
# The message names the model that was actually evaluated (it used to say RandomForest).
print("The score for GradientBoosting: {}".format(-np.mean(score)))
print("Time for 1 fold: {}".format((end - start)/10))

# Refit on the full training set and write the test predictions.
regressor = regressor.fit(X_train, y_train)
y_predict = regressor.predict(X_test)
pd.DataFrame(y_predict).to_csv("predictions/GradiantBoostingTree_prediction.csv")

##### Visualize Models' tuning

In [None]:
# Tuning plot: 10-fold CV MSE against max_depth for the three tree-based models.
# The x values come from the most recently assigned param_grid (the gradient-boosting
# grid at this point in a top-to-bottom run).
# NOTE(review): with the grids currently active above, RF_score was produced by varying
# n_estimators (max_depth fixed), so the Random Forest curve is only meaningful here
# if the max_depth grid of the Random Forest cell was used instead - confirm.
import seaborn as sns
import matplotlib.pyplot as plt

# Scores are negated MSE, hence the minus sign.
sns.lineplot(x = list(param_grid[0]["max_depth"]), y = -np.array(RF_score), label = "Random Forest Regressor")
sns.scatterplot(x = list(param_grid[0]["max_depth"]), y = -np.array(RF_score))
sns.lineplot(x = list(param_grid[0]["max_depth"]), y = -np.array(DT_score), label = "Decision Tree Regressor")
sns.scatterplot(x = list(param_grid[0]["max_depth"]), y = -np.array(DT_score))
sns.lineplot(x = list(param_grid[0]["max_depth"]), y = -np.array(GBT_score), label = "Gradient Boosted Regressor")
sns.scatterplot(x = list(param_grid[0]["max_depth"]), y = -np.array(GBT_score))

plt.xlabel("max_depth")
plt.ylabel("10-fold cross validation MSE")
plt.legend(loc = 'best')
plt.title("Max_depth Tuning")

#### Ensemble Learning Models

In [None]:
from sklearn.ensemble import *
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

##### AdaBoost

In [None]:
# Grid search (change param_grid to tune the model).
# Base learner: a decision tree with fixed depth and split settings.
decisionTree = DecisionTreeRegressor(max_depth = 150, min_samples_split = 6)

#param_grid = [
#    {'n_estimators': [10, 20, 30, 50, 80, 100],
#     'learning_rate': [0.001, 0.01, 0.1, 1.0]}
#]

# The active grid varies n_estimators with learning_rate fixed at 1.
param_grid = [
    {'n_estimators': [10, 30, 50, 70, 90, 110, 130, 150, 170, 190, 210, 230, 250],
     'learning_rate': [1]}
]

regressor = AdaBoostRegressor(decisionTree)

grid_search = GridSearchCV(regressor, param_grid, cv=10, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

In [None]:
# Grid search report: print the best estimator and every candidate's mean CV score
# (negated MSE), and keep the scores in AB_score for the tuning plot.
print("The best model is: {} \n".format(grid_search.best_estimator_))

AB_score = []

means = grid_search.cv_results_['mean_test_score']
params = grid_search.cv_results_['params']
for mean,param in zip(means,params):
    print("%f  with:   %r" % (mean,param))
    AB_score.append(mean)

In [None]:
# 10-fold cross validation with hard-coded hyper-parameters; decisionTree is the base
# learner defined in the AdaBoost grid-search cell.
regressor = AdaBoostRegressor(decisionTree, n_estimators = 50, learning_rate = 1)

# The scorer returns negated MSE, so the sign is flipped for reporting.
start = time.time()
score = cross_val_score(regressor, X_train, y_train, cv=10, scoring='neg_mean_squared_error')
end = time.time()
# The message names the model that was actually evaluated (it used to say RandomForest).
print("The score for AdaBoost: {}".format(-np.mean(score)))
print("Time for 1 fold: {}".format((end - start)/10))

# Refit on the full training set and write the test predictions.
regressor = regressor.fit(X_train, y_train)
y_predict = regressor.predict(X_test)
pd.DataFrame(y_predict).to_csv("predictions/AdaBoost_prediction.csv")

##### XGBoost

In [None]:
# Grid search (change param_grid to tune the model).
# The active grid varies n_estimators with learning_rate fixed at 0.1.
#param_grid = [
#    {'n_estimators': [10, 20, 30, 50, 80, 100],
#     'learning_rate': [0.001, 0.01, 0.1, 1.0]}
#]

#param_grid = [{'learning_rate': [0.1], 'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]}]

param_grid = [{'learning_rate': [0.1], 'n_estimators': [10, 30, 50, 70, 90, 110, 130, 150, 170, 190, 210, 230, 250]}]

regressor = XGBRegressor()

grid_search = GridSearchCV(regressor, param_grid, cv=10, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

In [None]:
# Grid search report: print the best estimator and every candidate's mean CV score
# (negated MSE), and keep the scores in XG_score for the tuning plot.
print("The best model is: {} \n".format(grid_search.best_estimator_))

XG_score = []

means = grid_search.cv_results_['mean_test_score']
params = grid_search.cv_results_['params']
for mean,param in zip(means,params):
    print("%f  with:   %r" % (mean,param))
    XG_score.append(mean)

In [None]:
# 10-fold cross validation with hard-coded hyper-parameters.
# NOTE(review): n_estimators=900 is outside the active grid above (10-250); it appears
# in the commented-out 100-1000 grid - presumably chosen from that earlier search.
regressor = XGBRegressor(learning_rate=0.1, n_estimators=900)

# The scorer returns negated MSE, so the sign is flipped for reporting.
start = time.time()
score = cross_val_score(regressor, X_train, y_train, cv=10, scoring='neg_mean_squared_error')
print("The score for XGBoost: {}".format(-np.mean(score)))
end = time.time()
print("Time for 1 fold: {}".format((end - start)/10))

# Refit on the full training set and write the test predictions.
regressor = regressor.fit(X_train, y_train)
y_predict = regressor.predict(X_test)
pd.DataFrame(y_predict).to_csv("predictions/XGBoost_prediction.csv")

In [None]:
# Post-process the XGBoost predictions: when a test listing has exactly the same lng
# and size_sqft as one or more training listings, replace its prediction with the mean
# price of those training listings.
# NOTE(review): the prediction file is read from and written back to an absolute local
# path, while the model cell writes to "predictions/XGBoost_prediction.csv" - confirm
# which location is intended.
df_test["predictions"] = np.array(pd.read_csv("D:\\kaggle\\XGBoost_prediction.csv")["0"])

# Mean training price per exact (lng, size_sqft) pair, looked up for every test row in
# one left merge (which preserves the order of the test rows).
known_price = df_train.groupby(["lng", "size_sqft"])["price"].mean().rename("known_price").reset_index()
matched_price = df_test[["lng", "size_sqft"]].merge(known_price, how="left", on=["lng", "size_sqft"])["known_price"].to_numpy()
has_match = ~np.isnan(matched_price)
df_test.loc[has_match, "predictions"] = matched_price[has_match]

# A second pass used to follow here that matched on lat for predictions equal to "null".
# It read an undefined frame (temp_test) and a column that is never created
# ("prediction"), so it raised NameError before the file was written; it was removed.
pd.DataFrame(df_test["predictions"]).to_csv("D:\\kaggle\\XGBoost_prediction.csv")

##### LightGBM

In [None]:
# Grid search (change param_grid to tune the model).
# The active grid varies n_estimators with learning_rate fixed at 0.1.

#param_grid = [
#    {'n_estimators': [10, 20, 30, 50, 80, 100],
#     'learning_rate': [0.001, 0.01, 0.1, 1.0]}
#]

#param_grid = [{'learning_rate': [0.1], 'n_estimators': [100, 200, 300, 400, 500, 600, 700, 800, 900, 1000]}]

param_grid = [{'learning_rate': [0.1], 'n_estimators': [10, 30, 50, 70, 90, 110, 130, 150, 170, 190, 210, 230, 250]}]

regressor = LGBMRegressor()

grid_search = GridSearchCV(regressor, param_grid, cv=10, scoring='neg_mean_squared_error')
grid_search.fit(X_train, y_train)

In [None]:
# Grid search report: print the best estimator and every candidate's mean CV score
# (negated MSE), and keep the scores in LB_score for the tuning plot.
print("The best model is: {} \n".format(grid_search.best_estimator_))

LB_score = []

means = grid_search.cv_results_['mean_test_score']
params = grid_search.cv_results_['params']
for mean,param in zip(means,params):
    print("%f  with:   %r" % (mean,param))
    LB_score.append(mean)

In [None]:
# 10-fold cross validation with hard-coded hyper-parameters.
# NOTE(review): n_estimators=300 is outside the active grid above (10-250); it appears
# in the commented-out 100-1000 grid - presumably chosen from that earlier search.
regressor = LGBMRegressor(learning_rate=0.1, n_estimators=300)

# The scorer returns negated MSE, so the sign is flipped for reporting.
start = time.time()
score = cross_val_score(regressor, X_train, y_train, cv=10, scoring='neg_mean_squared_error')
print("The score for LGBM: {}".format(-np.mean(score)))
end = time.time()
print("Time for 1 fold: {}".format((end - start)/10))

# Refit on the full training set and write the test predictions.
regressor = regressor.fit(X_train, y_train)
y_predict = regressor.predict(X_test)
pd.DataFrame(y_predict).to_csv("predictions/LGBM_prediction.csv")

In [None]:
# Tuning plot: 10-fold CV MSE against n_estimators for the four ensemble models.
# The x values come from the most recently assigned param_grid (the LGBM grid at this
# point); every *_score list must have been produced over the same n_estimators values.
import seaborn as sns
import matplotlib.pyplot as plt

# Scores are negated MSE, hence the minus sign.
sns.lineplot(x = list(param_grid[0]["n_estimators"]), y = -np.array(RF_score), label = "Random Forest Regressor")
sns.scatterplot(x = list(param_grid[0]["n_estimators"]), y = -np.array(RF_score))
sns.lineplot(x = list(param_grid[0]["n_estimators"]), y = -np.array(AB_score), label = "AdaBoost")
sns.scatterplot(x = list(param_grid[0]["n_estimators"]), y = -np.array(AB_score))
sns.lineplot(x = list(param_grid[0]["n_estimators"]), y = -np.array(XG_score), label = "XGBoost")
sns.scatterplot(x = list(param_grid[0]["n_estimators"]), y = -np.array(XG_score))
sns.lineplot(x = list(param_grid[0]["n_estimators"]), y = -np.array(LB_score), label = "LightBoost")
sns.scatterplot(x = list(param_grid[0]["n_estimators"]), y = -np.array(LB_score))

plt.xlabel("n_estimators")
plt.ylabel("10-fold cross validation MSE")
plt.legend(loc = 'best')
plt.title("n_estimators Tuning")

#### Deep Learning Models

Because deep learning models are expensive to train, all deep learning models in this section were trained in Google Colab with a GPU, using the PyTorch framework.

In [None]:
import torch.utils.data as Data
import copy
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import random
from torch import optim
import time

In [None]:
#X_train = pd.read_csv("X_train.csv")
#X_test = pd.read_csv("X_test.csv")
#y_train = pd.read_csv("y_train.csv")

In [None]:
# Drop the first column of every frame.
# NOTE(review): presumably this removes the index column that read_csv adds when
# the frames come from the commented-out CSV loads; if the in-memory frames are
# used instead, this drops a real column, and every re-run of this cell drops one
# more column - confirm.
X_train = X_train.iloc[:, 1:]
X_test = X_test.iloc[:, 1:]
y_train = y_train.iloc[:, 1:]
# Independent copies that receive the min-max normalised features.
X_train_N = copy.deepcopy(X_train)
X_test_N = copy.deepcopy(X_test)

In [None]:
# Min-max scale every feature with the statistics of the TRAINING set only.
# The test set reuses the training minimum and range (columns are matched by
# position), so a given raw value maps to the same scaled value in both sets;
# scaling the test set with its own min/max would put the two sets on different
# scales. Test values outside the training range simply fall outside [0, 1].
for train_col, test_col in zip(X_train.columns, X_test.columns):
    col_min = X_train[train_col].min()
    col_range = X_train[train_col].max() - col_min
    if col_range == 0:
        # Constant feature: avoid 0/0 (NaN) and map the column to 0 instead.
        col_range = 1
    X_train_N[train_col] = (X_train[train_col] - col_min) / col_range
    X_test_N[test_col] = (X_test[test_col] - col_min) / col_range

In [None]:
# Convert the raw frames and their normalised copies to torch tensors. The same
# names are rebound, so after this cell they no longer refer to DataFrames.
X_train, y_train, X_test = (
    torch.tensor(np.array(frame)) for frame in (X_train, y_train, X_test)
)
X_train_N, X_test_N = (
    torch.tensor(np.array(frame)) for frame in (X_train_N, X_test_N)
)

In [None]:
def get_k_fold_data(k, i, X, y):
    """Split (X, y) into the training and validation parts of fold ``i``.

    The rows are cut into ``k`` contiguous folds of ``X.shape[0] // k`` rows
    each. When the row count is not divisible by ``k``, the leftover rows are
    added to the last fold so that every row is used by every split (they used
    to be dropped from both the training and the validation part).

    Args:
        k: number of folds, must be greater than 1.
        i: index of the fold used for validation, ``0 <= i < k``.
        X: 2-D tensor of features, one row per sample.
        y: tensor of targets whose first dimension matches ``X``.

    Returns:
        ``(X_train, y_train, X_valid, y_valid)`` where the validation tensors
        hold fold ``i`` and the training tensors hold all remaining rows in
        their original order.
    """
    assert k > 1
    assert 0 <= i < k, "fold index i must satisfy 0 <= i < k"
    n_rows = X.shape[0]
    fold_size = n_rows // k

    start = i * fold_size
    # The last fold also takes the remainder rows.
    stop = n_rows if i == k - 1 else start + fold_size

    X_valid, y_valid = X[start:stop, :], y[start:stop]
    # Everything before and after the validation slice is training data.
    X_train = torch.cat((X[:start, :], X[stop:, :]), dim=0)
    y_train = torch.cat((y[:start], y[stop:]), dim=0)
    return X_train, y_train, X_valid,y_valid

##### MLP

In [None]:
class Mlp(nn.Module):
    """Two-layer perceptron regressor: Linear -> activation -> dropout -> Linear(1).

    NOTE: the "Mlp+Attention" section of this notebook defines another class
    with the same name, which replaces this one once that cell has been run.

    Args:
        in_features: number of input features per sample.
        hidden_features: width of the hidden layer; defaults to ``in_features``.
        act_layer: activation class instantiated without arguments.
        drop: dropout probability applied to the hidden activations.
    """

    def __init__(self, in_features, hidden_features=None, act_layer=nn.GELU, drop=0.):
        super().__init__()
        hidden_features = hidden_features or in_features
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.fc2 = nn.Linear(hidden_features, 1)
        self.act = act_layer()
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        """Return the prediction for ``x``; the input is cast to float32 first."""
        x = x.to(torch.float32)
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        # No dropout after fc2: this is the regression output itself, and dropping
        # it would randomly zero / rescale predictions during training.
        x = self.fc2(x)
        # Removes the leading dimension only when it has size 1.
        x = x.squeeze(0)
        return x

In [None]:
# 10-fold cross validation of the plain MLP on the un-normalised features.
k_fold = 10
fold_loss = []
learning_process = []  # per fold: list with the mean training loss of every epoch
val_loss = []          # per fold: validation MSE
time_cost = []         # per fold: seconds spent on training + validation

for fold in range(k_fold):

    # Initialize a fresh model and optimizer for every fold. The input width is
    # read from the data instead of being hard-coded.
    model = Mlp(in_features = X_train.shape[1]).cuda()
    criterion=nn.MSELoss()
    optimizer=optim.Adam(model.parameters(),lr=0.001,betas=(0.9,0.999),eps=1e-08,weight_decay=0)
    train_epoch, train_loss = [], []
    avg_train_loss_Mlp = []
    epoch_time=[]

    # Split the 10 folds
    X_train_1, y_train_1, X_valid_1, y_valid_1 = get_k_fold_data(k_fold, fold, X_train, y_train)

    # Load the data
    train_loader = Data.DataLoader(
        dataset=Data.TensorDataset(torch.Tensor(X_train_1),y_train_1),
        batch_size=128,
        shuffle=True,
        num_workers=2,
        drop_last=True
    )

    model.train()
    start0 = time.time()
    print("This is the #{} fold.".format(fold+1))
    for epoch in range(128):
        running_loss = 0
        start1 = time.time()
        for i, data in enumerate(train_loader, 0):
            start = time.time()
            # data[0] holds the feature batch, data[1] the target batch.
            t_image, mask = data[0].cuda(),data[1].cuda()
            optimizer.zero_grad()
            outputs = model(t_image) # forward
            # The targets must share the float32 dtype of the model outputs.
            mask=mask.to(torch.float32)
            loss = criterion(outputs, mask) # calculate the loss
            loss.backward() # back propagation
            optimizer.step() # update the weights
            running_loss += loss.item()

            if i % 5 == 0:
                end = time.time()
                # time.time() differences are seconds, not milliseconds.
                print('Epoch {}:[{}/{}], Current Loss: {}, Time: {} s'.format(epoch+1, i, len(train_loader), loss.item(), end - start))
                train_loss.append(loss.item())
                train_epoch.append(str(epoch+1) + '/' + str(i))
        end1 = time.time()
        print('Epoch {}, train Loss: {:.3f} '.format(epoch+1, running_loss/len(train_loader)), "Epoch Time: {} s".format(end1 - start1))
        epoch_time.append(end1-start1)
        avg_train_loss_Mlp.append(running_loss/len(train_loader))
    learning_process.append(avg_train_loss_Mlp)

    # Validation: inference only, so no autograd graph is needed.
    model.eval()
    X_valid_1 = X_valid_1.cuda()
    y_valid_1 = y_valid_1.cuda()
    with torch.no_grad():
        predictions = model(X_valid_1)
        valid_loss = criterion(predictions, y_valid_1)
    # Folds are reported 1-based, like the "This is the #... fold." message above.
    print("The #{} fold's cross validation score is : {}".format(fold+1, valid_loss))
    val_loss.append(valid_loss.item())
    end0 = time.time()
    time_cost.append(end0-start0)
print("The cross_val_score is: {}".format(np.mean(val_loss)))

In [None]:
# Average the training-loss curves over the folds: one mean value per epoch.
Mlp_process = [
    np.mean([fold_curve[epoch] for fold_curve in learning_process])
    for epoch in range(len(learning_process[0]))
]

# Mean time per fold and mean validation loss across the folds.
Mlp_time, Mlp_val_loss = np.mean(time_cost), np.mean(val_loss)

In [None]:
# Report the mean per-fold time and the mean validation MSE of the MLP.
print("The time cost for Mlp is {}".format(Mlp_time))
print("The 10-fold MSE score for Mlp is {}".format(Mlp_val_loss))

In [None]:
# Reload the test features (without the first CSV column) and attach the
# predictions of `model`, i.e. the network left over from the last fold of the
# training loop, evaluated on the un-normalised X_test tensor.
df_test = pd.read_csv("X_test.csv").iloc[:, 1:]
df_test["predictions"] = model(X_test.cuda()).detach().cpu().numpy()
# Only the prediction column (plus the row index) is written out.
df_test["predictions"].to_csv("predictions/Mlp_predictions.csv")

##### Mlp+Attention

In [None]:
class Mlp(nn.Module):
    """Feature-wise self-attention followed by a two-layer perceptron.

    NOTE: this class reuses the name of the plain ``Mlp`` defined in the MLP
    section of this notebook and replaces it once this cell has been run.

    Args:
        in_features: number of input features per sample.
        hidden_features: width of the hidden layer; defaults to ``in_features``.
        act_layer: activation class instantiated without arguments.
        drop: dropout probability.
        pred: when True the block is a prediction head with a single output;
            when False it is a residual block that keeps ``in_features``
            outputs so that another block can be stacked on top of it.
    """

    def __init__(self, in_features, hidden_features=None, act_layer=nn.GELU, drop=0., pred=True):
        super().__init__()
        hidden_features = hidden_features or in_features
        self.q = nn.Linear(in_features, in_features)
        self.k = nn.Linear(in_features, in_features)
        self.v = nn.Linear(in_features, in_features)
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = act_layer()
        self.pred = pred
        if pred:
            self.fc2 = nn.Linear(hidden_features,1)
        else:
            self.fc2 = nn.Linear(hidden_features, in_features)
        self.drop = nn.Dropout(drop)

    def forward(self, x):
        """Apply attention + MLP to a 2-D (samples, features) input."""
        x = x.to(torch.float32)
        x0 = x
        # Every feature becomes a token of size 1: (samples, features, 1).
        q = self.q(x).unsqueeze(2)
        k = self.k(x).unsqueeze(2)
        v = self.v(x).unsqueeze(2)
        # Feature-to-feature attention weights: (samples, features, features).
        attn = (q @ k.transpose(-2, -1)).softmax(dim=-1)
        # Attention output plus a residual connection to the input.
        x = (attn @ v).squeeze(2) + x0
        x1 = x
        x = self.fc1(x)
        x = self.act(x)
        x = self.drop(x)
        x = self.fc2(x)

        if not self.pred:
            # Residual block: dropout on the branch, then the skip connection.
            # The batch dimension is kept even for a single row, because the
            # next block's unsqueeze(2) needs a 2-D input.
            return self.drop(x) + x1

        # Prediction head: no dropout on the regression output itself, since it
        # would randomly zero / rescale predictions during training.
        # squeeze(0) removes the leading dimension only when it has size 1.
        return x.squeeze(0)


class Mlp_Attn(nn.Module):
    """Two stacked attention blocks: a feature-preserving block, then a prediction head."""

    def __init__(self, in_features, drop=0.):
        super().__init__()
        # Both blocks share every setting except `pred`.
        shared = dict(in_features=in_features, hidden_features=64, act_layer=nn.GELU, drop=drop)
        self.Block1 = Mlp(pred=False, **shared)
        self.Block2 = Mlp(pred=True, **shared)

    def forward(self, x):
        """Run Block1 and feed its output into Block2."""
        hidden = self.Block1(x)
        return self.Block2(hidden)

In [None]:
# 10-fold cross validation of the MLP+Attention model on the normalised features.
k_fold = 10
fold_loss = []
learning_process = []  # per fold: list with the mean training loss of every epoch
val_loss = []          # per fold: validation MSE
time_cost = []         # per fold: seconds spent on training + validation

for fold in range(k_fold):

    # Initialize a fresh model and optimizer for every fold. The input width is
    # read from the data instead of being hard-coded.
    model = Mlp_Attn(in_features = X_train_N.shape[1], drop = 0.1).cuda()
    criterion=nn.MSELoss()
    optimizer=optim.Adam(model.parameters(),lr=0.001,betas=(0.9,0.999),eps=1e-08,weight_decay=0)
    train_epoch, train_loss = [], []
    avg_train_loss_Mlp = []
    epoch_time=[]

    # Split the 10 folds
    X_train_1, y_train_1, X_valid_1, y_valid_1 = get_k_fold_data(k_fold, fold, X_train_N, y_train)

    # Load the data
    train_loader = Data.DataLoader(
        dataset=Data.TensorDataset(torch.Tensor(X_train_1),y_train_1),
        batch_size=128,
        shuffle=True,
        num_workers=2,
        drop_last=True
    )

    model.train()
    start0 = time.time()
    print("This is the #{} fold.".format(fold+1))
    for epoch in range(128):
        running_loss = 0
        start1 = time.time()
        for i, data in enumerate(train_loader, 0):
            start = time.time()
            # data[0] holds the feature batch, data[1] the target batch.
            t_image, mask = data[0].cuda(),data[1].cuda()
            optimizer.zero_grad()
            outputs = model(t_image) # forward
            # The targets must share the float32 dtype of the model outputs.
            mask=mask.to(torch.float32)
            loss = criterion(outputs, mask) # calculate the loss
            loss.backward() # back propagation
            optimizer.step() # update the weights
            running_loss += loss.item()

            if i % 5 == 0:
                end = time.time()
                # time.time() differences are seconds, not milliseconds.
                print('Epoch {}:[{}/{}], Current Loss: {}, Time: {} s'.format(epoch+1, i, len(train_loader), loss.item(), end - start))
                train_loss.append(loss.item())
                train_epoch.append(str(epoch+1) + '/' + str(i))
        end1 = time.time()
        print('Epoch {}, train Loss: {:.3f} '.format(epoch+1, running_loss/len(train_loader)), "Epoch Time: {} s".format(end1 - start1))
        epoch_time.append(end1-start1)
        avg_train_loss_Mlp.append(running_loss/len(train_loader))
    learning_process.append(avg_train_loss_Mlp)

    # Validation: inference only, so no autograd graph is needed.
    model.eval()
    X_valid_1 = X_valid_1.cuda()
    y_valid_1 = y_valid_1.cuda()
    with torch.no_grad():
        predictions = model(X_valid_1)
        valid_loss = criterion(predictions, y_valid_1)
    # Folds are reported 1-based, like the "This is the #... fold." message above.
    print("The #{} fold's cross validation score is : {}".format(fold+1, valid_loss))
    val_loss.append(valid_loss.item())
    end0 = time.time()
    time_cost.append(end0-start0)
print("The cross_val_score is: {}".format(np.mean(val_loss)))

In [None]:
# Average the training-loss curves over the folds: one mean value per epoch.
Mlp_Attn_process = [
    np.mean([fold_curve[epoch] for fold_curve in learning_process])
    for epoch in range(len(learning_process[0]))
]

# Mean time per fold and mean validation loss across the folds.
Mlp_Attn_time, Mlp_Attn_val_loss = np.mean(time_cost), np.mean(val_loss)

In [None]:
# Reload the test features (without the first CSV column) and attach the
# predictions of `model`, i.e. the network left over from the last fold of the
# training loop, evaluated on the normalised X_test_N tensor.
df_test = pd.read_csv("X_test.csv").iloc[:, 1:]
df_test["predictions"] = model(X_test_N.cuda()).detach().cpu().numpy()
# Only the prediction column (plus the row index) is written out.
df_test["predictions"].to_csv("predictions/Mlp_ATTN_predictions.csv")

In [None]:
import matplotlib.pyplot as plt

# Compare the fold-averaged training-loss curves of the two networks; the x axis
# is the position in each list, i.e. the epoch index.
plt.plot(Mlp_process, label = 'Mlp')
plt.plot(Mlp_Attn_process, label = 'Mlp+Attention')

plt.legend()
plt.xlabel("Epoch")
plt.ylabel("MSE")
plt.title("Training Loss")

In [None]:
# The *_time values are means per fold, so multiplying by 10 gives the
# total over the 10 folds.
print("The time cost(10-fold cross validation) for Mlp is {}s".format(Mlp_time * 10))
print("The time cost(10-fold cross validation) for Mlp+Attention is {}s".format(Mlp_Attn_time * 10))
print("The 10-fold cross validation MSE score for Mlp is {}".format(Mlp_val_loss))
print("The 10-fold cross validation MSE score for Mlp+Attention is {}".format(Mlp_Attn_val_loss))

##### Hybrid Model-1(KNN+Random Forest+Decision Tree)

In [None]:
from xgboost import XGBRegressor
from sklearn.ensemble import *
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor

In [None]:
# Fit the three base regressors of the hybrid model and collect their predictions
# on both the training and the test set.
# NOTE(review): the *_train predictions are in-sample (each model predicts the
# rows it was fitted on), so the meta-features built from them are presumably
# optimistic compared with what the models produce on unseen data - confirm
# whether out-of-fold predictions were intended.
# NOTE(review): X_train / y_train / X_test were rebound to torch tensors earlier
# in this section; presumably scikit-learn converts them to arrays - confirm.

# 1) AdaBoost over a decision tree, on the un-normalised features.
decisionTree = DecisionTreeRegressor(max_depth = 150, min_samples_split = 6)
regressor_1 = AdaBoostRegressor(decisionTree, learning_rate = 1, n_estimators = 30)
regressor_1 = regressor_1.fit(X_train, y_train)
y_predict_1_train = regressor_1.predict(X_train)
y_predict_1_test = regressor_1.predict(X_test)

# 2) Random forest, on the un-normalised features.
regressor_2 = RandomForestRegressor(max_depth = 50, n_estimators = 100)
regressor_2 = regressor_2.fit(X_train, y_train)
y_predict_2_train = regressor_2.predict(X_train)
y_predict_2_test = regressor_2.predict(X_test)

# 3) 2-nearest-neighbours, on the min-max normalised features.
regressor_3 = KNeighborsRegressor(n_neighbors = 2)
regressor_3 = regressor_3.fit(X_train_N, y_train)
y_predict_3_train = regressor_3.predict(X_train_N)
y_predict_3_test = regressor_3.predict(X_test_N)

In [None]:
# Stack the three base-model predictions column-wise (one row per sample, one
# column per regressor) and wrap the results as tensors.
train_columns = [
    prediction.reshape(-1, 1)
    for prediction in (y_predict_1_train, y_predict_2_train, y_predict_3_train)
]
test_columns = [
    prediction.reshape(-1, 1)
    for prediction in (y_predict_1_test, y_predict_2_test, y_predict_3_test)
]
temp_train = torch.tensor(np.concatenate(train_columns, axis = 1))
temp_test = torch.tensor(np.concatenate(test_columns, axis = 1))

In [None]:
class hybrid_1(nn.Module):
    """Small meta-model that combines the predictions of several base regressors.

    Args:
        in_features: stored on the instance but not used by the layers; kept so
            that existing calls keep working.
        act_layer: activation class instantiated without arguments.
        drop: dropout probability applied after the hidden layer.
        num_regressors: number of base-model predictions per sample, i.e. the
            width of the input. The default of 3 gives the original layer sizes.
    """

    def __init__(self, in_features, act_layer=nn.GELU, drop=0.1, num_regressors = 3):
        super().__init__()
        self.act = act_layer()
        self.drop = nn.Dropout(drop)
        self.in_features = in_features
        # Layer sizes follow num_regressors instead of a hard-coded 3.
        self.fc1 = nn.Linear(num_regressors, num_regressors)
        self.fc2 = nn.Linear(num_regressors, 1)

    def forward(self, x):
        """Map a (samples, num_regressors) input to a (samples, 1) prediction."""
        predictions = x.to(torch.float32)
        predictions = self.fc1(predictions)
        predictions = self.act(predictions)
        predictions = self.drop(predictions)
        prediction = self.fc2(predictions)
        return prediction

In [None]:
# 10-fold cross validation of the hybrid meta-model on the stacked base predictions.
k_fold = 10
fold_loss = []
learning_process = []  # per fold: list with the mean training loss of every epoch
val_loss = []          # per fold: validation MSE
time_cost = []         # per fold: seconds spent on training + validation
model_list = []        # the trained model of every fold, in fold order

for fold in range(k_fold):

    # Initialize a fresh model and optimizer for every fold. The number of
    # regressors is read from the stacked predictions instead of being assumed.
    model = hybrid_1(in_features = 11, drop = 0.1, num_regressors = temp_train.shape[1]).cuda()
    criterion=nn.MSELoss()
    optimizer=optim.Adam(model.parameters(),lr=0.001,betas=(0.9,0.999),eps=1e-08,weight_decay=0)
    train_epoch, train_loss = [], []
    avg_train_loss_Hybrid = []
    epoch_time=[]

    # Split the 10 folds
    X_train_1, y_train_1, X_valid_1, y_valid_1 = get_k_fold_data(k_fold, fold, temp_train, y_train)

    # Load the data
    train_loader = Data.DataLoader(
        dataset=Data.TensorDataset(torch.Tensor(X_train_1),y_train_1),
        batch_size=128,
        shuffle=True,
        num_workers=2,
        drop_last=True
    )

    model.train()
    start0 = time.time()
    print("This is the #{} fold.".format(fold+1))
    for epoch in range(128):
        running_loss = 0
        start1 = time.time()
        for i, data in enumerate(train_loader, 0):
            start = time.time()
            # data[0] holds the feature batch, data[1] the target batch.
            t_image, mask = data[0].cuda(),data[1].cuda()
            optimizer.zero_grad()
            outputs = model(t_image) # forward
            # The targets must share the float32 dtype of the model outputs.
            mask=mask.to(torch.float32)
            loss = criterion(outputs, mask) # calculate the loss
            loss.backward() # back propagation
            optimizer.step() # update the weights
            running_loss += loss.item()

            if i % 5 == 0:
                end = time.time()
                # time.time() differences are seconds, not milliseconds.
                print('Epoch {}:[{}/{}], Current Loss: {}, Time: {} s'.format(epoch+1, i, len(train_loader), loss.item(), end - start))
                train_loss.append(loss.item())
                train_epoch.append(str(epoch+1) + '/' + str(i))
        end1 = time.time()
        print('Epoch {}, train Loss: {:.3f} '.format(epoch+1, running_loss/len(train_loader)), "Epoch Time: {} s".format(end1 - start1))
        epoch_time.append(end1-start1)
        avg_train_loss_Hybrid.append(running_loss/len(train_loader))
    learning_process.append(avg_train_loss_Hybrid)

    # Validation: inference only, so no autograd graph is needed.
    model.eval()
    X_valid_1 = X_valid_1.cuda()
    y_valid_1 = y_valid_1.cuda()
    with torch.no_grad():
        predictions = model(X_valid_1)
        valid_loss = criterion(predictions, y_valid_1)
    # Folds are reported 1-based, like the "This is the #... fold." message above.
    print("The #{} fold's cross validation score is : {}".format(fold+1, valid_loss))
    val_loss.append(valid_loss.item())
    end0 = time.time()
    time_cost.append(end0-start0)
    model_list.append(model)
print("The cross_val_score is: {}".format(np.mean(val_loss)))

In [None]:
# Average the training-loss curves over the folds: one mean value per epoch.
Hybrid_process = [
    np.mean([fold_curve[epoch] for fold_curve in learning_process])
    for epoch in range(len(learning_process[0]))
]

# Mean time per fold and mean validation loss across the folds.
Hybrid_time, Hybrid_val_loss = np.mean(time_cost), np.mean(val_loss)

In [None]:
import matplotlib.pyplot as plt
# Fold-averaged training loss of the hybrid model, one point per epoch.
plt.plot(Hybrid_process)
plt.xlabel("Epoch")
plt.ylabel("MSE")
plt.title("Training Loss")

In [None]:
# Hybrid_time is the mean per fold, so multiplying by 10 gives the total over
# the 10 folds.
print("The time cost(10-fold cross validation) for Hybrid Model is {}s".format(Hybrid_time * 10))
print("The 10-fold cross validation MSE score for Hybrid Model is {}".format(Hybrid_val_loss))

In [None]:
import matplotlib.pyplot as plt
# Validation loss of each fold's model; the x axis is the fold position in val_loss.
plt.plot(val_loss)
plt.xlabel("#Model")
plt.ylabel("MSE")

In [None]:
# Loss of every fold's model on the complete training meta-features.
eva_list_hybrid = []
# Inference only: skip autograd bookkeeping and move the data to the GPU once
# instead of once per model.
with torch.no_grad():
    train_inputs = temp_train.cuda()
    train_targets = y_train.cuda()
    for i in range(k_fold):
        train_set_loss = criterion(model_list[i](train_inputs), train_targets)
        eva_list_hybrid.append(train_set_loss.item())
        print("The {}-th model's loss on training set is {}.".format(i+1, eva_list_hybrid[i]))

In [None]:
#pd.DataFrame(model_list[4](temp_test.cuda()).detach().cpu().numpy()).to_csv("hybrid_predictions.csv")

##### Hybrid Model-2(KNN+AdaBoost+Random Forest+Decision Tree)

In [None]:
# Reload the exported feature tables, dropping the first CSV column of each
# (presumably the saved row index - confirm), and attach the target to the
# training frame as "price".
df_train = pd.read_csv("X_train.csv").iloc[:, 1:]
df_train["price"] = pd.read_csv("y_train.csv").iloc[:, 1:]
df_test = pd.read_csv("X_test.csv").iloc[:, 1:]

In [None]:
# Display the training frame (features plus the "price" column).
df_train

In [None]:
# Start from the hybrid meta-model's predictions; model_list[1] is the model
# trained in the second fold.
# NOTE(review): the choice of index 1 is not explained by the code - presumably
# it was picked from the per-model losses; confirm.
df_test["predictions"] = model_list[1](temp_test.cuda()).detach().cpu().numpy()
df_test

In [None]:
# Replace the prediction (last column of df_test) with the mean training price
# whenever training rows have exactly the same longitude and size as the test
# row. Test columns "5" and "3" are compared with training "lng" and "size_sqft".
# The training columns are compared as whole Series instead of rebuilding a
# Python list from them for every test row.
train_lng = df_train["lng"]
train_size = df_train["size_sqft"]
for i in range(df_test.shape[0]):
    same_unit = (train_lng == df_test["5"][i]) & (train_size == df_test["3"][i])
    if same_unit.any():
        df_test.iloc[i, -1] = df_train.loc[same_unit, "price"].mean()

In [None]:
# Replace the prediction (last column of df_test) with the mean training price
# whenever training rows have exactly the same latitude and size as the test
# row. Test columns "4" and "3" are compared with training "lat" and "size_sqft".
# The training columns are compared as whole Series instead of rebuilding a
# Python list from them for every test row.
train_lat = df_train["lat"]
train_size = df_train["size_sqft"]
for i in range(df_test.shape[0]):
    same_unit = (train_lat == df_test["4"][i]) & (train_size == df_test["3"][i])
    if same_unit.any():
        df_test.iloc[i, -1] = df_train.loc[same_unit, "price"].mean()

In [None]:
# Write only the prediction column (plus the row index) to disk.
df_test["predictions"].to_csv("predictions/Hybrid_2_prediction.csv")

#### Other Models

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import *

##### KNN

In [None]:
# Grid Search(Change the param_grid to tune the parameters)
# Tune n_neighbors of a KNN regressor with 10-fold cross validation on the
# normalised features; candidates are ranked by negated MSE.

param_grid = [{'n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]}]

regressor = KNeighborsRegressor()
grid_search = GridSearchCV(regressor, param_grid, cv=10, scoring='neg_mean_squared_error')
grid_search.fit(X_train_N, y_train)

In [None]:
# Report the winning estimator, then list the mean CV score of every candidate
# and keep those scores in KNN_score.
print("The best model is: {} \n".format(grid_search.best_estimator_))

cv_results = grid_search.cv_results_
means = cv_results['mean_test_score']
params = cv_results['params']

KNN_score = []
for idx in range(len(params)):
    print("%f  with:   %r" % (means[idx], params[idx]))
    KNN_score.append(means[idx])

In [None]:
#10-fold cross validation
# Estimate the MSE of a 2-nearest-neighbours regressor on the normalised
# features, then refit it on the whole training set and export its predictions.
regressor = KNeighborsRegressor(n_neighbors = 2)

start = time.time()
# The scorer returns the negated MSE, hence the sign flip when reporting.
score = cross_val_score(regressor, X_train_N, y_train, cv=10, scoring='neg_mean_squared_error')
print("The score for KNN: {}".format(-np.mean(score)))
end = time.time()
# Elapsed seconds for the whole cross validation (including the print above),
# divided by the number of folds.
print("Time for 1 fold: {}".format((end - start)/10))

regressor = regressor.fit(X_train_N, y_train)
y_predict = regressor.predict(X_test_N)
pd.DataFrame(y_predict).to_csv("predictions/KNN_prediction.csv")

##### SVM

In [None]:
# Grid Search(Change the param_grid to tune the parameters)
# Tune the regularisation parameter C of a linear SVR with 10-fold cross
# validation on the normalised features; candidates are ranked by negated MSE.

# 50 evenly spaced values of C between 1 and 10,000,000.
param_grid = [{'C': np.linspace(1, 10000000, 50)}]

regressor = LinearSVR(max_iter=1000000000)

grid_search = GridSearchCV(regressor, param_grid, cv=10, scoring='neg_mean_squared_error')
grid_search.fit(X_train_N, y_train)

In [None]:
# Report the winning estimator, then list the mean CV score of every candidate
# and keep those scores in SVR_score.
print("The best model is: {} \n".format(grid_search.best_estimator_))

cv_results = grid_search.cv_results_
means = cv_results['mean_test_score']
params = cv_results['params']

SVR_score = []
for idx in range(len(params)):
    print("%f  with:   %r" % (means[idx], params[idx]))
    SVR_score.append(means[idx])

In [None]:
# Estimate the MSE of a linear SVR with a fixed C using 10-fold cross validation
# on the normalised features, then refit on the whole training set and export.
regressor = LinearSVR(max_iter=1000000000, C = 9795918)

start = time.time()
# The scorer returns the negated MSE, hence the sign flip when reporting.
score = cross_val_score(regressor, X_train_N, y_train, cv=10, scoring='neg_mean_squared_error')
print("The score for SVR: {}".format(-np.mean(score)))
end = time.time()
# Elapsed seconds for the whole cross validation (including the print above),
# divided by the number of folds.
print("Time for 1 fold: {}".format((end - start)/10))

regressor = regressor.fit(X_train_N, y_train)
y_predict = regressor.predict(X_test_N)
pd.DataFrame(y_predict).to_csv("predictions/SVR_prediction.csv")

In [None]:
#Tuning Plot
import seaborn as sns
import matplotlib.pyplot as plt

# Plot the cross-validated MSE (negated grid-search score) against every C value
# of the current param_grid: a line for the trend, a scatter for the grid points.
sns.lineplot(x = list(param_grid[0]["C"]), y = -np.array(SVR_score), label = "SVR")
sns.scatterplot(x = list(param_grid[0]["C"]), y = -np.array(SVR_score))

plt.xlabel("C")
plt.ylabel("10-fold cross validation MSE")
plt.legend(loc = 'best')
plt.title("C Tuning")

##### KNN Variant

In [None]:
# Work on a copy of the test features with a placeholder "prediction" column;
# the string "null" marks rows that have not been predicted yet.
# NOTE(review): the following cell indexes temp_test by column name, which needs
# X_test to be a DataFrame, but the deep-learning section rebinds X_test to a
# torch tensor - confirm which X_test is in scope when this cell runs.
temp_test = copy.deepcopy(X_test)
temp_test["prediction"] = "null"

In [None]:
# Rule-based "KNN variant". For every test row, in order of priority:
#   1. mean price of the training rows with the same longitude and size_sqft;
#   2. otherwise, mean price of the training rows with the same latitude and size_sqft;
#   3. otherwise, the price per square foot of the nearest training row (by
#      lat/lng distance) multiplied by the test row's size_sqft.
# The result is written to the last column of temp_test ("prediction").
start = time.time()
# Columns are looked up once instead of rebuilding Python lists for every row.
train_lng = df_train["lng"]
train_lat = df_train["lat"]
train_size = df_train["size_sqft"]
train_price = df_train["price"]
for i in range(temp_test.shape[0]):
    lng = temp_test["lng"][i]
    lat = temp_test["lat"][i]
    size = temp_test["size_sqft"][i]
    same_size = train_size == size

    # Rule 1, then rule 2 only if rule 1 found no training row.
    matches = train_price[(train_lng == lng) & same_size]
    if matches.empty:
        matches = train_price[(train_lat == lat) & same_size]

    if not matches.empty:
        prediction = matches.mean()
    else:
        # Rule 3: nearest neighbour over ALL training rows.
        # NOTE(review): the original code also filtered df_train by property_type
        # here but never used the result; presumably the search was meant to be
        # restricted to the same property type - confirm.
        index = np.argmin(np.sqrt((train_lat-lat)**2+(train_lng-lng)**2))
        # argmin is positional, so use .iloc rather than label-based lookup.
        prediction = (train_price.iloc[index]/train_size.iloc[index]) * size
    temp_test.iloc[i, -1] = prediction
end = time.time()
print("Time for 1 fold: {}".format((end - start)))

In [None]:
# Export the KNN-variant predictions stored in temp_test["prediction"].
# (`y_predict` still holds the output of an earlier model at this point, so
# saving it here would write that model's predictions under the KNN_V name.)
# The column starts out as the string "null", hence the cast to float.
pd.DataFrame(temp_test["prediction"].astype(float).to_numpy()).to_csv("predictions/KNN_V_prediction.csv")

In [None]:
import seaborn as sns
# Annotated heatmap of the pairwise correlations between the columns of `train`.
# NOTE(review): `train` is not defined anywhere in the visible part of the
# notebook - confirm it exists (or whether df_train was meant) before running.
plt.subplots(figsize = (12,12))
sns.heatmap(train.corr(),annot = True,vmax = 1,square = True,cmap = "Blues")
plt.show()