In [None]:
#import libraries
import os
from getpass import getpass

import matplotlib
import pandas
import pyodbc
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [None]:
#import data
# Connection settings are read from environment variables so that no credentials
# are stored in the notebook:
#   HDB_DB_SERVER   - host[,port]      (default: 10.10.0.104,31433)
#   HDB_DB_NAME     - database name    (default: externalDB)
#   HDB_DB_USER     - login name       (default: admin)
#   HDB_DB_PASSWORD - login password   (no default; prompted for when unset)
db_server = os.environ.get("HDB_DB_SERVER", "10.10.0.104,31433")
db_name = os.environ.get("HDB_DB_NAME", "externalDB")
db_user = os.environ.get("HDB_DB_USER", "admin")
db_password = os.environ.get("HDB_DB_PASSWORD") or getpass("Database password: ")

# NOTE: despite its name, conn_str holds the open pyodbc connection object,
# not a connection string. The name is kept so existing cells keep working.
conn_str = pyodbc.connect(
    "DRIVER={ODBC Driver 17 for SQL Server}; "
    f"SERVER={db_server}; DATABASE={db_name};UID={db_user};PWD={db_password}"
)
del db_password  # do not leave the secret lying around in the kernel namespace

# Pull the whole resale-price table into a DataFrame and show it.
query_str = "SELECT * FROM [bdc].[hdb_resale_prices]"
raw_dataframe = pandas.read_sql(sql=query_str, con=conn_str)
print("Dataframe: ")
print(raw_dataframe)


In [None]:
# Data preprocessing
# Overwrite each listed column with the integer codes returned by
# pandas.factorize (element [0] of its result), then show the updated frame.
# Note: this changes raw_dataframe in place.
for column_to_encode in ('town', 'flat_type', 'flat_model'):
    raw_dataframe[column_to_encode] = pandas.factorize(raw_dataframe[column_to_encode])[0]
print(raw_dataframe)

In [None]:
#Observing the relationship between dependent variable and time
#custom range
# Values to filter on. 'town' is compared against the integer code written by
# the preprocessing cell, not against the original town name.
town_value = 0
month_value = 1   # only used by the month filter, which is currently disabled
year_value = 2017

# Build one boolean mask per criterion, then keep the rows matching all of them.
is_selected_town = raw_dataframe['town'] == town_value
# is_selected_month = raw_dataframe['month'] == month_value
is_selected_year = raw_dataframe['year'] == year_value
refined_df = raw_dataframe.loc[is_selected_town & is_selected_year]

# Single-column frame holding the value to be predicted.
dependent_variable = ['resale_price']
dependent_dataset = refined_df[dependent_variable]

print(refined_df)
print(refined_df.shape)

In [None]:
# Plot the dependent variable for every selected row except the last one
# (a slice ending at -1 drops the final row).
rows_to_plot = dependent_dataset.iloc[:-1]
rows_to_plot.plot()
# dependent_dataset.head(-1)[custom_range_start:custom_range_stop].plot() #modify this value to adjust number of rows returned

In [None]:
# Define dependent and independent variables
independent_variables = ['flat_type','floor_area_sqm','flat_model','remaining_lease_months'] #input any number of variables
dependent_variable = ['resale_price'] #only input one variable

# Split data into dependent and independent
independent_dataset = refined_df[independent_variables]
dependent_dataset = refined_df[dependent_variable]

# Further splitting dataset into training and testing subsets
test_ratio=0.2 #fraction of rows held out for testing; the rest is used for training
# Fixed seed: without it the shuffle differs on every run, so the fitted
# coefficients and the model score would not be reproducible.
split_seed = 42
indp_train_set, indp_test_set, dep_train_set, dep_test_set = train_test_split(independent_dataset,
                                                                              dependent_dataset,
                                                                              test_size=test_ratio,
                                                                              random_state=split_seed)

print(indp_train_set) #outputs the generated training set (independent variables only)

In [None]:
# Show the independent-variable frame followed by the dependent-variable frame,
# one after the other.
print(independent_dataset, dependent_dataset, sep="\n")

In [None]:
# Create regression object and train model
# fit() returns the estimator itself, so construction and training are chained.
linear_model = LinearRegression().fit(indp_train_set, dep_train_set)

# Report the learned parameters of the fitted model.
print("Coefficients:", linear_model.coef_)
print("Intercept:", linear_model.intercept_)

In [None]:
# Generate predictions
# Run the fitted model on the held-out independent variables; the bare name on
# the last line makes the notebook display the resulting array.
linear_predictions = linear_model.predict(X=indp_test_set)
linear_predictions

In [None]:
# Compare predictions
# score() evaluates the model on the held-out test set against the actual
# values (for LinearRegression this is the R^2 coefficient of determination).
model_score = linear_model.score(indp_test_set, dep_test_set)
print("Model score: ", model_score)

In [None]:
# Custom predictions
# Rows to predict a price for. Defaults to every row of the filtered dataset;
# replace it with any DataFrame that has the same columns as
# independent_variables. predict() needs a 2D input, so an empty list is
# rejected with a ValueError.
custom_inputs = independent_dataset
linear_predictions = linear_model.predict(custom_inputs)

# Reuse the input's index so each prediction lines up with its actual price in
# dependent_dataset. With a default 0..n-1 index, concat(axis=1) would align on
# mismatched labels and fill both columns with NaN.
predictions_dataframe = pandas.DataFrame(linear_predictions,
                                         index=custom_inputs.index,
                                         columns=['predicted_price'])

# Actual price and predicted price side by side.
output_dataframe = pandas.concat( [dependent_dataset, predictions_dataframe], axis = 1)
print(output_dataframe)