# Import Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor,RandomForestClassifier, GradientBoostingRegressor
from sklearn.neural_network import MLPClassifier, MLPRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import MinMaxScaler,StandardScaler
from sklearn import tree



import warnings
warnings.filterwarnings("ignore")


# Important functions 

In [3]:
def coeffReport(linearModel, featureNames):

    coefs=linearModel.coef_ 

    sorted_coef=list(zip(list(featureNames),coefs))
    # I create a variable that list the zipped 2 list of the features with its respective coeficients 

    sorted_coef_value=sorted(list(zip(list(featureNames,coefs[0]))), key=lambda x:x[1],reverse=True)
    
    sorted_coef_abs=sorted(list(zip(list(featureNames,coefs))), key=lambda x:abs(x[1])) 

    #Option 2:
    #coefsAbs = list(map(abs,coefs))
    #sorted_coefs = sorted ((zip(featureNames, coefsAbs)),
    #           key = lambda e:e[1], reverse=True )

    #The different between the 2 option above is that in the one i am applying i am sorting the abs values by their abs value but i am not changing
    #the list. So, in my inform i will still get as an ouput the abs value with their possitive or negative sign.

    most_informative_coef=sorted_coef_value[:3]
    least_informative_coef=sorted_coef_value[-3:]
    abs_informative_coef=sorted_coef_abs[:3]

    print("\nLowest Coefficients ")
    print(least_informative_coef)

    print("\nHighest Coefficients ")
    print(most_informative_coef)

    print("\nCoefficients Closest to Zero ")   #the closest to cero are least informative because the variable logically is multiplies by 0 
    print(abs_informative_coef)



# Read file

In [4]:
#Read and fast visualize the data 
pd.set_option("display.max_columns", None)
pd.set_option("display.max_rows", None)
df = pd.read_csv("clean_housing_data.csv")
print("Shape: ",df.shape)
df.head()

Shape:  (57007, 22)


Unnamed: 0,yearMonth_sale,case_type_dk,zip_code,zip_code_name,erts89_utm32_x,erts89_utm32_y,ed50_x,ed50_y,wgs84_lat,wgs84_lon,residental_area,measured_area,energy_labeled_required,energy_labeled,amount_of_toilets,year_of_construction,renovation_year,first_offer_price,lastest_announced_price,sold_price,days_on_the_market_all_broker,change_broker
0,201805,Rækkehus,2100,København Ø,724656.3099617342,6177510.129902037,724738.3771995398,6177716.274317907,5569125957,1257452761,121,136.0,True,c,2,1882,0,9895000,9500000,8500000,79,0
1,201805,Rækkehus,2100,København Ø,724612.6998841494,6177539.480294425,724694.7669242901,6177745.625006782,5569154296,1257385928,136,151.0,True,c,2,1882,1,9595000,9195000,8750000,96,0
2,202006,Villa,2300,København S,726339.2701369224,6172005.419336407,726421.3337691108,6172211.5248479135,5564111328,1259671021,107,126.0,True,d,1,1927,0,4195000,3995000,3900000,148,0
3,202103,Villa,2300,København S,726314.7447286966,6171972.668490626,7263968081.0,6172178.773859339,5564083099,125962944,140,136.0,True,a,2,2017,0,7495000,7495000,7595000,17,0
4,201108,Villa,2700,Brønshøj,718735.9998927611,6178448.999985718,718818.0324674495,6178655.165567457,5570238375,1248127579,87,259.0,True,g,2,1928,0,2250000,2250000,2275000,15,0


# Prepare data for train_test_split

In [5]:
# For the moment we will not use the latitude and longitud
excluded_cols = ["erts89_utm32_x", "erts89_utm32_y", "ed50_x",
                 "ed50_y", "wgs84_lat", "wgs84_lon"]

#Drop those columns from the data set - IN CASE WE USE THEM DELETE THIS CELL
df = df.drop(excluded_cols, axis=1)




We need to check if there are any categorical features that need hot encoding for some of the machine learning models. 
One way to do this is by checking the data types of the columns:

In [6]:

#Find Categorical columns & save them in a variable
categorical_cols = [col for col in df.columns if df[col].dtype == "object"]

#Get dummies for those categorical columns
if categorical_cols:
    df = pd.get_dummies(df, columns=categorical_cols)

#Print the new shape of DF with dummies
print("This is the shape with the dummies:", df.shape)

This is the shape with the dummies: (57007, 38)


# Split data

In [7]:
#Save feature data in X
X = df.drop("sold_price", axis=1)

#Save target in y
y = df["sold_price"]

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X,y , random_state=42)


# Scaled data

First, The standard scaler is created in case it lead to better results

In [8]:

# Creating the scaler variable
scaler = StandardScaler()
#the scaler needs to be trained only in the training data and not in the test data, so it needs to be fit in there.
# What this will do is to adapt to all the values from the training set and create the new parameters dividing 
# all the numbers by the Max number found in the trianig data set
scaler.fit(X_train)

# Once the scaler variable has all the relative numbers, i can transform all the features so my new X_train_scaled will 
# only have numbers potencially between 0-1. On the test data we might find numbers higher than one or just smaller since 
# it will be divided by the higher number in the trainig set
X_train_standard_scaled = scaler.transform(X_train)
X_test_standard_scaled = scaler.transform(X_test) 

# Linear regression 

#### Example for NON-Scaled Data

In [9]:
name=(list(X.columns))


In [13]:
#Create the Linear regressor & Fit in the NON-Scaled DATA
linear_reg=LinearRegression().fit(X_train, y_train)


print("Accuracy of the Linear Regression, on the training set: {:.3f}".format(linear_reg.score(X_train, y_train)))
print("Accuracy of the Linear Regression, on the test set: {:.3f}".format(linear_reg.score(X_test, y_test)))

coeffReport(linear_reg,name)

Accuracy of the Linear Regression, on the training set: 0.755
Accuracy of the Linear Regression, on the test set: 0.709


TypeError: list expected at most 1 argument, got 2

In [None]:
Linear_reg= LinearRegression().fit(X_train, y_train)

print("Accuracy of the Linear Regression, on the training set: {:.3f}".format(Linear_reg.score(X_train, y_train)))
print("Accuracy of the Linear Regression, on the test set: {:.3f}".format(Linear_reg.score(X_test, y_test)))

#The score will represent the sum of squared differences divided by the amount of sample for both training & test set, 




In [None]:

linreg= LinearRegression().fit(X_train,y_train)


print("Linear model intercept (b): {}".format(linreg.intercept_)) # Underscore denotes a quantity derived from training data, as opposed to a user setting.
print("Linear model coeff (w): {}".format(linreg.coef_))

#linreg.coef_ --> will return w (the slope of the equation of the line)
#linreg.intercept_ --> will return b (the y intercept of the line)


print("R-squared score (training): {:.3f}".format(linreg.score(X_train, y_train))
print("R-squared score (test): {:.3f}".format(linreg.score(X_test, y_test))
