# Task 3: A regression example: predicting apartment prices

In [67]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score

import warnings 
warnings.filterwarnings('ignore')

In [68]:
# Load the raw data set with pandas; the path is relative to the working directory.
SBERBANK_CSV = 'sberbank.csv'
alldata = pd.read_csv(SBERBANK_CSV)

In [69]:
def get_year(timestamp):
    """Return the year held in the first four characters of `timestamp` as an int."""
    year_prefix = timestamp[:4]
    return int(year_prefix)


# The output column ('price_doc') followed by the 7 input columns.
selected_columns = ['price_doc', 'year', 'full_sq', 'life_sq', 'floor', 'num_room', 'kitch_sq', 'full_all']

# Derive the year from the timestamp string, keep only the selected columns,
# and discard every row that has a missing value in any of them.
alldata = (
    alldata
    .assign(year=alldata.timestamp.apply(get_year))
    .loc[:, selected_columns]
    .dropna()
)

# Shuffle all rows; the fixed seed makes the order reproducible.
alldata_shuffled = alldata.sample(frac=1.0, random_state=0)

In [70]:
# Preview the first five rows of the cleaned (not yet shuffled) frame.
alldata.head(n=5)

Unnamed: 0,price_doc,year,full_sq,life_sq,floor,num_room,kitch_sq,full_all
7672,10100000,2013,73,36.0,17.0,2.0,11.0,102828
8056,2750000,2013,11,11.0,2.0,1.0,12.0,75377
8135,9000000,2013,53,30.0,10.0,2.0,8.0,68630
8144,4457400,2013,41,37.0,13.0,1.0,1.0,9553
8153,7011550,2013,77,41.0,2.0,3.0,12.0,9553


In [71]:
# Inputs: every selected column except the sale price.
X = alldata_shuffled.drop(columns='price_doc')
# Output: the natural logarithm of the sale price.
Y = alldata_shuffled.price_doc.apply(np.log)

# Hold out 20% of the rows as a test set; the seed keeps the split reproducible.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=0,
)

#### Helper: access dictionary items as object attributes (used below to average the cross-validation scores)

In [116]:
class objdict(dict):
    """A dict whose keys can also be read as attributes (`d.key` == `d['key']`)."""

    def __getattr__(self, name):
        """Return `self[name]`; raise AttributeError when `name` is not a key.

        Python only calls this hook after normal attribute lookup has failed,
        so regular dict methods are unaffected.
        """
        if name not in self:
            raise AttributeError("No such attribute: " + name)
        return self[name]

### Dummy Regressor

In [120]:
from sklearn.dummy import DummyRegressor
from sklearn.model_selection import cross_validate

# Baseline regressor to compare the real models against.
m1 = DummyRegressor()
# Cross-validate the estimator created in this cell. The call previously passed
# `dr`, a name that is not defined in any cell above, so it raised a NameError
# on a fresh kernel (Restart & Run All).
cross_validate(m1, Xtrain, Ytrain, scoring='neg_mean_squared_error')

{'fit_time': array([0.01612806, 0.00987911, 0.00212312, 0.00094223, 0.00064588]),
 'score_time': array([0.00239706, 0.00112295, 0.00036097, 0.00018501, 0.00015831]),
 'test_score': array([-0.39897319, -0.37113485, -0.38083108, -0.39057156, -0.40475168])}

In [121]:
from sklearn.metrics import mean_squared_error

# Fit the baseline on the full training set, then report its MSE on the held-out test set.
mean_squared_error(
    y_true=Ytest,
    y_pred=m1.fit(Xtrain, Ytrain).predict(Xtest),
)

0.4028398414133475

### Linear Regression

In [127]:
from sklearn.linear_model import LinearRegression

# Ordinary least-squares model, scored by cross-validation on the training set.
ligr = LinearRegression()
cv1 = cross_validate(
    estimator=ligr,
    X=Xtrain,
    y=Ytrain,
    scoring='neg_mean_squared_error',
)
cv1

{'fit_time': array([0.02741098, 0.00953984, 0.00361013, 0.00314689, 0.00640178]),
 'score_time': array([0.00205898, 0.00125694, 0.00102592, 0.00109005, 0.00123906]),
 'test_score': array([-0.30222063, -0.32537384, -0.29377903, -0.29296258, -0.29265721])}

In [128]:
# Wrap the result dict so its entries read as attributes, then average the per-fold scores.
result1 = objdict(cv1)
np.mean(result1.test_score)

-0.3013986588767175

In [129]:
# Fit on the full training set, then report the MSE on the held-out test set.
mean_squared_error(
    y_true=Ytest,
    y_pred=ligr.fit(Xtrain, Ytrain).predict(Xtest),
)

0.3155890397003767

### Ridge

In [131]:
from sklearn.linear_model import Ridge

# Ridge regression with its default settings, scored by cross-validation on the training set.
rid = Ridge()
cv2 = cross_validate(
    estimator=rid,
    X=Xtrain,
    y=Ytrain,
    scoring='neg_mean_squared_error',
)
cv2

{'fit_time': array([0.01160002, 0.00558591, 0.00360775, 0.00371504, 0.00291491]),
 'score_time': array([0.00380802, 0.00149393, 0.001719  , 0.00124693, 0.001122  ]),
 'test_score': array([-0.30222063, -0.32537046, -0.29377831, -0.29296256, -0.29265724])}

In [132]:
# Wrap the result dict so its entries read as attributes, then average the per-fold scores.
result2 = objdict(cv2)
np.mean(result2.test_score)

-0.3013978423217976

In [133]:
# Fit on the full training set, then report the MSE on the held-out test set.
mean_squared_error(
    y_true=Ytest,
    y_pred=rid.fit(Xtrain, Ytrain).predict(Xtest),
)

0.3155902354580929

### Lasso

In [138]:
from sklearn import linear_model

# Lasso regression with alpha=0.1, scored by cross-validation on the training set.
lasso = linear_model.Lasso(alpha=0.1)
cv3 = cross_validate(
    estimator=lasso,
    X=Xtrain,
    y=Ytrain,
    scoring='neg_mean_squared_error',
)

In [139]:
# Wrap the result dict so its entries read as attributes, then average the per-fold scores.
result3 = objdict(cv3)
np.mean(result3.test_score)

-0.29817864931315446

In [140]:
# Fit on the full training set, then report the MSE on the held-out test set.
mean_squared_error(
    y_true=Ytest,
    y_pred=lasso.fit(Xtrain, Ytrain).predict(Xtest),
)

0.32604901387710894

### Decision Tree Regressor

In [142]:
from sklearn.tree import DecisionTreeRegressor

# Unpruned decision tree, scored by cross-validation on the training set.
# random_state is fixed because the tree builder permutes the candidate
# features at each split, so an unseeded tree can differ between runs. This
# also matches the seeded split and random forest elsewhere in the notebook.
dtr = DecisionTreeRegressor(random_state=0)
cv4 = cross_validate(dtr, Xtrain, Ytrain, scoring='neg_mean_squared_error')
cv4

{'fit_time': array([0.06072497, 0.0364871 , 0.03447795, 0.03521991, 0.03295279]),
 'score_time': array([0.00181508, 0.00128889, 0.00133395, 0.00211835, 0.00109005]),
 'test_score': array([-0.53718287, -0.5402739 , -0.50751788, -0.50755841, -0.54348065])}

In [143]:
# Wrap the result dict so its entries read as attributes, then average the per-fold scores.
result4 = objdict(cv4)
np.mean(result4.test_score)

-0.5272027400221437

In [144]:
# Fit on the full training set, then report the MSE on the held-out test set.
mean_squared_error(
    y_true=Ytest,
    y_pred=dtr.fit(Xtrain, Ytrain).predict(Xtest),
)

0.5669153128106166

### Random Forest Regressor

In [146]:
from sklearn.ensemble import RandomForestRegressor

# Random forest of depth-2 trees (seeded), scored by cross-validation on the training set.
regr = RandomForestRegressor(max_depth=2, random_state=0)
cv5 = cross_validate(
    estimator=regr,
    X=Xtrain,
    y=Ytrain,
    scoring='neg_mean_squared_error',
)
cv5

{'fit_time': array([0.32297707, 0.28441501, 0.28959799, 0.288095  , 0.29025316]),
 'score_time': array([0.00840282, 0.00785303, 0.00805879, 0.00778699, 0.00820589]),
 'test_score': array([-0.30660535, -0.28378306, -0.29688942, -0.30035284, -0.29904841])}

In [147]:
# Wrap the result dict so its entries read as attributes, then average the per-fold scores.
result5 = objdict(cv5)
np.mean(result5.test_score)

-0.29733581591151304

In [148]:
# Fit on the full training set, then report the MSE on the held-out test set.
mean_squared_error(
    y_true=Ytest,
    y_pred=regr.fit(Xtrain, Ytrain).predict(Xtest),
)

0.30696994385672405

### Gradient Boosting Regressor

In [149]:
from sklearn.ensemble import GradientBoostingRegressor

# Gradient-boosted trees with default hyperparameters, scored by
# cross-validation on the training set. random_state is fixed so the underlying
# tree estimators are seeded and repeated runs give the same scores, matching
# the seeded split and random forest elsewhere in the notebook.
gbr = GradientBoostingRegressor(random_state=0)
cv6 = cross_validate(gbr, Xtrain, Ytrain, scoring='neg_mean_squared_error')
cv6

{'fit_time': array([0.54561305, 0.51106524, 0.51332998, 0.52311993, 0.5324049 ]),
 'score_time': array([0.00727892, 0.00405097, 0.00356293, 0.0036149 , 0.00466585]),
 'test_score': array([-0.27659396, -0.24923533, -0.26294343, -0.27079364, -0.26331716])}

In [150]:
# Wrap the result dict so its entries read as attributes, then average the per-fold scores.
result6 = objdict(cv6)
np.mean(result6.test_score)

-0.26457670465511574

In [151]:
# Fit on the full training set, then report the MSE on the held-out test set.
mean_squared_error(
    y_true=Ytest,
    y_pred=gbr.fit(Xtrain, Ytrain).predict(Xtest),
)

0.27141267313163636

### MLP Regressor

In [157]:
from sklearn.neural_network import MLPRegressor

# Multi-layer perceptron, scored by cross-validation on the training set.
# random_state is fixed because weight initialisation and batch sampling are
# random; without a seed every run (and every refit) gives different scores.
# NOTE(review): max_iter=5 allows only five training iterations, so the
# optimizer most likely stops before converging; the warning scikit-learn would
# normally emit is hidden by warnings.filterwarnings('ignore') at the top of the
# notebook.
# NOTE(review): the inputs are passed unscaled; MLPs are sensitive to feature
# scale, so consider standardising the features before drawing conclusions
# from this model's error.
mlp = MLPRegressor(max_iter=5, random_state=0)
cv7 = cross_validate(mlp, Xtrain, Ytrain, scoring='neg_mean_squared_error')
cv7

{'fit_time': array([0.17657709, 0.1296711 , 0.09182572, 0.10353112, 0.12119484]),
 'score_time': array([0.00693679, 0.00324798, 0.00312018, 0.00292897, 0.00188923]),
 'test_score': array([-359.38983003, -113.49057253,  -32.58236138,  -34.29397325,
         -61.27453223])}

In [158]:
# Wrap the result dict so its entries read as attributes, then average the per-fold scores.
result7 = objdict(cv7)
np.mean(result7.test_score)

-120.20625388395487

In [159]:
# Fit on the full training set, then report the MSE on the held-out test set.
mean_squared_error(
    y_true=Ytest,
    y_pred=mlp.fit(Xtrain, Ytrain).predict(Xtest),
)

9.046364683581881

# Task 4: Decision trees for regression