## Part 1

### Data Processing

In [210]:
# imports

import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import metrics

In [186]:
# Had to add encoding option, since the standard is in UTF-8, and our data was not

df = pd.read_csv('assignment_7_data_aml.txt', encoding = "ISO-8859-1")
df.dropna(subset = ["lat"])

Unnamed: 0,address,zip_code,price,sell_date,sell_type,price_per_sq_m,no_rooms,housing_type,size_in_sq_m,year_of_construction,price_change_in_pct,zip_code_num,long,lat
oesterbro 22,2. TH,9000 Aalborg,1295000,30-10-2015,Alm. Salg,185000,20,Lejlighed,700,19000,0,9000,9933196,57046009.0
oesterbro 24,1. TV,9000 Aalborg,2000000,23-10-2015,Alm. Salg,186910,40,Lejlighed,1070,19280,0,9000,9934895,57045929.0
oesterbro 24,3. TV,9000 Aalborg,1898000,27-04-2015,Alm. Salg,186070,40,Lejlighed,1020,19280,0,9000,9934895,57045929.0
oesterbro 29D,2. TH,9000 Aalborg,2945000,25-08-2015,Alm. Salg,239430,30,Lejlighed,1230,20130,-20,9000,9936365,57046424.0
oesterbro 29E,2. TH,9000 Aalborg,2425000,24-09-2015,Alm. Salg,247440,30,Lejlighed,980,20130,-30,9000,9936407,57046491.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Vingaardsgade 11,3. TV,9000 Aalborg,1340000,28-10-2015,Alm. Salg,188730,20,Lejlighed,710,19380,0,9000,9918029,5704703.0
Vingaardsgade 27,4. TV,9000 Aalborg,1750000,05-02-2015,Alm. Salg,147050,30,Lejlighed,1190,19270,-30,9000,9915715,57047735.0
Vingaardsgade 32,2. TH,9000 Aalborg,1160000,23-04-2015,Alm. Salg,175750,20,Lejlighed,660,19310,-30,9000,9915987,57047822.0
Vingaardsgade 4,3. TH,9000 Aalborg,1615000,08-10-2015,Alm. Salg,209740,20,Lejlighed,770,19380,0,9000,9919094,57046892.0


In [187]:
# Need label encoding since it gives cannot convert string to float error otherwise

le = LabelEncoder()
df['zip_code_float'] = le.fit_transform(df['zip_code'])

### Data

In the data we have deleted random rows and outliers, and replaced æøå characters with standard english keyboard characters.

In [188]:
df.head()

Unnamed: 0,address,zip_code,price,sell_date,sell_type,price_per_sq_m,no_rooms,housing_type,size_in_sq_m,year_of_construction,price_change_in_pct,zip_code_num,long,lat,zip_code_float
oesterbro 22,2. TH,9000 Aalborg,1295000,30-10-2015,Alm. Salg,185000,20,Lejlighed,700,19000,0,9000,9933196,57046009.0,146
oesterbro 24,1. TV,9000 Aalborg,2000000,23-10-2015,Alm. Salg,186910,40,Lejlighed,1070,19280,0,9000,9934895,57045929.0,146
oesterbro 24,3. TV,9000 Aalborg,1898000,27-04-2015,Alm. Salg,186070,40,Lejlighed,1020,19280,0,9000,9934895,57045929.0,146
oesterbro 29D,2. TH,9000 Aalborg,2945000,25-08-2015,Alm. Salg,239430,30,Lejlighed,1230,20130,-20,9000,9936365,57046424.0,146
oesterbro 29E,2. TH,9000 Aalborg,2425000,24-09-2015,Alm. Salg,247440,30,Lejlighed,980,20130,-30,9000,9936407,57046491.0,146


## Part 2

### ML Model Selection and Construction

After taking a look at the data we can see that there is a column "price_change_in_pct". This column can be used to predict the change in the value, by labeling them so that a Classification model can use it. 

We were considering using either the KNN or the DecisionTreeClassifier, because the goal is to partition the data in subsets, where they are either adjusted or not adjusted. 

We decided that the DecisionTreeClassifier is a viable option for this use case, using a Divide-and-Conquer approach, where we can create a branch, split instances into subsets and apply recursion.

In [189]:
# we can see that we get various values for the price change in pct, therefore we need to create labels for these.
pd.set_option('display.max_rows', 10)

df['price_change_in_pct']

oesterbro 22           0
oesterbro 24           0
oesterbro 24           0
oesterbro 29D        -20
oesterbro 29E        -30
                    ... 
Vingaardsgade 32     -30
Vingaardsgade 4        0
Vonsyldsgade 17     9000
Vonsyldsgade 37     9000
Willemoesgade 5        0
Name: price_change_in_pct, Length: 610, dtype: int64

#### labeling the price change

To simplify the data we assign 

    - 0 price decrease
    - 1 price static
    - 2 price increase

In [190]:
def labeled_price_pct(price):
    if(price) < 0:
        return 0
    if(price) > 0:
        return 2
    else :
        return 1

In [191]:
df['labeled_price_pct'] = df.apply(lambda x : labeled_price_pct(x['price_change_in_pct']), axis = 1)

In [192]:
df['labeled_price_pct']

oesterbro 22        1
oesterbro 24        1
oesterbro 24        1
oesterbro 29D       0
oesterbro 29E       0
                   ..
Vingaardsgade 32    0
Vingaardsgade 4     1
Vonsyldsgade 17     2
Vonsyldsgade 37     2
Willemoesgade 5     1
Name: labeled_price_pct, Length: 610, dtype: int64

In [193]:
# not too much data as we can see....
df.labeled_price_pct.value_counts()

2    217
0    205
1    188
Name: labeled_price_pct, dtype: int64

In [205]:
# Masking & balancing

m_0 = df[df['labeled_price_pct'] == 0]
m_1 = df[df['labeled_price_pct'] == 1]
m_2 = df[df['labeled_price_pct'] == 2]

b_0 = m_0[:188]
b_2 = m_2[:188]

balanced_data = pd.concat([b_0, m_1, b_2], axis = 0)

balanced_data.dropna()

Unnamed: 0,address,zip_code,price,sell_date,sell_type,price_per_sq_m,no_rooms,housing_type,size_in_sq_m,year_of_construction,price_change_in_pct,zip_code_num,long,lat,zip_code_float,labeled_price_pct
oesterbro 29D,2. TH,9000 Aalborg,2945000,25-08-2015,Alm. Salg,239430,30,Lejlighed,1230,20130,-20,9000,9936365,57046424.0,146,0
oesterbro 29E,2. TH,9000 Aalborg,2425000,24-09-2015,Alm. Salg,247440,30,Lejlighed,980,20130,-30,9000,9936407,57046491.0,146,0
oesterbro 57,ST. TV,9000 Aalborg,970000,23-02-2015,Alm. Salg,151560,20,Lejlighed,640,19340,-30,9000,9941545,57045841.0,146,0
oestergravensgade 12,ST. TH,9000 Aalborg,1130000,03-06-2015,Alm. Salg,198240,20,Lejlighed,570,18890,-40,9000,9927859,57045956.0,146,0
oestre Alle 11,1,9000 Aalborg,1500000,11-06-2015,Alm. Salg,166660,20,Lejlighed,900,19200,-30,9000,9947868,57043636.0,146,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
Malurtvej 29,1. TV,9000 Aalborg,1245000,12-04-2015,Alm. Salg,168240,30,Lejlighed,740,19710,20,9000,9899402,57024315.0,146,2
Nybrogade 4,1. MF,9000 Aalborg,1486000,15-04-2015,Alm. Salg,206380,20,Lejlighed,720,19960,20,9000,9914724,57053653.0,146,2
Poul Paghs Gade 6B,1. TH,9000 Aalborg,1216000,17-06-2015,Alm. Salg,181490,20,Lejlighed,670,19020,20,9000,9910862,5705156.0,146,2
Ryesgade 49,2. TH,9000 Aalborg,950000,28-01-2015,Alm. Salg,175920,20,Lejlighed,540,19280,60,9000,990288,57054005.0,146,2


In [202]:
X = balanced_data[['price_per_sq_m','size_in_sq_m']]
y = balanced_data.labeled_price_pct

In [203]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=1,shuffle=True)

In [206]:
model = DecisionTreeClassifier(max_depth = 3)
model.fit(X_train,y_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=3,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=None, splitter='best')

In [211]:
prediction = model.predict(X_test)
accuracy_score(y_test, prediction)

0.7486631016042781

In [213]:
metrics.confusion_matrix(y_test, prediction)

array([[50, 11,  0],
       [31, 24,  0],
       [ 5,  0, 66]], dtype=int64)

### Conclusion

Not a very great solution,due to the lack of data 