# Ferrario Gabriele 817518 Assignment 1 
____
### Roadmap:
- Data Loading 
- Data Preprocessing
- Model Definition
- Training
- Test Prediction

In [1]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

import numpy as np
import tensorflow as tf
from tensorflow import keras
from keras.layers import Dense
from keras.models import Sequential
from tensorflow.keras import layers

In [2]:
def preprocess_data(X, scaler=None):
    """Scale ``X`` with a min-max scaler, fitting a new one when none is given.

    Args:
        X: Array-like data accepted by the scaler's ``fit``/``transform``.
        scaler: An already fitted scaler exposing ``transform``. When ``None``
            (the default), a new ``MinMaxScaler`` is created and fitted on ``X``.

    Returns:
        A ``(X_scaled, scaler)`` tuple: the transformed data and the scaler that
        produced it, so the same scaler can be reused on other data.
    """
    # Compare against None explicitly: a truthiness test would silently discard
    # any scaler object that happens to evaluate as falsy and refit on X instead.
    if scaler is None:
        scaler = MinMaxScaler()
        scaler.fit(X)
    return scaler.transform(X), scaler

### Data Loading 

In [3]:
# Load the training features. index_col=False stops pandas from using the
# first CSV column as the index, so it is read as a regular column.
x_train = pd.read_csv("X_train.csv", index_col=False)

In [4]:
# Remove the unnamed column present in the CSV file
# (presumably the row index written when the file was saved — TODO confirm).

del x_train['Unnamed: 0']

In [5]:
# Display the training features (pandas renders a truncated preview).
x_train

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,Private_room,Entire_home/apt
0,40.71239,-73.95271,4,2,0.19,1,0,1,0
1,40696.00000,-73.91303,4,17,0.66,2,307,1,0
2,40.62707,-74.02817,3,1,0.04,1,87,0,0
3,40.77910,-73.98565,1,4,0.08,1,0,0,1
4,40.75777,-73.93509,1,0,0.00,1,358,1,0
...,...,...,...,...,...,...,...,...,...
33879,40.71187,-73.95864,1,0,0.00,2,0,1,0
33880,40.74123,-73.90152,30,0,0.00,103,247,1,0
33881,40.73375,-73.95570,2,3,0.23,1,0,1,0
33882,40.73660,-73.92358,30,1,0.19,103,252,1,0


In [6]:
# Load the target and discard the unnamed column present in the CSV file
# in a single chained expression.
y_train = (
    pd.read_csv("Y_train.csv", index_col=False)
    .drop(columns="Unnamed: 0")
)

### Data Preprocessing
Ho eliminato le istanze che presentavano un valore del campo price maggiore o uguale a 9999.

In [7]:
# Show the rows whose price is at least 9999 (the candidates for removal).
y_train.loc[y_train["price"]>=9999]

Unnamed: 0,price
4255,10000
9886,9999
10350,9999
23380,10000
25158,9999


In [8]:
# Attach the target to the feature frame so that filtering rows on price keeps
# features and target aligned. The "price" column is selected explicitly:
# assigning a whole DataFrame to a single column only works while y_train has
# exactly one column.
x_train["price"] = y_train["price"]

In [9]:
# Keep only the listings priced below 9999, then display the filtered frame.
# The surviving rows keep their original index labels.
x_train = x_train.loc[x_train["price"]<9999]
x_train

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,Private_room,Entire_home/apt,price
0,40.71239,-73.95271,4,2,0.19,1,0,1,0,65
1,40696.00000,-73.91303,4,17,0.66,2,307,1,0,57
2,40.62707,-74.02817,3,1,0.04,1,87,0,0,225
3,40.77910,-73.98565,1,4,0.08,1,0,0,1,175
4,40.75777,-73.93509,1,0,0.00,1,358,1,0,125
...,...,...,...,...,...,...,...,...,...,...
33879,40.71187,-73.95864,1,0,0.00,2,0,1,0,150
33880,40.74123,-73.90152,30,0,0.00,103,247,1,0,59
33881,40.73375,-73.95570,2,3,0.23,1,0,1,0,60
33882,40.73660,-73.92358,30,1,0.19,103,252,1,0,39


In [10]:
# Distribution of the Private_room indicator.
x_train["Private_room"].value_counts()

0    18427
1    15452
Name: Private_room, dtype: int64

In [11]:
# Distribution of the Entire_home/apt indicator.
x_train["Entire_home/apt"].value_counts()

1    17621
0    16258
Name: Entire_home/apt, dtype: int64

Analizzando i campi Private_room e Entire_home/apt si può notare che ci sono istanze con entrambi i valori a zero; questo potrebbe indicare un problema nei dati, poiché per ogni istanza mi aspetto un 1 in almeno uno di questi campi.

In [12]:
# Training rows where both room-type indicators are 0.
x_train.loc[(x_train["Private_room"]==0) & (x_train["Entire_home/apt"]==0)]

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,Private_room,Entire_home/apt,price
2,40.62707,-74.02817,3,1,0.04,1,87,0,0,225
6,40.74320,-73.97605,2,0,0.00,3,324,0,0,39
9,40.71714,-73.95447,2,80,0.98,1,364,0,0,195
13,40.65421,-73.96155,2,10,0.93,1,0,0,0,47
60,40.66383,-73.92706,2,7,0.15,1,238,0,0,95
...,...,...,...,...,...,...,...,...,...,...
33578,40.76691,-73.98726,1,3,1.96,5,23,0,0,95
33640,40.71825,-73.83502,1,68,3.62,1,125,0,0,60
33706,40.72104,-73.93985,30,3,0.19,10,365,0,0,35
33776,40.61984,-73.97872,30,8,0.42,1,179,0,0,50


Aggiorno la variabile target

In [13]:
# Rebuild the target from the filtered frame: pop() removes "price" from
# x_train in place and returns it as a Series, which becomes a one-column
# DataFrame carrying the same row labels as the remaining features.
y_train = x_train.pop("price").to_frame()

In [14]:
# Record the feature names while x_train is still a DataFrame
# (the scaling step later replaces it with a NumPy array).
columns = x_train.columns
columns

Index(['latitude', 'longitude', 'minimum_nights', 'number_of_reviews',
       'reviews_per_month', 'calculated_host_listings_count',
       'availability_365', 'Private_room', 'Entire_home/apt'],
      dtype='object')

Verifico la presenza di eventuali valori mancanti

In [15]:
# Number of missing values per feature.
x_train.isna().sum()

latitude                          0
longitude                         0
minimum_nights                    0
number_of_reviews                 0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
Private_room                      0
Entire_home/apt                   0
dtype: int64

Analizzo le statistiche descrittive e trasformo le features ridimensionandole in un intervallo [0, 1].

In [16]:
# Summary statistics of each feature before scaling.
x_train.describe()

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,Private_room,Entire_home/apt
count,33879.0,33879.0,33879.0,33879.0,33879.0,33879.0,33879.0,33879.0,33879.0
mean,393.794218,-767.426857,7.103161,23.394964,1.095611,7.101981,112.996842,0.456094,0.520116
std,3773.642416,7124.27882,20.223105,44.610534,1.610519,32.749248,131.640565,0.498076,0.499603
min,40.49979,-74142.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0
25%,40.69042,-73.9835,1.0,1.0,0.04,1.0,0.0,0.0,0.0
50%,40.72364,-73.95602,3.0,5.0,0.37,1.0,45.0,0.0,1.0
75%,40.763905,-73.937,5.0,24.0,1.59,2.0,227.0,1.0,1.0
max,40894.0,-73.71299,1000.0,629.0,58.5,327.0,365.0,1.0,1.0


In [17]:
# Fit a MinMaxScaler on the training features and scale them. From here on
# x_train is a NumPy array; x_scaler is kept to transform the test set.
# NOTE(review): describe() reports a latitude max of 40894 and a longitude min
# of -74142, which look like coordinates missing their decimal point. With
# min-max scaling these extremes squeeze the ordinary coordinates into a tiny
# range (about 1e-6 and 0.99999 in the output below), so the two features carry
# almost no signal. Consider correcting these values before fitting — TODO confirm.
x_train, x_scaler = preprocess_data(x_train)
x_train

array([[5.20396047e-06, 9.99996764e-01, 3.00300300e-03, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [9.95153414e-01, 9.99997299e-01, 3.00300300e-03, ...,
        8.41095890e-01, 1.00000000e+00, 0.00000000e+00],
       [3.11552252e-06, 9.99995745e-01, 2.00200200e-03, ...,
        2.38356164e-01, 0.00000000e+00, 0.00000000e+00],
       ...,
       [5.72680428e-06, 9.99996723e-01, 1.00100100e-03, ...,
        0.00000000e+00, 1.00000000e+00, 0.00000000e+00],
       [5.79656575e-06, 9.99997157e-01, 2.90290290e-02, ...,
        6.90410959e-01, 1.00000000e+00, 0.00000000e+00],
       [4.21310289e-06, 9.99996764e-01, 4.00400400e-03, ...,
        0.00000000e+00, 0.00000000e+00, 1.00000000e+00]])

In [18]:
# Target frame rebuilt from the price-filtered rows.
y_train

Unnamed: 0,price
0,65
1,57
2,225
3,175
4,125
...,...
33879,150
33880,59
33881,60
33882,39


In [19]:
# Scale the target with its own MinMaxScaler. y_scaler is kept so that
# predictions can later be mapped back to prices.
y_train, y_scaler = preprocess_data(y_train.values)
y_train

array([[0.00764706],
       [0.00670588],
       [0.02647059],
       ...,
       [0.00705882],
       [0.00458824],
       [0.01352941]])

Verifico la trasformazione inversa

In [20]:
# Sanity check: undoing the scaling should give back the original prices.
y_scaler.inverse_transform(y_train)

array([[ 65.],
       [ 57.],
       [225.],
       ...,
       [ 60.],
       [ 39.],
       [115.]])

In [21]:
# Features and target must have the same number of rows before the split.
print("len x_train: {}".format(len(x_train)))
print("len y_train: {}".format(len(y_train)))

len x_train: 33879
len y_train: 33879


Suddivido i dati di training in un insieme per il training e uno per la validazione del modello.

In [22]:
# Hold out 10% of the rows for validation; random_state=0 makes the split repeatable.
x_train, x_validation, y_train, y_validation = train_test_split(x_train, y_train, test_size=0.1, random_state=0)

### Model Definition

In [23]:
# Small fully connected regressor: input -> Dense(13, relu) -> Dense(6, relu) -> Dense(1).
# `shape` must be a tuple. The trailing comma matters: (n) is just the int n,
# which newer Keras versions reject.
inputs = keras.Input(shape=(x_train.shape[1],))
x = layers.Dense(13, activation='relu')(inputs)
x = layers.Dense(6, activation='relu')(x)
# No activation on the output: the model regresses the (scaled) price directly.
x = layers.Dense(1)(x)
model = keras.Model(inputs, x)
# Mean squared error is both the training loss and the reported metric.
model.compile(loss='mse', optimizer='adam', metrics=['mse'])

In [24]:
# Print the layers with their output shapes and parameter counts.
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, 9)]               0         
_________________________________________________________________
dense (Dense)                (None, 13)                130       
_________________________________________________________________
dense_1 (Dense)              (None, 6)                 84        
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 7         
Total params: 221
Trainable params: 221
Non-trainable params: 0
_________________________________________________________________


### Training

In [25]:
# Train for 100 epochs with mini-batches of 16 samples, passing the held-out
# split as validation data. The per-epoch metrics are kept in `history`.
history = model.fit(x_train, y_train, batch_size=16, epochs=100, verbose=1, validation_data=(x_validation, y_validation))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100


Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78/100
Epoch 79/100
Epoch 80/100
Epoch 81/100
Epoch 82/100
Epoch 83/100
Epoch 84/100
Epoch 85/100
Epoch 86/100
Epoch 87/100
Epoch 88/100
Epoch 89/100
Epoch 90/100
Epoch 91/100
Epoch 92/100
Epoch 93/100
Epoch 94/100
Epoch 95/100
Epoch 96/100
Epoch 97/100
Epoch 98/100
Epoch 99/100
Epoch 100/100


In [26]:
# Training MSE of the last epoch. It is measured on the min-max scaled target,
# so it is not expressed in price units.
# NOTE(review): the validation counterpart should be available under
# history.history['val_mse'] — confirm and report it as well.
history.history['mse'][-1] 

0.0005571647197939456

### Test Prediction

In [27]:
# Load the test features and discard the unnamed column present in the CSV
# file in a single chained expression.
x_test = (
    pd.read_csv("X_test.csv", index_col=False)
    .drop(columns="Unnamed: 0")
)

In [28]:
# Preview the test features.
x_test

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,Private_room,Entire_home/apt
0,40.68579,-73.97455,2,4,0.17,1,0,0,1
1,40.67085,-73.92316,1,9,0.61,1,0,1,0
2,40.68817,-73.91523,2,31,1.35,5,0,1,0
3,40.67201,-73.86944,3,0,0.00,2,363,1,0
4,40.68696,-73.92905,1,7,0.19,1,0,1,0
...,...,...,...,...,...,...,...,...,...
3760,40.58085,-73.93934,2,73,2.15,1,330,0,1
3761,40.82760,-73.94457,1,4,0.85,2,365,1,0
3762,40.68408,-73.95659,1,4,0.30,1,0,0,1
3763,40.78277,-73.95164,2,2,0.03,1,0,0,1


In [29]:
# Test rows where both room-type indicators are 0 (same pattern as in the training data).
x_test.loc[(x_test["Private_room"]==0) & (x_test["Entire_home/apt"]==0)]

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365,Private_room,Entire_home/apt
28,40687.00000,-73.93446,5,91,1.18,3,248,0,0
68,40.89785,-73.86977,2,2,0.11,2,0,0,0
75,40.64314,-73.99221,5,7,1.00,8,365,0,0
129,40.85362,-73.82949,1,0,0.00,1,364,0,0
139,40.60625,-74089.00000,4,1,0.77,7,68,0,0
...,...,...,...,...,...,...,...,...,...
3439,40.72549,-73.99198,2,0,0.00,1,0,0,0
3455,40.64681,-73.95901,3,5,0.14,1,189,0,0
3592,40.79949,-73.94265,1,8,2.40,7,90,0,0
3682,40.80069,-73.96431,7,0,0.00,1,39,0,0


In [30]:
# Scale the test features with the scaler fitted on the training data (no refit).
# NOTE(review): the test set shows the same suspicious coordinates seen in
# training (e.g. latitude 40687.0 and longitude -74089.0 in the rows displayed
# earlier), which look like values missing a decimal point — TODO confirm and
# handle them consistently with the training data.
x_test, _ = preprocess_data(x_test, x_scaler)

In [31]:
# Predict on the scaled test features. The model was trained on the scaled
# target, so these outputs are on the scaled range, not in price units.
y_test = model.predict(x_test)

In [32]:
# Raw predictions, still on the scaled target range.
y_test

array([[0.02026977],
       [0.0115739 ],
       [0.0115739 ],
       ...,
       [0.02029842],
       [0.02099621],
       [0.0115739 ]], dtype=float32)

In [33]:
# Map the predictions back to price units with the scaler fitted on the target.
y = y_scaler.inverse_transform(y_test)
y

array([[172.29308],
       [ 98.37817],
       [ 98.37817],
       ...,
       [172.53654],
       [178.4678 ],
       [ 98.37817]], dtype=float32)

In [34]:
# Predictions rounded to the nearest integer (returns a new array; y itself is unchanged).
y.round()

array([[172.],
       [ 98.],
       [ 98.],
       ...,
       [173.],
       [178.],
       [ 98.]], dtype=float32)

In [35]:
# Write one rounded prediction per line. The context manager closes the file
# even if writing fails, and a single savetxt call on the (n, 1) array produces
# the same lines as saving each row separately.
with open("Gabriele_Ferrario_817518_score1.txt", "w") as file:
    np.savetxt(file, y.round())