In [273]:
import pandas as pd
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LinearRegression, Ridge

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

csv_file = '/content/drive/My Drive/TFG_AlienEmbarecRiadi/hotel_bookings_modified.csv'
df = pd.read_csv(csv_file, delimiter=',')
df['arrival_year_week'] = df.arrival_date_year*100+df.arrival_date_week_number
df.to_csv("/content/drive/My Drive/TFG_AlienEmbarecRiadi/hotel_bookings_modified.csv", index=False)
data = df.copy()
print(df.head(10))

print(df.info())

          hotel  is_canceled  lead_time  arrival_date_year arrival_date_month  arrival_date_week_number  arrival_date_day_of_month  stays_in_weekend_nights  stays_in_week_nights  adults  children  babies meal country market_segment distribution_channel  is_repeated_guest  previous_cancellations  previous_bookings_not_canceled reserved_room_type assigned_room_type  booking_changes deposit_type  agent  days_in_waiting_list customer_type    adr  required_car_parking_spaces  total_of_special_requests reservation_status reservation_status_date  total_stayed_nights  weekend_nights_proportion  arrival_year_week
0  Resort Hotel            0        342               2015               July                        27                          1                        0                     0       2         0       0   BB     PRT         Direct               Direct                  0                       0                               0                  C                  C                3   No 

In [274]:
data["reservation_status_date"] = pd.to_datetime(data["reservation_status_date"]).dt.strftime("%Y%m%d")
print(data['reservation_status_date'])

0         20150701
1         20150701
2         20150702
3         20150702
4         20150703
            ...   
119385    20170906
119386    20170907
119387    20170907
119388    20170907
119389    20170907
Name: reservation_status_date, Length: 119390, dtype: object


In [275]:
data['meal'].unique()
valores_meal = {'meal':{'SC':0,'BB':1,'HB':2,'FB':3}}
data.replace(valores_meal, inplace=True)
print(data['meal'])

0         1
1         1
2         1
3         1
4         1
         ..
119385    1
119386    1
119387    1
119388    1
119389    2
Name: meal, Length: 119390, dtype: int64


In [276]:
data['hotel'].unique()
valores_hotel = {'hotel':{'Resort Hotel': 1, 'City Hotel': 0}}
data.replace(valores_hotel, inplace=True)
print(data['hotel'])

0         1
1         1
2         1
3         1
4         1
         ..
119385    0
119386    0
119387    0
119388    0
119389    0
Name: hotel, Length: 119390, dtype: int64


In [277]:
data['arrival_date_month'].unique()
valores_arrival_month = {'arrival_date_month':{'January': 1, 'February': 2, 'March': 3, 'April': 4, 'May': 5, 'June': 6, 'July': 7, 'August': 8, 'September': 9, 'October': 10, 'November': 11, 'December': 12}}
data.replace(valores_arrival_month, inplace=True)
print(data['arrival_date_month'])

0         7
1         7
2         7
3         7
4         7
         ..
119385    8
119386    8
119387    8
119388    8
119389    8
Name: arrival_date_month, Length: 119390, dtype: int64


In [278]:
del data['country']

In [279]:
valores_deposit_type = {'deposit_type': {'No Deposit': 0, 'Refundable': 1, 'Non Refund': 2}}
data.replace(valores_deposit_type, inplace=True)

In [280]:
valores_customer_type = {'customer_type': {'Transient': 0, 'Contract': 1, 'Transient-Party': 2, 'Group': 3}}
data.replace(valores_customer_type, inplace=True)

In [281]:
data['reservation_status_date'] = pd.to_numeric(data.reservation_status_date.str.replace('-',''))

In [282]:
print(data['reservation_status_date'])

0         20150701
1         20150701
2         20150702
3         20150702
4         20150703
            ...   
119385    20170906
119386    20170907
119387    20170907
119388    20170907
119389    20170907
Name: reservation_status_date, Length: 119390, dtype: int64


In [283]:
valores_reservation_status = {'reservation_status': {'Check-Out': 0, 'Canceled': 1, 'No-Show': 2}}
data.replace(valores_reservation_status, inplace=True)

In [284]:
print(data['reservation_status'])

0         0
1         0
2         0
3         0
4         0
         ..
119385    0
119386    0
119387    0
119388    0
119389    0
Name: reservation_status, Length: 119390, dtype: int64


In [285]:
print(data['assigned_room_type'])

0         C
1         C
2         C
3         A
4         A
         ..
119385    A
119386    E
119387    D
119388    A
119389    A
Name: assigned_room_type, Length: 119390, dtype: object


In [286]:
valores_reserved_room_type = {'assigned_room_type': {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'L': 8, 'P': 9, 'I': 10, 'K': 11,}}
data.replace(valores_reserved_room_type, inplace=True)
data['assigned_room_type'].unique()

array([ 2,  0,  3,  4,  6,  5, 10,  1,  7,  9,  8, 11])

In [287]:
data['agent'].fillna(0, inplace=True)

In [288]:
data['market_segment'].unique()
valores_market_segment = {'market_segment': {'Direct': 0, 'Corporate': 1, 'Online TA': 2, 'Offline TA/TO': 3, 'Complementary': 5, 'Groups': 6, 'Unknown': 7, 'Aviation': 8}}
data.replace(valores_market_segment, inplace=True)

In [289]:
data['distribution_channel'].unique()
valores_distribution_channel = {'distribution_channel': {'Direct': 0, 'Corporate': 1, 'TA/TO': 2, 'Unknown': 3, 'GDS': 4}}
data.replace(valores_distribution_channel, inplace=True)

In [290]:
valores_reserved_room_type = {'reserved_room_type': {'A': 0, 'B': 1, 'C': 2, 'D': 3, 'E': 4, 'F': 5, 'G': 6, 'H': 7, 'L': 8, 'P': 9}}
data.replace(valores_reserved_room_type, inplace=True)

In [291]:
cols = data.columns[data.dtypes.eq('object')]
cols

Index([], dtype='object')

In [292]:
print(data.head())

   hotel  is_canceled  lead_time  arrival_date_year  arrival_date_month  arrival_date_week_number  arrival_date_day_of_month  stays_in_weekend_nights  stays_in_week_nights  adults  children  babies  meal  market_segment  distribution_channel  is_repeated_guest  previous_cancellations  previous_bookings_not_canceled  reserved_room_type  assigned_room_type  booking_changes  deposit_type  agent  days_in_waiting_list  customer_type   adr  required_car_parking_spaces  total_of_special_requests  reservation_status  reservation_status_date  total_stayed_nights  weekend_nights_proportion  arrival_year_week
0      1            0        342               2015                   7                        27                          1                        0                     0       2         0       0     1               0                     0                  0                       0                               0                   2                   2                3             0    0.0

In [293]:
datos_entrenamiento = data.sample(frac=0.8, random_state=0)
datos_test = data.drop(datos_entrenamiento.index)

etiquetas_entrenamiento = datos_entrenamiento.pop('adr')
etiquetas_test = datos_test.pop('adr')

print(etiquetas_entrenamiento)

5627       37.56
119322     86.10
75310      62.00
90032     133.10
75368      62.00
           ...  
7772      189.00
100386    149.00
95066     107.10
66008     109.80
53253     130.00
Name: adr, Length: 95512, dtype: float64


In [294]:
from sklearn.neighbors import KNeighborsRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge


X_ent, X_test, y_ent, y_test = train_test_split(datos_entrenamiento, etiquetas_entrenamiento)
X_test.shape

(23878, 32)

In [295]:
knn = KNeighborsRegressor(n_neighbors=6)
print('X_ent: ', X_ent)
print(y_ent)
knn.fit(X_ent,y_ent)


X_ent:          hotel  is_canceled  lead_time  arrival_date_year  arrival_date_month  arrival_date_week_number  arrival_date_day_of_month  stays_in_weekend_nights  stays_in_week_nights  adults  children  babies  meal  market_segment  distribution_channel  is_repeated_guest  previous_cancellations  previous_bookings_not_canceled  reserved_room_type  assigned_room_type  booking_changes  deposit_type  agent  days_in_waiting_list  customer_type  required_car_parking_spaces  total_of_special_requests  reservation_status  reservation_status_date  total_stayed_nights  weekend_nights_proportion  arrival_year_week
65631       0            1         69               2017                   4                        14                          6                        0                     3       1         0       0     1               1                     1                  0                       0                               0                   0                   0                0         

KNeighborsRegressor(algorithm='auto', leaf_size=30, metric='minkowski',
                    metric_params=None, n_jobs=None, n_neighbors=6, p=2,
                    weights='uniform')

In [296]:
knn.score(X_test,y_test)

0.558195745086024

In [297]:
del knn

In [298]:
rl=LinearRegression()

In [299]:
rl.fit(X_ent, y_ent)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

In [300]:
rl.score(X_test, y_test)

0.4323652738011268

In [301]:
del rl

In [302]:
ridge = Ridge(alpha=0.2)

In [303]:
ridge.fit(X_ent, y_ent)

Ridge(alpha=0.2, copy_X=True, fit_intercept=True, max_iter=None,
      normalize=False, random_state=None, solver='auto', tol=0.001)

In [304]:
ridge.score(X_test, y_test)

0.43236496965612137

In [305]:
data.to_csv("/content/drive/My Drive/TFG_AlienEmbarecRiadi/hotel_bookings_numeric.csv", index=False)