In [1]:
import os 
import sys 
import warnings 

import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 

In [3]:
# All three Hotel-A splits live in the same directory; keep the prefix in one place.
DATA_DIR = '../../../data/data/'

train_df = pd.read_csv(DATA_DIR + 'Hotel-A-train.csv')
val_df = pd.read_csv(DATA_DIR + 'Hotel-A-validation.csv')
test_df = pd.read_csv(DATA_DIR + 'Hotel-A-test.csv')

In [6]:
# List every column name of the training frame, then preview its first rows.
print(train_df.columns)
train_df.head()

Index(['Reservation-id', 'Gender', 'Age', 'Ethnicity', 'Educational_Level',
       'Income', 'Country_region', 'Hotel_Type', 'Expected_checkin',
       'Expected_checkout', 'Booking_date', 'Adults', 'Children', 'Babies',
       'Meal_Type', 'Visted_Previously', 'Previous_Cancellations',
       'Deposit_type', 'Booking_channel', 'Required_Car_Parking',
       'Reservation_Status', 'Use_Promotion', 'Discount_Rate', 'Room_Rate'],
      dtype='object')


Unnamed: 0,Reservation-id,Gender,Age,Ethnicity,Educational_Level,Income,Country_region,Hotel_Type,Expected_checkin,Expected_checkout,...,Meal_Type,Visted_Previously,Previous_Cancellations,Deposit_type,Booking_channel,Required_Car_Parking,Reservation_Status,Use_Promotion,Discount_Rate,Room_Rate
0,39428300,F,40,Latino,Grad,<25K,North,City Hotel,7/1/2015,7/2/2015,...,BB,No,No,No Deposit,Online,Yes,Check-In,Yes,10,218
1,77491756,F,49,Latino,Mid-School,50K -- 100K,East,City Hotel,7/1/2015,7/2/2015,...,BB,No,No,Refundable,Online,Yes,Check-In,No,0,185
2,73747291,F,42,caucasian,Grad,<25K,East,City Hotel,7/2/2015,7/6/2015,...,BB,No,No,No Deposit,Online,Yes,Check-In,No,0,119
3,67301739,M,25,African American,College,>100K,South,Airport Hotels,7/2/2015,7/3/2015,...,BB,No,No,Refundable,Agent,Yes,Check-In,Yes,5,144
4,77222321,F,62,Latino,High-School,25K --50K,East,Resort,7/3/2015,7/4/2015,...,BB,No,No,No Deposit,Direct,No,Check-In,Yes,10,242


column masking

In [11]:
# Columns kept for modelling. Compared with the raw frame this leaves out the
# reservation id and the three date columns, and puts 'Reservation_Status'
# last so that positional slicing (iloc[:, :-1] / iloc[:, -1]) separates the
# features from the target.
col_mask = [
    'Gender',
    'Age',
    'Ethnicity',
    'Educational_Level',
    'Income',
    'Country_region',
    'Hotel_Type',
    'Adults',
    'Children',
    'Babies',
    'Meal_Type',
    'Visted_Previously',
    'Previous_Cancellations',
    'Deposit_type',
    'Booking_channel',
    'Required_Car_Parking',
    'Use_Promotion',
    'Discount_Rate',
    'Room_Rate',
    'Reservation_Status',
]


In [12]:
# Keep only the modelling columns, in the order given by col_mask.
ms_train_df = train_df[col_mask]

categorical encoding

In [13]:
def cat_to_int(df, columns, enc=None):
    """Integer-encode the given columns of a DataFrame.

    The function has two modes, chosen by ``enc``:

    * fit mode (``enc`` is ``None`` or an empty dict): for each column, every
      distinct value is assigned an integer code in order of first appearance
      (0, 1, 2, ...). Returns ``(encoded_df, maps)`` where ``maps`` is
      ``{column: {value: code}}``.
    * apply mode (``enc`` is a non-empty dict of the same shape as ``maps``):
      the stored mappings are applied to the columns. Returns only the
      encoded DataFrame. Values missing from a mapping come out as NaN,
      because ``Series.map`` is used for the lookup.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is copied, never modified.
    columns : iterable of str
        Names of the columns to encode.
    enc : dict or None, optional
        Previously fitted mappings. Defaults to ``None`` (fit mode).

    Returns
    -------
    pandas.DataFrame or (pandas.DataFrame, dict)
        See the mode description above.
    """
    df = df.copy()

    # `None` replaces the former mutable default `{}`; an explicitly passed
    # empty dict is still treated as "nothing fitted yet".
    if not enc:
        maps = {}
        for col in columns:
            mapping = {value: code for code, value in enumerate(df[col].unique())}
            df[col] = df[col].map(mapping)
            maps[col] = mapping
        return df, maps

    for col in columns:
        df[col] = df[col].map(enc[col])
    return df

In [16]:
# Integer-encode the listed columns and keep the fitted value -> code maps.
# NOTE(review): 'Discount_Rate' looks numeric in the raw preview; encoding it
# by order of first appearance discards its magnitude — confirm this is intended.
enc_train_df, maps = cat_to_int(
    ms_train_df,
    [
        'Gender',
        'Ethnicity',
        'Educational_Level',
        'Income',
        'Country_region',
        'Hotel_Type',
        'Meal_Type',
        'Visted_Previously',
        'Previous_Cancellations',
        'Deposit_type',
        'Booking_channel',
        'Required_Car_Parking',
        'Use_Promotion',
        'Discount_Rate',
        'Reservation_Status',
    ],
)

In [17]:
# Preview the encoded training frame.
enc_train_df.head()

Unnamed: 0,Gender,Age,Ethnicity,Educational_Level,Income,Country_region,Hotel_Type,Adults,Children,Babies,Meal_Type,Visted_Previously,Previous_Cancellations,Deposit_type,Booking_channel,Required_Car_Parking,Use_Promotion,Discount_Rate,Room_Rate,Reservation_Status
0,0,40,0,0,0,0,0,2,2,0,0,0,0,0,0,0,0,0,218,0
1,0,49,0,1,1,1,0,3,3,0,0,0,0,1,0,0,1,1,185,0
2,0,42,1,0,0,1,0,3,3,0,0,0,0,0,0,0,1,1,119,0
3,1,25,2,2,2,2,1,4,3,0,0,0,0,1,1,0,0,2,144,0
4,0,62,0,3,3,1,2,1,1,0,0,0,0,0,2,1,0,0,242,0


Model-based feature selection (SelectFromModel; a random forest is used as the estimator below)

In [36]:
import matplotlib.pyplot as plt
import numpy as np

from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LassoCV
from sklearn.ensemble import RandomForestClassifier

In [43]:
def Lasso_fs(estimator, X, y, threshold=0.25):
    """Return the column names of ``X`` that SelectFromModel keeps.

    ``estimator`` is fitted on ``(X, y)`` inside a ``SelectFromModel``
    configured with the given ``threshold``; the resulting boolean support
    mask is used to index ``X.columns``, so ``X`` must expose ``.columns``.
    """
    selector = SelectFromModel(
        estimator,
        threshold=threshold,
        prefit=False,
        norm_order=1,
        max_features=None,
    )
    selector.fit(X, y)
    return X.columns[selector.get_support()]

In [45]:
# Fixed seed: a random forest is stochastic, so without it the importances
# (and therefore the selected features) can change from run to run.
estimator = RandomForestClassifier(random_state=42)
# Features are every column except the last; the last column is the target.
features = Lasso_fs(estimator, enc_train_df.iloc[:, :-1], enc_train_df.iloc[:, -1], 0.1)
print(features)

Index(['Age', 'Room_Rate'], dtype='object')


Recursive feature elimination

In [59]:
from sklearn.feature_selection import RFE
from sklearn.feature_selection import RFECV

In [60]:
def rfe(model, X, y, n_features_to_select=10, step=1):
    """Rank the features of ``X`` with recursive feature elimination.

    Parameters
    ----------
    model : estimator
        Estimator handed to ``RFE`` as its ``estimator``.
    X, y :
        Feature matrix and target passed to ``RFE.fit``.
    n_features_to_select : int, optional
        Number of features RFE keeps (default 10, the value that was
        previously hard-coded).
    step : int, optional
        Passed to ``RFE`` as ``step`` (default 1, as before).

    Returns
    -------
    pandas.DataFrame
        Single-column frame holding ``ranking_``: row ``i`` is the rank of the
        i-th feature of ``X``; selected features have rank 1.
    """
    selector = RFE(estimator=model, n_features_to_select=n_features_to_select, step=step)
    selector.fit(X, y)
    return pd.DataFrame(selector.ranking_)

In [62]:
# Rank all features (every column but the last) against the target (last column).
fs_mask = rfe(
    model=estimator,
    X=enc_train_df.iloc[:, :-1],
    y=enc_train_df.iloc[:, -1],
)

In [63]:
# Expose each feature's position as an 'index' column next to its RFE rank.
# Rebinding instead of inplace=True, plus the guard, keeps this cell safe to
# re-run: a second reset_index would insert an extra 'level_0' column and
# shift the rank column that is read by position afterwards.
if 'index' not in fs_mask.columns:
    fs_mask = fs_mask.reset_index()

In [66]:
# Positions of the features whose rank (second column) is 1, i.e. the selected ones.
ch_cols = fs_mask.loc[fs_mask.iloc[:, 1].eq(1), 'index'].values

In [67]:
# Select columns by position. ch_cols was computed against enc_train_df.iloc[:, :-1];
# the same positions are valid on the full frame because only the last column was dropped.
fin_df = enc_train_df.iloc[:, ch_cols]

In [68]:
# Preview the frame restricted to the selected features.
fin_df.head()

Unnamed: 0,Age,Ethnicity,Educational_Level,Income,Country_region,Hotel_Type,Adults,Booking_channel,Discount_Rate,Room_Rate
0,40,0,0,0,0,0,2,0,0,218
1,49,0,1,1,1,0,3,0,1,185
2,42,1,0,0,1,0,3,0,1,119
3,25,2,2,2,2,1,4,1,2,144
4,62,0,3,3,1,2,1,2,0,242
