In [2]:
# coding: utf8

## import

In [3]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
import warnings
%matplotlib inline
warnings.filterwarnings("ignore")
sns.set(style="white", color_codes=True)

In [5]:
from sklearn.cross_validation import train_test_split

## constantes

In [6]:
SEED_STATE = 1
KFOLDS_NUM = 10

##### Kaggle dataset description for the Expedia challenge
*checkout: [Kaggle web site](https://www.kaggle.com/c/expedia-hotel-recommendations/data)*

**train.csv**

Column | name |	Description	Data type
-------|------|----------------------
date_time |	Timestamp |	string
site_name |	ID of the Expedia point of sale (i.e. Expedia.com, Expedia.co.uk, Expedia.co.jp, ...) |	int
posa_continent |	ID of continent associated with site_name |	int
user_location_country |	The ID of the country the customer is located |	int
user_location_region |	The ID of the region the customer is located |	int
user_location_city |	The ID of the city the customer is located |	int
orig_destination_distance |	Physical distance between a hotel and a customer at the time of search. A null means the distance could not be calculated |	double
user_id |	ID of user |	int
is_mobile |	1 when a user connected from a mobile device, 0 otherwise |	tinyint
is_package |	1 if the click/booking was generated as a part of a package (i.e. combined with a flight), 0 otherwise |	int
channel |	ID of a marketing channel |	int
srch_ci |	Checkin date |	string
srch_co |	Checkout date |	string
srch_adults_cnt |	The number of adults specified in the hotel room |	int
srch_children_cnt |	The number of (extra occupancy) children specified in the hotel room |	int
srch_rm_cnt |	The number of hotel rooms specified in the search |	int
srch_destination_id |	ID of the destination where the hotel search was performed |	int
srch_destination_type_id |	Type of destination |	int
hotel_continent |	Hotel continent |	int
hotel_country |	Hotel country |	int
hotel_market |	Hotel market |	int
is_booking |	1 if a booking, 0 if a click |	tinyint
cnt	 | Numer of similar events in the context of the same user session |	bigint
hotel_cluster |	ID of a hotel cluster |	int


**destination.csv**

Column | name |	Description	Data type
-------|------|-----------
srch_destination_id |	ID of the destination where the hotel search was performed |	int
d1-d149 |	latent description of search regions |	double

## laod dataset

In [7]:
DATA_TEST = pd.read_csv("../data/test.csv")
DATA_DEST = pd.read_csv("../data/destinations.csv")
DATA_TRAIN = pd.read_csv("../data/train.csv")

In [8]:
DATA_DEST.head(5)

Unnamed: 0,srch_destination_id,d1,d2,d3,d4,d5,d6,d7,d8,d9,...,d140,d141,d142,d143,d144,d145,d146,d147,d148,d149
0,0,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657,-1.897627,-2.198657,-2.198657,-1.897627,...,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657,-2.198657
1,1,-2.18169,-2.18169,-2.18169,-2.082564,-2.18169,-2.165028,-2.18169,-2.18169,-2.031597,...,-2.165028,-2.18169,-2.165028,-2.18169,-2.18169,-2.165028,-2.18169,-2.18169,-2.18169,-2.18169
2,2,-2.18349,-2.224164,-2.224164,-2.189562,-2.105819,-2.075407,-2.224164,-2.118483,-2.140393,...,-2.224164,-2.224164,-2.196379,-2.224164,-2.192009,-2.224164,-2.224164,-2.224164,-2.224164,-2.057548
3,3,-2.177409,-2.177409,-2.177409,-2.177409,-2.177409,-2.115485,-2.177409,-2.177409,-2.177409,...,-2.161081,-2.177409,-2.177409,-2.177409,-2.177409,-2.177409,-2.177409,-2.177409,-2.177409,-2.177409
4,4,-2.189562,-2.187783,-2.194008,-2.171153,-2.152303,-2.056618,-2.194008,-2.194008,-2.145911,...,-2.187356,-2.194008,-2.191779,-2.194008,-2.194008,-2.185161,-2.194008,-2.194008,-2.194008,-2.188037


In [9]:
print "Number of destination ids: ",len(DATA_DEST.srch_destination_id)

Number of destination ids:  62106


In [10]:
print "Nombre de colonnes présentant des N/A dans le fichier de destination: %i"%\
len(filter(lambda y: y != 0, DATA_DEST.isnull().sum()))
print "Nombre de colonnes présentant des N/A dans le fichier de recherche: %i"%\
len(filter(lambda y: y != 0, DATA_TRAIN.isnull().sum()))

Nombre de colonnes présentant des N/A dans le fichier de destination: 0
Nombre de colonnes présentant des N/A dans le fichier de recherche: 3


In [11]:
print len(DATA_TRAIN["hotel_cluster"].unique())

100


In [12]:
DATA_TRAIN["srch_destination_type_id"].unique()

array([1, 6, 4, 8, 3, 7, 5, 9, 0, 2])

In [10]:
DATA_TRAIN["date_time"] = pd.to_datetime(DATA_TRAIN["date_time"])
DATA_TRAIN["year"] = DATA_TRAIN["date_time"].dt.year
DATA_TRAIN["month"] = DATA_TRAIN["date_time"].dt.month

In [14]:
# Contruction de vecteur utilisateurs
# classsification de "is_booking" sur tout train.csv
# -> cluster partypes -> proba booking cluster ->


In [16]:
#DATA_TRAIN.corr()["hotel_cluster"]

## Sampling

In [11]:
selected = DATA_TEST["user_id"].unique()
DATA_TRAIN = DATA_TRAIN[DATA_TRAIN.user_id.isin(selected)]
TARGET_TRAIN, DUMP_TRAIN =  train_test_split(DATA_TRAIN, test_size=0.20, random_state=1)

In [12]:
print "",len(selected)
print "",len(DATA_TRAIN)
print "",len(TARGET_TRAIN)

 1181577
 37214162
 29771329


In [13]:
selected_users, test_users = train_test_split(selected, test_size=0.99, random_state=SEED_STATE)

In [14]:
print "Nombre de users selectionnés : %s (ex: %s ... )"%(len(selected_users),selected_users[:3])

Nombre de users selectionnés : 11815 (ex: [ 57222 305851 367900] ... )


In [15]:
SELECT_TRAIN = DATA_TRAIN[DATA_TRAIN.user_id.isin(selected_users)]
print "Nombre de recherches liée aux user séléectionés : %s"%(len(SELECT_TRAIN))

Nombre de recherches liée aux user séléectionés : 379145


In [16]:
# Resampling with the same pattern, the validation set is more recent
TRAIN_SET = SELECT_TRAIN[((SELECT_TRAIN.year == 2013) | ((SELECT_TRAIN.year == 2014) & (SELECT_TRAIN.month < 8)))]
VALID_SET = SELECT_TRAIN[((SELECT_TRAIN.year == 2014) &  (SELECT_TRAIN.month >= 8))]

In [17]:
VALID_SET = VALID_SET[VALID_SET.is_booking == True]
print "Number of request to predict",len(VALID_SET)

Number of request to predict 9766


### Dummy algo

In [26]:
# pip install ml_metrics
import ml_metrics as ml

most_common_clusters = list(TRAIN_SET.hotel_cluster.value_counts().head().index)
dummy_predictions = [most_common_clusters for i in range(VALID_SET.shape[0])]
#[[91, 48, 41, 65, 64],
# [91, 48, 41, 65, 64],
# [91, 48, 41, 65, 64],
# ... ]

dummy_target = [[l] for l in VALID_SET["hotel_cluster"]]

print "Here is the dummy error of the dummy alg : %s"%ml.mapk(dummy_target, dummy_predictions, k=5)

Here is the dummy error of the dummy alg : 0.0696456268123


## Top cluster 

### Search destination id grouping

In [18]:
TRAIN_SET[["user_id","user_location_region","srch_destination_id","srch_destination_type_id","hotel_cluster"]].head()

Unnamed: 0,user_id,user_location_region,srch_destination_id,srch_destination_type_id,hotel_cluster
2394,10643,135,8268,1,18
2395,10643,135,8824,1,66
2396,10643,135,8824,1,30
2397,10643,135,12239,6,37
2398,10643,135,12654,5,71


In [19]:
GLOBAL_RANK_HOTEL = TRAIN_SET["hotel_cluster"].value_counts().index.tolist()
GLOBAL_RANK_HOTEL[:5]

[91, 41, 48, 64, 5]

In [20]:
topagg = {}

In [21]:
groups = TRAIN_SET.groupby(["srch_destination_id", "hotel_cluster"])

for name, group in groups:
    
    dest_id = name[0]
    cluster_id = name[1]
    
    clicks = len(group.is_booking[group.is_booking == False])
    bookings = len(group.is_booking[group.is_booking == True])
    
    score = bookings + .15 * clicks
    
    if dest_id not in topagg.keys():
        topagg[dest_id] = {}
    
    topagg[dest_id][cluster_id] = score

In [22]:
topagg[8]

{7: 0.15,
 32: 0.15,
 42: 0.6,
 43: 0.44999999999999996,
 48: 1.6,
 60: 0.15,
 76: 0.15,
 91: 0.3}

In [23]:
topagg[8].keys()

[32, 7, 42, 43, 76, 48, 91, 60]

In [24]:
import operator
[i[0] for i in sorted(topagg[8].items(), key=operator.itemgetter(1), reverse=True)]

[48, 42, 43, 91, 32, 7, 76, 60]

In [35]:
df_top_col=[]
for dest_id in sorted(topagg.keys()):
    df_top_col.append([i[0] for i in sorted(topagg[dest_id].items(), key=operator.itemgetter(1), reverse=True)])

In [36]:
df_top = pd.DataFrame({"srch_destination_id": sorted(topagg.keys()), "hotel_cluster_top":df_top_col})
df_top.head(5)

Unnamed: 0,hotel_cluster_top,srch_destination_id
0,"[25, 82, 30]",4
1,"[48, 42, 43, 91, 32, 7, 76, 60]",8
2,[51],11
3,[20],14
4,"[85, 5, 47]",16


In [47]:
def complet(x):
    n = 0
    while len(x) < 5:
        hc = GLOBAL_RANK_HOTEL[n]
        n = n + 1
        if hc not in x:
            x.append(hc)
    
    return x

In [53]:
def reformat(x):
    pattern = "%s "*5
    return pattern%tuple(x[:5])

In [54]:
df_top["hotel_cluster_top"] = df_top["hotel_cluster_top"].apply(complet)
df_top["hotel_cluster_top"] = df_top["hotel_cluster_top"].apply(lambda y: y[:5])
df_top["str_hotel_cluster"] = df_top["hotel_cluster_top"].apply(reformat)

In [56]:
df_top.head(5)

Unnamed: 0,hotel_cluster_top,srch_destination_id,str_hotel_cluster
0,"[25, 82, 30, 91, 41]",4,25 82 30 91 41
1,"[48, 42, 43, 91, 32]",8,48 42 43 91 32
2,"[51, 91, 41, 48, 64]",11,51 91 41 48 64
3,"[20, 91, 41, 48, 64]",14,20 91 41 48 64
4,"[85, 5, 47, 91, 41]",16,85 5 47 91 41


In [61]:
VALID_SET = VALID_SET.join(df_top, on="srch_destination_id", how="left", rsuffix="_")
VALID_SET["str_hotel_cluster"].fillna("91 41 48 64 5", inplace=True)

### Features engeneering

In [27]:
from sklearn.decomposition import PCA
pca = PCA(n_components=3)
dest_small = pca.fit_transform(DATA_DEST[["d{0}".format(i + 1) for i in range(149)]])
dest_small = pd.DataFrame(dest_small)
dest_small["srch_destination_id"] = DATA_DEST["srch_destination_id"]

In [28]:
dest_small.head(5)

Unnamed: 0,0,1,2,srch_destination_id
0,0.044268,-0.169419,-0.032522,0
1,0.440761,-0.077405,0.091572,1
2,-0.001033,-0.020677,-0.012108,2
3,0.480467,0.040345,0.01932,3
4,0.207253,0.042694,0.011744,4


In [29]:
def calc_fast_features(df):
    df["date_time"] = pd.to_datetime(df["date_time"])
    df["srch_ci"] = pd.to_datetime(df["srch_ci"], format='%Y-%m-%d', errors="coerce")
    df["srch_co"] = pd.to_datetime(df["srch_co"], format='%Y-%m-%d', errors="coerce")
    
    props = {}
    for prop in ["month", "day", "hour", "minute", "dayofweek", "quarter"]:
        props[prop] = getattr(df["date_time"].dt, prop)
    
    carryover = [p for p in df.columns if p not in ["date_time", "srch_ci", "srch_co"]]
    for prop in carryover:
        props[prop] = df[prop]
    
    date_props = ["month", "day", "dayofweek", "quarter"]
    for prop in date_props:
        props["ci_{0}".format(prop)] = getattr(df["srch_ci"].dt, prop)
        props["co_{0}".format(prop)] = getattr(df["srch_co"].dt, prop)
    props["stay_span"] = (df["srch_co"] - df["srch_ci"]).astype('timedelta64[h]')
        
    ret = pd.DataFrame(props)
    
    ret = ret.join(dest_small, on="srch_destination_id", how='left', rsuffix="dest")
    ret = ret.drop("srch_destination_iddest", axis=1)
    return ret

#df = calc_fast_features(t1)
#df.fillna(-1, inplace=True)

In [None]:
def dataClean(df):

    df["date_time"] = pd.to_datetime(df["date_time"])
    df["srch_ci"] = pd.to_datetime(df["srch_ci"], format="%Y-%m-%d", errors="coerce")
    df["srch_co"] = pd.to_datetime(df["srch_co"], format="%Y-%m-%d", errors="coerce")

    props = {}
    for prop in ["month", "day", "hour", "minute", "dayofweek", "quarter"]:
        props[prop] = getattr(df["date_time"].dt, prop)
    
    features = [p for p in df.columns if p not in ["date_time", "srch_ci", "srch_co"]]
    
    for prop in features:
        props[prop] = df[prop]
    
    date_props = ["month", "day", "dayofweek", "quarter"]
    for prop in date_props:
        props["ci_{0}".format(prop)] = getattr(df["srch_ci"].dt, prop)
        props["co_{0}".format(prop)] = getattr(df["srch_co"].dt, prop)
    props["stay_span"] = (df["srch_co"] - df["srch_ci"]).astype('timedelta64[h]')
        
    frame = pd.DataFrame(props)
    
    #frame = frame.join(dest_pca, on="srch_destination_id", how='left', rsuffix="dest")
    #frame = frame.drop("srch_destination_iddest", axis=1)    
    
    #frame = frame.drop(toDrop, axis=1)
    orig_dest_dist_median = frame["orig_destination_distance"].median()
    frame["orig_destination_distance"].fillna(orig_dest_dist_median, inplace=True)
    
    return frame

In [31]:
neo_df = calc_fast_features(TRAIN_SET)
neo_df.fillna(-1, inplace=True)
neo_df.head(5)

Unnamed: 0,channel,ci_day,ci_dayofweek,ci_month,ci_quarter,cnt,co_day,co_dayofweek,co_month,co_quarter,...,srch_rm_cnt,stay_span,user_id,user_location_city,user_location_country,user_location_region,year,0,1,2
2596,9,29,0,4,2,3,2,3,5,2,...,1,72,13459,1263,66,462,2013,-0.558787,0.302728,0.043319
2597,1,25,5,5,2,5,28,1,5,2,...,1,72,13459,1263,66,462,2013,-0.339104,0.246993,0.159985
2598,1,1,6,6,2,1,6,4,6,2,...,1,120,13459,1263,66,462,2014,-0.219883,0.139836,-0.162701
2599,1,1,6,6,2,1,6,4,6,2,...,1,120,13459,1263,66,462,2014,-0.219883,0.139836,-0.162701
2600,1,1,6,6,2,1,6,4,6,2,...,1,120,13459,1263,66,462,2014,-0.219883,0.139836,-0.162701


### Tree based methods

In [34]:
predictors = [c for c in neo_df.columns if c not in ["hotel_cluster"]]
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier

clf = RandomForestClassifier(n_estimators=10, min_weight_fraction_leaf=0.1)
scores = cross_validation.cross_val_score(clf, neo_df[predictors], neo_df['hotel_cluster'], cv=3)
scores

array([ 0.06770873,  0.06540906,  0.06761409])

In [36]:
from sklearn.cross_validation import KFold
from itertools import chain

In [37]:
help(chain.from_iterable)

Help on built-in function from_iterable:

from_iterable(...)
    chain.from_iterable(iterable) --> chain object
    
    Alternate chain() contructor taking a single iterable argument
    that evaluates lazily.



In [60]:
all_probs = []
unique_clusters = neo_df["hotel_cluster"].unique()
for cluster in unique_clusters:
    neo_df["target"] = 1
    neo_df["target"][neo_df["hotel_cluster"] != cluster] = 0
    predictors = [col for col in neo_df if col not in ['hotel_cluster', "target"]]
    probs = []
    cv = KFold(len(neo_df["target"]), n_folds=2)
    clf = RandomForestClassifier(n_estimators=10, min_weight_fraction_leaf=0.1)
    for i, (tr, te) in enumerate(cv):
        clf.fit(neo_df[predictors].iloc[tr], neo_df["target"].iloc[tr])
        preds = clf.predict_proba(neo_df[predictors].iloc[te])
        probs.append([p[1] for p in preds])
    full_probs = chain.from_iterable(probs)
    all_probs.append(list(full_probs))

prediction_frame = pd.DataFrame(all_probs).T
prediction_frame.columns = unique_clusters
def find_top_5(row):
    return list(row.nlargest(5).index)

preds = []
for index, row in prediction_frame.iterrows():
    preds.append(find_top_5(row))

ml.mapk([[l] for l in VALID_SET.iloc[:]["hotel_cluster"]], preds, k=5)

0.046297794861420195