In [None]:
import pandas as pd
import numpy as np
import zipfile
import os
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
import re
from wordcloud import WordCloud
from tqdm import tqdm
from sklearn.cross_validation import train_test_split
from datetime import datetime, date
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import coo_matrix
from scipy.sparse import hstack
from scipy.sparse import vstack
from scipy import sparse
from scipy.sparse import csr_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
from sklearn.metrics import make_scorer
from sklearn.grid_search import GridSearchCV
from sklearn.grid_search import RandomizedSearchCV
from sklearn.preprocessing import LabelBinarizer,LabelEncoder
from scipy.stats import randint as sp_randint
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import pickle
import joblib
from IPython.display import Image

In [None]:
# tokenisation

def tokens(x):
    """
    Split a comma-delimited string into its individual tokens.

    parameters:  x - string whose tokens are separated by ','

    returns: list of the substrings found between the commas
             (an empty string yields a list holding one empty string)
    """
    pieces = x.split(',')
    return pieces

In [None]:
# https://www.kaggle.com/davidgasquez/ndcg-scorer

def dcg_score(y_true, y_score, k=5):

    """Discounted cumulative gain (DCG) at rank K for a single sample.

    Parameters
    ----------
    y_true : array-like
        Relevance of every candidate (ndcg_score passes one binarised row).
    y_score : array-like
        Score of every candidate; higher scores are ranked first.
    k : int
        Number of top-ranked candidates that contribute to the sum.

    Returns
    -------
    score : float
    """

    # Indices of the k best-scored candidates, best first.
    top_k = np.argsort(y_score)[::-1][:k]
    relevance = np.take(y_true, top_k)

    # Rank r (0-based) is discounted by log2(r + 2), so rank 0 is divided by 1.
    ranks = np.arange(len(relevance))
    return np.sum((2 ** relevance - 1) / np.log2(ranks + 2))


def ndcg_score(ground_truth, predictions, k=5):

    """Normalized discounted cumulative gain (NDCG) at rank K.

    Normalized Discounted Cumulative Gain (NDCG) measures the performance of a
    recommendation system based on the graded relevance of the recommended
    entities. It varies from 0.0 to 1.0, with 1.0 representing the ideal
    ranking of the entities.

    Parameters
    ----------
    ground_truth : array-like, shape = [n_samples]
        Ground truth (true labels represented as integers).
    predictions : array-like, shape = [n_samples, n_classes]
        Predicted probabilities. Nested lists are accepted as well as arrays.
    k : int
        Rank.

    Returns
    -------
    score : float
        Mean of the per-sample NDCG values. A sample whose ideal DCG is zero
        (its binarised label row is all zeros) contributes 0.0.

    Example
    -------
    >>> ground_truth = [1, 0, 2]
    >>> predictions = [[0.15, 0.55, 0.2], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
    >>> score = ndcg_score(ground_truth, predictions, k=2)
    1.0
    >>> predictions = [[0.9, 0.5, 0.8], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
    >>> score = ndcg_score(ground_truth, predictions, k=2)
    0.6666666666
    """

    # The docstring example passes plain lists; without this conversion the
    # `.shape` access below raises AttributeError for them.
    predictions = np.asarray(predictions)

    lb = LabelBinarizer()
    # One class more than the number of prediction columns is registered.
    # NOTE(review): presumably this keeps LabelBinarizer out of its two-class
    # single-column output mode - confirm against the original Kaggle kernel.
    lb.fit(range(predictions.shape[1] + 1))
    T = lb.transform(ground_truth)

    scores = []

    # Compare each sample's DCG with the DCG of its ideal ranking.
    for y_true, y_score in zip(T, predictions):
        actual = dcg_score(y_true, y_score, k)
        best = dcg_score(y_true, y_true, k)
        # An all-zero label row has no ideal ranking; score it 0.0 instead of
        # dividing by zero.
        scores.append(float(actual) / float(best) if best else 0.0)

    return np.mean(scores)


# Scorer object wrapping ndcg_score: it is evaluated on predicted
# probabilities (needs_proba=True) and forwards k=5 to ndcg_score.
ndcg_scorer = make_scorer(
    ndcg_score,
    needs_proba=True,
    k=5,
)

In [None]:
def func_timestamp_first_active(timestamp):
    """
    Convert a YYYYMMDDHHMMSS value (e.g. 20140101000936) into a datetime.

    Input type:  Integer (anything whose str() has that digit layout)

    return type: Datetime
    """
    digits = str(timestamp)

    # Fixed-width slices; everything after the minutes is read as the seconds.
    year, month, day = int(digits[0:4]), int(digits[4:6]), int(digits[6:8])
    hour, minute, second = int(digits[8:10]), int(digits[10:12]), int(digits[12:])

    return datetime(year, month, day, hour, minute, second)

In [None]:
# median replacement with median age = 34.0

def age_median(age):
    """
    Replace an implausible age with the median age (34.0).

    parameters: age - numeric age

    returns : 34.0 when age is below 15 or above 100, otherwise age unchanged
              (NaN fails both comparisons and is therefore returned as-is)
    """
    is_outlier = age < 15.0 or age > 100.0
    return 34.0 if is_outlier else age

In [None]:
#creating age buckets

# Upper edges of the age buckets: 15, 20, ..., 105.
bins = list(range(15, 106, 5))

def age_interv(age):
    """
    Map an age to the index of its bucket in `bins`.

    parameters:  age - numeric age

    returns: index of the first edge in `bins` that is strictly greater than
             age, or None when no edge is greater (age >= 105, or NaN)
    """
    return next(
        (bucket for bucket, upper_edge in enumerate(bins) if age < upper_edge),
        None,
    )

In [None]:
# Label encoder for the target; `le` stays in the notebook namespace because
# func1 calls le.inverse_transform to turn predicted class indices into labels.
le = LabelEncoder()

# NOTE(review): `y` is not assigned in any cell above this one in the visible
# notebook (it is only loaded from y.npy in the following cell), so this line
# depends on earlier kernel state and would fail on Restart & Run All - confirm
# where the raw labels are meant to come from.
y = le.fit_transform(y)

y

array([11,  7,  7, ...,  7,  7,  7])

In [None]:
# loading from memory

# Directory that holds every persisted artefact read by this cell.
DATA_DIR = '/home/puneetchandna12/cs1'


def _load_pickle(file_name):
    """Unpickle DATA_DIR/file_name and close the file handle afterwards.

    NOTE(security): pickle.load can execute arbitrary code while loading;
    only point this at files produced by a trusted run.
    """
    with open(os.path.join(DATA_DIR, file_name), "rb") as fp:
        return pickle.load(fp)


# Label array.
y = np.load(os.path.join(DATA_DIR, 'y.npy'))

col_lst = _load_pickle("col_lst.txt")

# Indexed per entry of lst_ohe, with the device_type columns last (see func1).
lst_ohe_train = _load_pickle("lst_ohe_train.txt")

# NOTE(review): if these vectorizers were fitted with a notebook-level
# tokenizer such as `tokens`, that function must already be defined when they
# are unpickled - confirm.
vectorizer_action = _load_pickle("vectorizer_action.pickle")

vectorizer_action_type = _load_pickle("vectorizer_action_type.pickle")

vectorizer_action_detail = _load_pickle("vectorizer_action_detail.pickle")

session_df_concat = pd.read_pickle(os.path.join(DATA_DIR, 'session_df_concat.pickle'))

train_merge_raw = pd.read_pickle(os.path.join(DATA_DIR, 'train_merge_raw.pickle'))

# Fitted classifier used by func1 (clf.predict_proba).
clf = joblib.load(os.path.join(DATA_DIR, 'clf'))

In [None]:
#ohe

# Categorical columns that func1 one-hot encodes, in the order in which their
# dummy columns are appended to the feature frame.
lst_ohe = [
    'gender',
    'signup_method',
    'language',
    'affiliate_channel',
    'affiliate_provider',
    'first_affiliate_tracked',
    'signup_app',
    'first_device_type',
    'first_browser',
]

In [None]:
#raw data creation

# Take an independent copy of the first two rows: func1 fills and drops
# columns on the frame it receives, and a bare slice of train_merge_raw would
# leave it ambiguous whether those edits reach the source frame.
raw_df = train_merge_raw[:2].copy()

# Labels for the same two rows.
raw_label = y[:2]

In [None]:
# Display the two-row sample that is later passed to func1.
raw_df

Unnamed: 0,id,date_account_created,timestamp_first_active,date_first_booking,gender,age,signup_method,signup_flow,language,affiliate_channel,...,signup_app,first_device_type,first_browser,country_destination,user_id,action,action_type,action_detail,device_type,secs_elapsed
0,d1mm9tcy42,2014-01-01,20140101000936,2014-01-04,MALE,62.0,basic,0,en,sem-non-brand,...,Web,Windows Desktop,Chrome,other,d1mm9tcy42,"lookup,search_results,lookup,search_results,lo...",",click,,click,,click,,data,view,,click,,data,v...",",view_search_results,,view_search_results,,vie...","-unknown-,Windows Desktop",3427529.0
1,yo8nz8bqcq,2014-01-01,20140101001558,,-unknown-,34.0,basic,0,en,direct,...,Web,Mac Desktop,Firefox,NDF,yo8nz8bqcq,"dashboard,create,confirm_email,show,show_perso...","view,submit,click,view,data,view,data,data,","dashboard,create_user,confirm_email_link,p3,us...",Mac Desktop,207842.0


In [None]:
# Display the first two entries of y, i.e. the labels for the sample rows.
raw_label

array([11,  7])

In [None]:
# func1

def _align_dummy_columns(dummies, expected_columns):
    """Return `dummies` restricted to `expected_columns`, in that order.

    Columns that are expected but absent are added filled with 0; columns that
    are present but not expected are discarded by the final selection.
    """
    for column in expected_columns:
        if column not in dummies.columns:
            dummies[column] = 0
    return dummies[expected_columns]


def func1(data):

    """

    Function takes raw data, builds the model features and predicts.

    The input frame is copied first, so the caller's DataFrame is not modified
    and the function can be called repeatedly on the same input.

    Relies on objects defined in other notebook cells: lst_ohe, lst_ohe_train,
    vectorizer_action, vectorizer_action_type, vectorizer_action_detail, clf,
    le, func_timestamp_first_active, age_median and age_interv.

    parameters:  data - DataFrame holding the raw user columns (id,
                 date_account_created, timestamp_first_active,
                 date_first_booking, age, country_destination, the columns in
                 lst_ohe, ...) and the session columns (user_id, action,
                 action_type, action_detail, device_type, secs_elapsed)

    returns: (countries, pred)
             countries - one list per row with the 5 labels of highest
                         predicted probability, most probable first
             pred      - the array returned by clf.predict_proba

    """

    # Private copy: every step below rewrites or drops columns.
    data = data.copy()

    # Session fields may be missing; fill them with neutral defaults.
    session_defaults = (
        ('user_id', 'na'),
        ('action', 'na'),
        ('action_type', 'na'),
        ('action_detail', 'na'),
        ('device_type', 'na'),
        ('secs_elapsed', 0),
    )
    for column, default in session_defaults:
        # Plain assignment instead of chained `inplace=True`, which does not
        # update the frame under pandas Copy-on-Write.
        data[column] = data[column].fillna(default)

    # Calendar features of the account-creation date.
    data['date_account_created'] = pd.to_datetime(data['date_account_created'])
    data['date_account_created_day'] = data['date_account_created'].dt.weekday
    data['date_account_created_month'] = data['date_account_created'].dt.month
    data['date_account_created_year'] = data['date_account_created'].dt.year

    # Calendar features of the first-activity timestamp.
    data['timestamp_first_active'] = pd.to_datetime(
        data['timestamp_first_active'].apply(func_timestamp_first_active))
    data['timestamp_first_active_day'] = data['timestamp_first_active'].dt.weekday
    data['timestamp_first_active_month'] = data['timestamp_first_active'].dt.month
    data['timestamp_first_active_year'] = data['timestamp_first_active'].dt.year
    data['timestamp_first_active_hour'] = data['timestamp_first_active'].dt.hour

    # Outlier ages and missing ages both become 34.0, then the age is bucketed.
    data['age'] = data['age'].apply(age_median).fillna(34.0)
    data['age_interv'] = data['age'].apply(age_interv)

    data['first_affiliate_tracked'] = data['first_affiliate_tracked'].fillna('untracked')

    data = data.drop(['id', 'date_account_created', 'timestamp_first_active',
                      'date_first_booking', 'country_destination', 'user_id'], axis=1)

    # One-hot encode each categorical column, aligned to the column list saved
    # for it in lst_ohe_train, and append the result at the end of the frame.
    for i, column in enumerate(lst_ohe):
        dummies = pd.get_dummies(data[column], prefix=column)
        aligned = _align_dummy_columns(dummies, lst_ohe_train[i])
        data = pd.concat((data.drop([column], axis=1), aligned), axis=1)

    # device_type holds comma-separated values, so it gets multi-hot columns;
    # its saved column list is the last entry of lst_ohe_train.
    device_dummies = data['device_type'].str.get_dummies(sep=",")
    aligned = _align_dummy_columns(device_dummies, lst_ohe_train[-1])
    data = pd.concat((data.drop(['device_type'], axis=1), aligned), axis=1)

    # Vectorise the three action columns with the pre-fitted vectorizers.
    data_action_tfidf = vectorizer_action.transform(data['action'].values)
    data_action_type_tfidf = vectorizer_action_type.transform(data['action_type'].values)
    data_action_detail_tfidf = vectorizer_action_detail.transform(data['action_detail'].values)

    data = data.drop(['action', 'action_type', 'action_detail'], axis=1)

    # Convert the remaining columns to an explicit float64 sparse block rather
    # than handing the DataFrame itself to hstack.
    # NOTE(review): assumes every remaining column is numeric - confirm.
    dense_block = csr_matrix(data.astype(np.float64).values)

    data_tfidf = hstack((dense_block, data_action_tfidf, data_action_type_tfidf,
                         data_action_detail_tfidf)).tocsr()

    pred = clf.predict_proba(data_tfidf)

    # For every row keep the 5 highest-probability column indices, best first,
    # and map them to labels through le.
    # NOTE(review): assumes the column order of predict_proba matches the
    # integer codes of `le` - confirm.
    countries = [
        le.inverse_transform(np.argsort(row)[::-1][:5]).tolist()
        for row in pred
    ]

    return countries,pred

In [None]:
# func2

def func2(label,pred):
    """
    Score predictions against their labels with NDCG at rank 5.

    parameters:  label - true labels, forwarded to ndcg_score as ground_truth
                 pred  - predicted probabilities, forwarded as predictions

    returns: the value returned by ndcg_score(label, pred, 5)
    """
    return ndcg_score(label, pred, 5)

In [None]:
# Run the pipeline on the two-row sample: `count` receives the per-row lists
# of top-5 labels and `pred` the matrix returned by clf.predict_proba.
count,pred = func1(raw_df)

print(count,pred)

[['US', 'NDF', 'other', 'GB', 'IT'], ['NDF', 'US', 'other', 'FR', 'GB']] [[ 0.07227261  0.07268214  0.07244734  0.0727839   0.0736526   0.07426566
   0.07396927  0.11341434  0.07238723  0.07212908  0.12122008  0.10877571]
 [ 0.06988983  0.07025965  0.07004832  0.0703583   0.0706948   0.07053546
   0.07049852  0.2142804   0.07002889  0.06979483  0.08207586  0.07153518]]


In [None]:
# NDCG at rank 5 of the sample predictions against the sample labels.
score = func2(raw_label,pred)

score

0.75