In [2]:
import pandas as pd
import numpy as np
import zipfile
import os
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
import re
from wordcloud import WordCloud
from tqdm import tqdm
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from datetime import datetime, date
from sklearn.feature_extraction.text import TfidfVectorizer
from scipy.sparse import coo_matrix
from scipy.sparse import hstack
from scipy.sparse import vstack
from scipy import sparse
from scipy.sparse import csr_matrix
from sklearn.preprocessing import OneHotEncoder
from sklearn import metrics
from sklearn.metrics import make_scorer
from sklearn.preprocessing import LabelBinarizer,LabelEncoder
from scipy.stats import randint as sp_randint
from sklearn import linear_model
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
import pickle
import joblib
from IPython.display import Image

In [3]:
def load_sparse_csr(filename):
    
    """
    
    Function takes filename and returns csr matrix .
    
    parameters:  filename
    
    returns: csr matrix
    
    """
    
    loader = np.load(filename)
    return csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                      shape=loader['shape'])

In [4]:
train_merge_tfidf = load_sparse_csr('../data/train_merge_tfidf.npz')

test_merge_tfidf = load_sparse_csr('../data/test_merge_tfidf.npz')

y = np.load('../data/y.npy', allow_pickle=True)

with open("../data/col_lst.txt", "rb") as fp:
    
    col_lst = pickle.load(fp)
    
fp.close()

In [5]:
print("Final Data matrix")
print(train_merge_tfidf.shape)
print(test_merge_tfidf.shape)
print(y.shape)
print(len(col_lst))
print("="*100)

Final Data matrix
(73815, 523)
(62096, 523)
(73815,)
523


In [6]:
le = LabelEncoder()

y = le.fit_transform(y)

y

array([11,  7,  7, ...,  7,  7,  7])

In [7]:
# https://www.kaggle.com/davidgasquez/ndcg-scorer

def dcg_score(y_true, y_score, k=5):
    
    """Discounted cumulative gain (DCG) at rank K.

    Parameters
    ----------
    y_true : array, shape = [n_samples]
        Ground truth (true relevance labels).
    y_score : array, shape = [n_samples, n_classes]
        Predicted scores.
    k : int
        Rank.

    Returns
    -------
    score : float
    """
    
    order = np.argsort(y_score)[::-1]
    y_true = np.take(y_true, order[:k])

    gain = 2 ** y_true - 1

    discounts = np.log2(np.arange(len(y_true)) + 2)
    return np.sum(gain / discounts)


def ndcg_score(ground_truth, predictions, k=5):
    
    """Normalized discounted cumulative gain (NDCG) at rank K.

    Normalized Discounted Cumulative Gain (NDCG) measures the performance of a
    recommendation system based on the graded relevance of the recommended
    entities. It varies from 0.0 to 1.0, with 1.0 representing the ideal
    ranking of the entities.

    Parameters
    ----------
    ground_truth : array, shape = [n_samples]
        Ground truth (true labels represended as integers).
    predictions : array, shape = [n_samples, n_classes]
        Predicted probabilities.
    k : int
        Rank.

    Returns
    -------
    score : float

    Example
    -------
    >>> ground_truth = [1, 0, 2]
    >>> predictions = [[0.15, 0.55, 0.2], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
    >>> score = ndcg_score(ground_truth, predictions, k=2)
    1.0
    >>> predictions = [[0.9, 0.5, 0.8], [0.7, 0.2, 0.1], [0.06, 0.04, 0.9]]
    >>> score = ndcg_score(ground_truth, predictions, k=2)
    0.6666666666
    """
    
    lb = LabelBinarizer()
    lb.fit(range(predictions.shape[1] + 1))
    T = lb.transform(ground_truth)

    scores = []

    # Iterate over each y_true and compute the DCG score
    for y_true, y_score in zip(T, predictions):
        actual = dcg_score(y_true, y_score, k)
        best = dcg_score(y_true, y_true, k)
        score = float(actual) / float(best)
        scores.append(score)

    return np.mean(scores)


# NDCG Scorer function
ndcg_scorer = make_scorer(ndcg_score, needs_proba=True, k=5)

# Testing with lr

In [8]:
params =  {'C':[0.0001, 0.001, 0.01]}

lr = linear_model.LogisticRegression(multi_class="multinomial",solver="lbfgs")

clf = RandomizedSearchCV(lr, params, verbose=10, n_iter=3 ,n_jobs=-1,scoring=ndcg_scorer)

In [9]:
train_merge_tfidf = train_merge_tfidf[:10000]

In [10]:
train_merge_tfidf

<10000x523 sparse matrix of type '<class 'numpy.float64'>'
	with 482466 stored elements in Compressed Sparse Row format>

In [11]:
y = y[:10000]

In [12]:
y

array([11,  7,  7, ...,  7, 10,  7])

In [13]:
clf.fit(train_merge_tfidf,y)

Fitting 5 folds for each of 3 candidates, totalling 15 fits


In [14]:
pred_y = clf.predict_proba(train_merge_tfidf)

In [15]:
clf.best_params_

{'C': 0.0001}

In [16]:
y

array([11,  7,  7, ...,  7, 10,  7])

In [17]:
pred_y

array([[3.23165447e-005, 6.92595999e-005, 1.99035121e-006, ...,
        0.00000000e+000, 3.36127854e-001, 5.48935407e-002],
       [5.77556181e-002, 5.98021666e-002, 4.77098614e-002, ...,
        1.08157180e-023, 1.19897760e-001, 8.25321900e-002],
       [9.04201321e-003, 1.15454040e-002, 3.53541585e-003, ...,
        1.65935567e-121, 2.18484932e-001, 9.93647568e-002],
       ...,
       [7.28324938e-002, 7.19712121e-002, 7.10799660e-002, ...,
        5.73478348e-002, 8.74861939e-002, 6.60502134e-002],
       [3.70166839e-003, 5.15289028e-003, 1.06342757e-003, ...,
        7.16385675e-162, 2.45938245e-001, 9.42876307e-002],
       [3.10449510e-003, 4.39408500e-003, 8.40465551e-004, ...,
        1.20072270e-169, 2.50622444e-001, 9.29741995e-002]])

In [18]:
#Train ndcg score

s = ndcg_score(y, pred_y, k=5)

s

0.8198739963386653

# Testing with RF

In [67]:
#https://stackoverflow.com/questions/53782169/random-forest-tuning-with-randomizedsearchcv

params = {
'n_estimators' : [200, 700, 800, 1000, 1200],
'max_depth' : [15, 20, 25, 30, 35, 50],
'min_samples_split' : [2, 3, 5, 8],
'min_samples_leaf' : [1, 2, 5, 10] }

rf = RandomForestClassifier()

clf = RandomizedSearchCV(rf, params, verbose=10,n_jobs=-1,scoring=ndcg_scorer)

In [None]:
clf.fit(train_merge_tfidf,y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


In [None]:
pred_y = clf.predict_proba(train_merge_tfidf)

In [None]:
clf.best_params_

In [None]:
#Train ndcg score

s = ndcg_score(y, pred_y, k=5)

s

# Testing with Xgboost

In [8]:
param_grid = {
'max_depth': sp_randint(3, 20),
'learning_rate': [0.001, 0.01, 0.1, 0.2],
'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0],
'min_child_weight': [0.25,0.5, 1.0, 3.0, 5.0, 7.0],
'gamma': [0, 0.25, 0.3,0.35,0.45,0.5,0.6,0.8,1.0],
'reg_lambda': [0.1,0.2,0.4,0.5,0.6,0.8,1.0,10.0],
'n_estimators':[100,200,500,1000,2000],
'colsample_bytree':[0.1,0.3,0.5,1],
'colsample_bylevel':[0.1,0.3,0.5,1]
}


gb = xgb.XGBClassifier(objective='multi:softmax',eval_metric= 'mlogloss')

clf = RandomizedSearchCV(gb, param_grid,n_jobs=-1,verbose=10,scoring=ndcg_scorer, random_state=42)

In [10]:
clf.fit(train_merge_tfidf,y)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


ValueError: 
All the 50 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
40 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\ayoub\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\ayoub\anaconda3\lib\site-packages\xgboost\core.py", line 620, in inner_f
    return func(**kwargs)
  File "C:\Users\ayoub\anaconda3\lib\site-packages\xgboost\sklearn.py", line 1440, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [ 0  1  2  3  4  5  6  7  8  9 10], got [ 0  1  2  3  4  5  6  7  8 10 11]

--------------------------------------------------------------------------------
10 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\ayoub\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\ayoub\anaconda3\lib\site-packages\xgboost\core.py", line 620, in inner_f
    return func(**kwargs)
  File "C:\Users\ayoub\anaconda3\lib\site-packages\xgboost\sklearn.py", line 1440, in fit
    raise ValueError(
ValueError: Invalid classes inferred from unique values of `y`.  Expected: [0 1 2 3 4 5 6 7 8 9], got [ 0  1  2  3  4  5  6  7 10 11]


In [None]:
pred_y = clf.predict_proba(train_merge_tfidf)

In [None]:
# Train ndcg score

s = ndcg_score(y, pred_y, k=5)

s

In [14]:
#feature importance

features = col_lst
importances = clf.feature_importances_
indices = (np.argsort(importances))[-25:]
plt.figure(figsize=(10,12))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='r', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()

AttributeError: 'RandomizedSearchCV' object has no attribute 'feature_importances_'

In [None]:
# https://stackoverflow.com/questions/43691380/how-to-save-load-xgboost-model

#save model

joblib.dump(clf,'../data/clf') 

In [None]:
#load saved model

clf = joblib.load('../data/clf')

# Select top 80% of the features

In [None]:
top_80 = int(len(clf.feature_importances_)*0.8)

indices = (np.argsort(importances))[::-1][:top_80]

In [None]:
# https://stackoverflow.com/questions/48099075/how-to-get-columns-from-big-sparse-csc-matrix

cols = []

for i in indices:
    
    cols.append(train_merge_tfidf[:,i])
    
train_merge_tfidf_new = hstack(cols)

cols = []

for i in indices:
    
    cols.append(test_merge_tfidf[:,i])
    
test_merge_tfidf_new = hstack(cols)

col_lst_new = []

for i in indices:
    
     col_lst_new.append(col_lst[i])

In [None]:
print("Final Data matrix")
print(train_merge_tfidf_new.shape)
print(test_merge_tfidf_new.shape)
print(y.shape)
print(len(col_lst_new))
print("="*100)

# Train on top 80% data

In [None]:
clf.fit(train_merge_tfidf_new,y)

In [None]:
pred_y = clf.predict_proba(train_merge_tfidf_new)

In [None]:
# Train ndcg score

s = ndcg_score(y, pred_y, k=5)

s

In [None]:
#feature importance

features = col_lst_new
importances = clf.feature_importances_
indices = (np.argsort(importances))[-25:]
plt.figure(figsize=(10,12))
plt.title('Feature Importances')
plt.barh(range(len(indices)), importances[indices], color='r', align='center')
plt.yticks(range(len(indices)), [features[i] for i in indices])
plt.xlabel('Relative Importance')
plt.show()

# Best score = 13% of the Leaderboard

In [None]:
#test csv

test_df = pd.read_csv('../data/test_users.csv')

test_id = test_df['id'].values

test_id

In [None]:
# https://www.kaggle.com/kevinwu06/feature-importance-w-xgboost

pred = clf.predict_proba(test_merge_tfidf)

ids = []
countries = []

# Taking the 5 classes with highest probabilities

for i in range(len(test_id)):
    idx = test_id[i]
    ids += [idx] * 5
    countries += le.inverse_transform(np.argsort(pred[i])[::-1][:5]).tolist()
    
# Generate submission

sub = pd.DataFrame({"id" : ids,"country" : countries})

sub.to_csv('../data/kag_sub.csv', index = False)

In [None]:
Image(filename='../data/score.PNG')

# Comparison with other submitted notebooks ordered by best score

1) https://www.kaggle.com/zhugds/test-script - score = 0.87008

== This user doesnt make use of the sessions data, instead uses the entire train data.

== But from our model, we know secs_elapsed and other actions are among the most imp features.

== Thus, we get a better score.

2) https://www.kaggle.com/wallinm1/script-0-1 - score = 0.86987

== This user also doesnt make use of the sessions data, instead uses the entire train data.

== But from our model, we know secs_elapsed and other actions are among the most imp features.

== Thus, we get a better score.

3) https://www.kaggle.com/kapetis/script-0-1 - score = 0.86987

== This user also doesnt make use of the sessions data, instead uses the entire train data.

== But from our model, we know secs_elapsed and other actions are among the most imp features.

== Thus, we get a better score.

4) https://www.kaggle.com/foutik/script-cleaning-data - score = 0.86969

== This user also doesnt make use of the sessions data, instead uses the entire train data.

== But from our model, we know secs_elapsed and other actions are among the most imp features.

== Thus, we get a better score.

5) https://www.kaggle.com/michaelpawlus/xgb-feature-exploration - score = 0.85655

== This user also doesnt make use of the sessions data, instead uses the entire train data.

== But from our model, we know secs_elapsed and other actions are among the most imp features.

== Thus, we get a better score.