In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
import os
import re
import time
from sklearn.metrics import confusion_matrix
from sklearn.metrics.classification import log_loss
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from collections import Counter
from scipy.sparse import hstack
from scipy.sparse import csr_matrix
import warnings
warnings.filterwarnings("ignore")
from datetime import datetime
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.calibration import CalibratedClassifierCV
from xgboost import XGBClassifier
from sklearn.model_selection import RandomizedSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.externals import joblib
from datetime import datetime

In [2]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, BatchNormalization,Input,PReLU
from keras.utils import np_utils
from keras.optimizers import Adam
from keras.models import Model
from keras.optimizers import Adagrad

Using TensorFlow backend.


In [3]:
def model_1_1(input_shape, output_dim=12):
    """Build and compile a two-hidden-layer softmax classifier (256 -> 64 -> output_dim).

    Each hidden block is Dense -> PReLU -> BatchNormalization -> Dropout(0.5).

    Parameters
    ----------
    input_shape : int
        Number of input features. Despite the name this is a scalar: it is
        forwarded as ``input_dim`` of the first Dense layer.
    output_dim : int, optional
        Number of units in the final softmax layer. Defaults to 12, the value
        that used to be hard-coded, so existing single-argument calls build
        the same network.

    Returns
    -------
    Sequential
        Model compiled with categorical cross-entropy, the 'adam' optimizer
        and accuracy as the reported metric.
    """
    model = Sequential()
    model.add(Dense(256, input_dim=input_shape))
    model.add(PReLU())
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    model.add(Dense(64))
    model.add(PReLU())
    model.add(BatchNormalization())
    model.add(Dropout(0.5))
    # Output width is a parameter so this builder matches its siblings'
    # (input_dim, output_dim) convention.
    model.add(Dense(output_dim))
    model.add(Activation('softmax'))
    model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])
    return model

In [4]:
def create_model_1_2(input_dim,output_dim, learRate=0.0025):
    """Build and compile a single-hidden-layer softmax classifier.

    Architecture: Dense(500) -> PReLU -> Dropout(0.82) -> Dense(output_dim) -> softmax.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    output_dim : int
        Number of softmax output units.
    learRate : float, optional
        Learning rate handed to the Adagrad optimizer (default 0.0025).

    Returns
    -------
    Sequential
        Model compiled with categorical cross-entropy and accuracy metric.
    """
    # Layers are listed in forward order; Sequential adds them one by one.
    layer_stack = [
        Dense(500, input_shape=(input_dim,), init='uniform'),
        PReLU(init='zero'),
        Dropout(0.82),
        Dense(output_dim, init='uniform'),
        Activation('softmax'),
    ]
    network = Sequential(layer_stack)
    network.compile(
        optimizer=Adagrad(lr=learRate, epsilon=1e-08),
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return network

In [5]:
def model_2_1(input_dim,output_dim):
    """Build and compile a three-hidden-layer softmax classifier with input dropout.

    Architecture: Dropout(0.15) on the inputs, then Dense(240), Dense(240) and
    Dense(260) hidden layers, each followed by PReLU and Dropout (0.8, 0.35,
    0.40 respectively), then a Dense(output_dim) softmax layer.

    Parameters
    ----------
    input_dim : int
        Number of input features.
    output_dim : int
        Number of softmax output units.

    Returns
    -------
    Sequential
        Model compiled with categorical cross-entropy, Adagrad(lr=0.008) and
        accuracy metric.
    """
    forward_layers = (
        Dropout(0.15, input_shape=(input_dim,)),
        # hidden block 1
        Dense(240, init='uniform'),
        PReLU(init='zero'),
        Dropout(0.8),
        # hidden block 2
        Dense(240, init='uniform'),
        PReLU(init='zero', weights=None),
        Dropout(0.35),
        # hidden block 3
        Dense(260, init='uniform'),
        PReLU(init='zero', weights=None),
        Dropout(0.40),
        # classifier head
        Dense(output_dim, init='uniform'),
        Activation('softmax'),
    )

    network = Sequential()
    for layer in forward_layers:
        network.add(layer)

    network.compile(
        loss='categorical_crossentropy',
        optimizer=Adagrad(lr=0.008, epsilon=1e-08),
        metrics=['accuracy'],
    )
    return network

In [6]:
def model_2_2(input_dim,output_dim):
    """Build and compile a small two-hidden-layer softmax classifier.

    Architecture: Dropout(0.4) on the inputs -> Dense(75) -> PReLU ->
    Dropout(0.30) -> Dense(50, tanh) -> PReLU -> Dropout(0.20) ->
    Dense(output_dim, softmax).

    Parameters
    ----------
    input_dim : int
        Number of input features.
    output_dim : int
        Number of softmax output units.

    Returns
    -------
    Sequential
        Model compiled with categorical cross-entropy, the 'adadelta'
        optimizer and accuracy metric.
    """
    network = Sequential([
        Dropout(0.4, input_shape=(input_dim,)),
        Dense(75),
        PReLU(),
        Dropout(0.30),
        # The second hidden layer applies tanh and is then followed by PReLU.
        Dense(50, init='normal', activation='tanh'),
        PReLU(),
        Dropout(0.20),
        Dense(output_dim, init='normal', activation='softmax'),
    ])
    network.compile(
        loss='categorical_crossentropy',
        optimizer='adadelta',
        metrics=['accuracy'],
    )
    return network

In [15]:
# Training and test device tables.
train_data = pd.read_csv('gender_age_train.csv')
test_data = pd.read_csv('gender_age_test.csv')

# Event log, indexed by event_id, with the timestamp column parsed on load.
events = pd.read_csv(
    'events.csv',
    index_col='event_id',
    parse_dates=['timestamp'],
)

# Phone table: keep only the first row per device_id, then index by device_id.
phone_data = pd.read_csv('phone_brand_device_model.csv')
phone_data = phone_data.drop_duplicates('device_id', keep='first')
phone_data = phone_data.set_index('device_id')

# Only three columns of app_events are loaded; is_active is read as bool.
app_events = pd.read_csv(
    'app_events.csv',
    usecols=['event_id', 'app_id', 'is_active'],
    dtype={'is_active': bool},
)
app_labels = pd.read_csv('app_labels.csv')
label_categories = pd.read_csv('label_categories.csv')

In [16]:
def _join_as_tokens(values):
    """Join values into one space-separated string, prefixing each with '0'.

    NOTE(review): the '0' prefix presumably keeps single-digit values at least
    two characters long for the vectorizers' default tokenizer - confirm.
    """
    return " ".join('0'+str(s) for s in values)


def _indicator_matrix(rows, cols, shape=None):
    """Return a CSR matrix holding one 1.0 entry per (row, col) pair.

    With ``shape=None`` scipy infers the dimensions from the index arrays.
    """
    return csr_matrix((np.ones(len(rows)), (rows, cols)), shape=shape)


def _fit_on_train(transformer, train_values, test_values):
    """Fit ``transformer`` on the train values only, then transform train and test."""
    transformer.fit(train_values)
    return transformer.transform(train_values), transformer.transform(test_values)


def _load_ensemble(build_model, weight_paths):
    """Build one network per weight file with ``build_model()`` and load its weights."""
    models = []
    for path in weight_paths:
        model = build_model()
        model.load_weights(path)
        models.append(model)
    return models


def _average_predictions(models, features, n_classes):
    """Return the mean of ``predict_proba(features)`` over ``models``.

    The sum is accumulated in a float64 array created by ``np.zeros``.
    """
    total = np.zeros((features.shape[0], n_classes))
    for model in models:
        total += model.predict_proba(features)
    total /= len(models)
    return total


def ML_pipeline(train_data, test_data,events,phone_data,app_events,app_labels,label_categories):
    """Prepare features, score the saved models and return ensembled test predictions.

    Devices are split into those that appear in ``events`` and those that do
    not; each subset gets its own feature matrix and its own model ensemble.
    A stratified train/CV split of each training subset is scored only to
    print log-loss figures; no model is trained here.

    Parameters
    ----------
    train_data : DataFrame
        Reads columns 'device_id', 'group', 'age', 'gender'.
    test_data : DataFrame
        Reads column 'device_id'.
    events : DataFrame
        Indexed by event id; reads 'device_id', 'timestamp' (must be a
        datetime column, the ``.dt`` accessor is used), 'latitude', 'longitude'.
    phone_data : DataFrame
        Indexed by device id; reads 'phone_brand' and 'device_model'.
    app_events : DataFrame
        Reads 'event_id', 'app_id', 'is_active'.
    app_labels : DataFrame
        Reads 'app_id', 'label_id'.
    label_categories : DataFrame
        Accepted for interface compatibility; not used.

    Returns
    -------
    (Test_Prediction_1, Test_Prediction_2)
        Weighted-average class probabilities for test devices without events
        and with events respectively. Rows follow the order of the devices
        within each subset of ``test_data``.

    Side effects
    ------------
    Adds columns to the caller's frames: 'Class' on ``train_data``,
    'has_events' on ``test_data``, 'brand'/'model_brand'/'model' on
    ``phone_data``, 'app' on ``app_events`` and
    'hour'/'hourbin'/'day'/'apps_active' on ``events``. Loads model files from
    the 'Saved_Models/' directory and prints progress and log-loss values.
    """
    start=datetime.now()
    n_classes=12  # width of every prediction matrix and of the networks' output

    # ------------------------------------------------------------------
    # Data preparation
    # ------------------------------------------------------------------
    print("Preparing Data......")
    class_encoder=LabelEncoder()
    train_data['Class']=class_encoder.fit_transform(train_data['group'])
    train_data=train_data.drop(['age','gender','group'],axis=1)
    train_data['has_events']=np.in1d(train_data['device_id'].values,events['device_id'].values)
    test_data['has_events']=np.in1d(test_data['device_id'].values,events['device_id'].values)

    train_has_events=train_data['has_events']
    test_has_events=test_data['has_events']
    events_train_data=train_data.loc[train_has_events].drop(['has_events'],axis=1).set_index('device_id')
    events_test_data=test_data.loc[test_has_events].drop(['has_events'],axis=1).set_index('device_id')
    noevents_train_data=train_data.loc[~train_has_events].set_index('device_id')
    noevents_test_data=test_data.loc[~test_has_events].set_index('device_id')

    # Positional row numbers used as row indices of the sparse matrices below.
    events_train_data['trainrow']=np.arange(events_train_data.shape[0])
    events_test_data['testrow']=np.arange(events_test_data.shape[0])
    noevents_train_data['trainrow']=np.arange(noevents_train_data.shape[0])
    noevents_test_data['testrow']=np.arange(noevents_test_data.shape[0])

    brand_encoder = LabelEncoder().fit(phone_data['phone_brand'])
    phone_data['brand'] = brand_encoder.transform(phone_data['phone_brand'])
    nbrands=len(brand_encoder.classes_)

    # Brand+model concatenation: used as the model code for devices with events.
    concat_model = phone_data['phone_brand'].str.cat(phone_data['device_model'])
    model_encoder=LabelEncoder().fit(concat_model)
    phone_data['model_brand']=model_encoder.transform(concat_model)
    nmodels=len(model_encoder.classes_)

    # Device model alone: used as the model code for devices without events.
    device_model_encoder=LabelEncoder().fit(phone_data['device_model'])
    phone_data['model']=device_model_encoder.transform(phone_data['device_model'])

    # Assignments align on the device_id index shared with phone_data.
    for subset in (events_train_data, events_test_data):
        subset['phone_brand']=phone_data['brand']
        subset['phone_model']=phone_data['model_brand']
    for subset in (noevents_train_data, noevents_test_data):
        subset['phone_brand']=phone_data['brand']
        subset['phone_model']=phone_data['model']

    app_encoder = LabelEncoder().fit(app_events['app_id'])
    app_events['app'] = app_encoder.transform(app_events['app_id'])
    napps = len(app_encoder.classes_)
    # One row per (device, app) pair, tagged with the device's train/test row.
    deviceapps = (app_events.merge(events[['device_id']], how='left',left_on='event_id',right_index=True)
                       .groupby(['device_id','app'])['app'].agg(['size'])
                       .merge(events_train_data[['trainrow']], how='left', left_index=True, right_index=True)
                       .merge(events_test_data[['testrow']], how='left', left_index=True, right_index=True)
                       .reset_index())

    app_labels = app_labels.loc[app_labels['app_id'].isin(app_events['app_id'].unique())]
    app_labels['app'] = app_encoder.transform(app_labels['app_id'])
    labelencoder = LabelEncoder().fit(app_labels['label_id'])
    app_labels['label'] = labelencoder.transform(app_labels['label_id'])
    nlabels = len(labelencoder.classes_)

    # One row per (device, label) pair, tagged the same way.
    devicelabels = (deviceapps[['device_id','app']]
                .merge(app_labels[['app','label']])
                .groupby(['device_id','label'])['app'].agg(['size'])
                .merge(events_train_data[['trainrow']], how='left', left_index=True, right_index=True)
                .merge(events_test_data[['testrow']], how='left', left_index=True, right_index=True)
                .reset_index())

    # 'timestamp' is already datetime (day_name() below relies on it), so the
    # vectorised accessor replaces a per-row pd.to_datetime call.
    events['hour'] = events['timestamp'].dt.hour
    hour = events['hour']
    # Bins: 1 = hours 1-6, 2 = 7-12, 3 = 13-18, 4 = everything else (0 and 19-23).
    events['hourbin'] = np.select(
        [(hour>=1)&(hour<=6), (hour>=7)&(hour<=12), (hour>=13)&(hour<=18)],
        [1, 2, 3],
        default=4)
    hourevents = events.groupby("device_id")["hour"].apply(_join_as_tokens)
    hourbinevents = events.groupby("device_id")["hourbin"].apply(_join_as_tokens)

    days_of_week=events['timestamp'].dt.day_name()
    events['day']=days_of_week.map({'Sunday':0,'Monday':1,'Tuesday':2,'Wednesday':3,'Thursday':4,'Friday':5,'Saturday':6})
    daysevents = events.groupby("device_id")["day"].apply(_join_as_tokens)

    lat_events = events.groupby("device_id")["latitude"].apply(lambda x: np.median([float(s) for s in x]))
    long_events = events.groupby("device_id")["longitude"].apply(lambda x: np.median([float(s) for s in x]))

    # Per event: the is_active flags of its apps as text; per device: those
    # strings joined, skipping events that have no app_events rows ('nan').
    appsactive = app_events.groupby("event_id")["is_active"].apply(lambda x: " ".join(str(s) for s in x))
    events["apps_active"] = events.index.map(appsactive)
    events_apps_active_map = events.groupby("device_id")["apps_active"].apply(lambda x: " ".join(str(s) for s in x if str(s)!='nan'))

    per_device_columns = (
        ('event_hours', hourevents),
        ('event_hours_bins', hourbinevents),
        ('event_day', daysevents),
        ('event_med_lat', lat_events),
        ('event_med_long', long_events),
        ('apps_active', events_apps_active_map),
    )
    for column, per_device in per_device_columns:
        events_train_data[column]=events_train_data.index.map(per_device)
        events_test_data[column]=events_test_data.index.map(per_device)
    print("Data Preparation Complete Time Taken: ",datetime.now()-start)

    # ------------------------------------------------------------------
    # Feature engineering
    # ------------------------------------------------------------------
    print("Preparing Features......")
    # Devices without events: brand and model indicators only.
    # NOTE(review): no explicit shape is given, so scipy infers each width
    # from the largest code present in that subset; train and test widths
    # match only if both contain the same maximum code. Left as-is because
    # the saved models' input width depends on it.
    Xtr_noevents_brand = _indicator_matrix(noevents_train_data.trainrow, noevents_train_data.phone_brand)
    Xte_noevents_brand = _indicator_matrix(noevents_test_data.testrow, noevents_test_data.phone_brand)
    Xtr_noevents_model = _indicator_matrix(noevents_train_data.trainrow, noevents_train_data.phone_model)
    Xte_noevents_model = _indicator_matrix(noevents_test_data.testrow, noevents_test_data.phone_model)
    X_train_noevents_one_hot=hstack((Xtr_noevents_brand,Xtr_noevents_model),format='csr')
    X_test_noevents_one_hot=hstack((Xte_noevents_brand,Xte_noevents_model),format='csr')

    # Devices with events: explicit shapes keep train and test widths equal.
    n_events_train=events_train_data.shape[0]
    n_events_test=events_test_data.shape[0]
    Xtr_events_brand = _indicator_matrix(events_train_data.trainrow, events_train_data.phone_brand,
                                         shape=(n_events_train,nbrands))
    Xte_events_brand = _indicator_matrix(events_test_data.testrow, events_test_data.phone_brand,
                                         shape=(n_events_test,nbrands))
    Xtr_events_model = _indicator_matrix(events_train_data.trainrow, events_train_data.phone_model,
                                         shape=(n_events_train,nmodels))
    Xte_events_model = _indicator_matrix(events_test_data.testrow, events_test_data.phone_model,
                                         shape=(n_events_test,nmodels))

    # dropna keeps only the pairs whose device belongs to the given subset.
    d = deviceapps.dropna(subset=['trainrow'])
    Xtr_events_app = _indicator_matrix(d.trainrow, d.app, shape=(n_events_train,napps))
    d = deviceapps.dropna(subset=['testrow'])
    Xte_events_app = _indicator_matrix(d.testrow, d.app, shape=(n_events_test,napps))
    d = devicelabels.dropna(subset=['trainrow'])
    Xtr_events_label = _indicator_matrix(d.trainrow, d.label, shape=(n_events_train,nlabels))
    d = devicelabels.dropna(subset=['testrow'])
    Xte_events_label = _indicator_matrix(d.testrow, d.label, shape=(n_events_test,nlabels))

    # Text-like columns: every vectorizer is fitted on the train subset only.
    X_tr_event_hours_one_hot, X_te_event_hours_one_hot = _fit_on_train(
        TfidfVectorizer(),
        events_train_data['event_hours'].values,
        events_test_data['event_hours'].values)
    X_tr_event_hours_bins_one_hot, X_te_event_hours_bins_one_hot = _fit_on_train(
        CountVectorizer(binary=True),
        events_train_data['event_hours_bins'].values,
        events_test_data['event_hours_bins'].values)
    X_tr_event_day_one_hot, X_te_event_day_one_hot = _fit_on_train(
        TfidfVectorizer(),
        events_train_data['event_day'].values,
        events_test_data['event_day'].values)

    # Median coordinates: standardised with train-subset statistics.
    X_tr_event_med_lat_scaled, X_te_event_med_lat_scaled = _fit_on_train(
        StandardScaler(),
        events_train_data['event_med_lat'].values.reshape(-1,1),
        events_test_data['event_med_lat'].values.reshape(-1,1))
    X_tr_event_med_long_scaled, X_te_event_med_long_scaled = _fit_on_train(
        StandardScaler(),
        events_train_data['event_med_long'].values.reshape(-1,1),
        events_test_data['event_med_long'].values.reshape(-1,1))

    X_tr_apps_active_one_hot, X_te_apps_active_one_hot = _fit_on_train(
        TfidfVectorizer(),
        events_train_data['apps_active'].values,
        events_test_data['apps_active'].values)

    # Column order must stay fixed: the saved weights were fitted against it.
    X_train_events_one_hot_1=hstack((Xtr_events_brand,Xtr_events_model,Xtr_events_label,
                                     X_tr_event_hours_one_hot,X_tr_event_hours_bins_one_hot,
                                     X_tr_event_day_one_hot,X_tr_event_med_lat_scaled,
                                     X_tr_event_med_long_scaled,Xtr_events_app,
                                     X_tr_apps_active_one_hot),format='csr')
    X_test_events_one_hot_1=hstack((Xte_events_brand,Xte_events_model,Xte_events_label,
                                    X_te_event_hours_one_hot,X_te_event_hours_bins_one_hot,
                                    X_te_event_day_one_hot,X_te_event_med_lat_scaled,
                                    X_te_event_med_long_scaled,Xte_events_app,
                                    X_te_apps_active_one_hot),format='csr')

    print("Feature Preparation Done Time Taken: ",datetime.now()-start)

    # ------------------------------------------------------------------
    # Scoring the saved models
    # ------------------------------------------------------------------
    print("Predicting Output......")
    y_data=noevents_train_data['Class'].values
    train_1, cv_1, y_train_1, y_cv_1 = train_test_split(X_train_noevents_one_hot, y_data,stratify=y_data,test_size=0.15,random_state=18)
    test_1=X_test_noevents_one_hot

    # Saved calibrated logistic regression (joblib unpickles the file, so it
    # must come from a trusted source).
    lr_model=joblib.load('Saved_Models/no_events_calibrated_logistic_regression.sav')
    lr_no_events_train_prediction=lr_model.predict_proba(train_1)
    lr_no_events_cv_prediction=lr_model.predict_proba(cv_1)
    lr_no_events_test_prediction=lr_model.predict_proba(test_1)

    # Five saved model_1_1 networks, averaged.
    model_list_1=_load_ensemble(
        lambda: model_1_1(train_1.shape[1]),
        ['Saved_Models/No_Events/Neural_Network_1/Model_1_1_'+str(i+1)+'.h5' for i in range(5)])
    train_pred_avg_1_1=_average_predictions(model_list_1, train_1, n_classes)
    cv_pred_avg_1_1=_average_predictions(model_list_1, cv_1, n_classes)
    test_pred_avg_1_1=_average_predictions(model_list_1, test_1, n_classes)

    # Single saved model_1_2 network.
    model_1_2=create_model_1_2(train_1.shape[1],n_classes)
    model_1_2.load_weights('Saved_Models/No_Events/Model_1_2.h5')
    train_pred_1_2=model_1_2.predict_proba(train_1)
    cv_pred_1_2=model_1_2.predict_proba(cv_1)
    test_pred_1_2=model_1_2.predict_proba(test_1)

    y_data_events=events_train_data['Class'].values
    train_2, cv_2, y_train_2, y_cv_2 = train_test_split(X_train_events_one_hot_1, y_data_events,stratify=y_data_events,test_size=0.2,random_state=9)
    test_2=X_test_events_one_hot_1

    # Twenty saved model_2_1 networks, averaged.
    model_list_2=_load_ensemble(
        lambda: model_2_1(train_2.shape[1],n_classes),
        ['Saved_Models/Events/Neural_Network_1/Model_2_1_'+str(i+1)+'.h5' for i in range(20)])
    train_pred_avg_2_1=_average_predictions(model_list_2, train_2, n_classes)
    cv_pred_avg_2_1=_average_predictions(model_list_2, cv_2, n_classes)
    test_pred_avg_2_1=_average_predictions(model_list_2, test_2, n_classes)

    # Twenty saved model_2_2 networks, averaged.
    model_list_3=_load_ensemble(
        lambda: model_2_2(train_2.shape[1],n_classes),
        ['Saved_Models/Events/Neural_Network_2/Model_2_2_'+str(i+1)+'.h5' for i in range(20)])
    train_pred_avg_2_2=_average_predictions(model_list_3, train_2, n_classes)
    cv_pred_avg_2_2=_average_predictions(model_list_3, cv_2, n_classes)
    test_pred_avg_2_2=_average_predictions(model_list_3, test_2, n_classes)

    print("Models Predictions Done Time Taken:",datetime.now()-start)

    # ------------------------------------------------------------------
    # Ensembling
    # ------------------------------------------------------------------
    print("Ensembling Models......")
    # No-events blend: logistic regression, model_1_1 average, model_1_2.
    w1_1=0.15
    w1_2=0.75
    w1_3=0.1

    # Events blend: model_2_1 average, model_2_2 average.
    w2_1=0.5
    w2_2=0.5

    # Ensembling: weighted average of the individual model predictions.
    train_prediction_1=(w1_1*lr_no_events_train_prediction)+(w1_2*train_pred_avg_1_1)+(w1_3*train_pred_1_2)
    cv_prediction_1=(w1_1*lr_no_events_cv_prediction)+(w1_2*cv_pred_avg_1_1)+(w1_3*cv_pred_1_2)
    train_prediction_2=(w2_1*train_pred_avg_2_1)+(w2_2*train_pred_avg_2_2)
    cv_prediction_2=(w2_1*cv_pred_avg_2_1)+(w2_2*cv_pred_avg_2_2)
    Test_Prediction_1=(w1_1*lr_no_events_test_prediction)+(w1_2*test_pred_avg_1_1)+(w1_3*test_pred_1_2)
    Test_Prediction_2=(w2_1*test_pred_avg_2_1)+(w2_2*test_pred_avg_2_2)

    print("No Events Train Log-Loss: ",log_loss(y_train_1, train_prediction_1))
    print("Events Train Log-Loss: ",log_loss(y_train_2, train_prediction_2))
    print("No Events CV Log-Loss: ",log_loss(y_cv_1, cv_prediction_1))
    print("Events CV Log-Loss: ",log_loss(y_cv_2, cv_prediction_2))

    print("Returned Test Predictions for Submission")
    print("Total Time Taken: ",datetime.now()-start)

    return Test_Prediction_1,Test_Prediction_2

In [17]:
# Run the whole pipeline on the loaded tables. The first result covers test
# devices without events, the second covers test devices with events.
no_events_test_predictions, events_test_predictions = ML_pipeline(
    train_data,
    test_data,
    events,
    phone_data,
    app_events,
    app_labels,
    label_categories,
)

Preparing Data......
Data Preparation Complete Time Taken:  0:03:36.871455
Preparing Features......
Feature Preparation Done Time Taken:  0:04:04.209560
Predicting Output......
Models Predictions Done Time Taken: 0:12:59.428000
Ensembling Models......
No Events Train Log-Loss:  2.3620290803707564
Events Train Log-Loss:  1.606074682066437
No Events CV Log-Loss:  2.3597141116725457
Events CV Log-Loss:  1.8937726608417007
Returned Test Predictions for Submission
Total Time Taken:  0:12:59.490495
