### imports 

In [15]:
import numpy as np
import pandas as pd
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
import seaborn as sns
import math
# %matplotlib inline
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix
from scipy.stats import uniform as sp_randFloat
from scipy.stats import randint as sp_randInt
from catboost import CatBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
import xgboost as xgb
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import category_encoders as ce
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer
# from sklearn.impute import KNNImputer
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn import tree
import random
from sklearn.ensemble import RandomForestClassifier
from imblearn.over_sampling import (RandomOverSampler, 
                                    SMOTE, 
                                    ADASYN)

# from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score
import keras
from keras.models import Sequential
from keras.layers import Dense

### read data 

In [2]:
lo={'recommendation_set_id':str, 'user_id':str, 'session_id':str, 'query_identifier':str,
'query_word_count':float, 'query_char_count':float, 'query_detected_language':str,
'query_document_id':str, 'document_language_provided':str, 'year_published':float,
'number_of_authors':float, 'abstract_word_count':float, 'abstract_char_count':float,
'abstract_detected_language':str, 'first_author_id':str,
'num_pubs_by_first_author':float, 'organization_id':str, 'application_type':str,
'item_type':str, 'request_received':str, 'hour_request_received':str,
'response_delivered':str, 'rec_processing_time':float, 'app_version':str, 'app_lang':str,
'user_os':str, 'user_os_version':str, 'user_java_version':str, 'user_timezone':str,
'country_by_ip':str, 'timezone_by_ip':str, 'local_time_of_request':str,
'local_hour_of_request':str, 'number_of_recs_in_set':float,
'recommendation_algorithm_id_used':str, 'algorithm_class':str, 'cbf_parser':str,
'search_title':str, 'search_keywords':str, 'search_abstract':str,
'time_recs_recieved':str, 'time_recs_displayed':str, 'time_recs_viewed':str,
'clicks':float, 'ctr':float,'set_clicked':float
   }
pars=['request_received', 'response_delivered','local_time_of_request','time_recs_recieved','time_recs_displayed','time_recs_viewed']

df_w=pd.read_csv('tcdml1920-rec-click-pred--training.csv',na_values=["\\N","nA"], dtype=lo, parse_dates=pars)
df_kag=pd.read_csv('tcdml1920-rec-click-pred--test.csv',na_values=["\\N","nA"], dtype=lo, parse_dates=pars)

### cleaning jabref - based on EDA

In [3]:
# global dict_key
# dict_key={}
def process(df,run,dict_key):
#     global dict_key
#     dict_key={}
    
    df=df[df.organization_id=='1'] # for jabref
    df=df[['query_word_count','query_char_count', 'query_detected_language', 'query_document_id','year_published',
           'number_of_authors','abstract_word_count', 'abstract_char_count','first_author_id','num_pubs_by_first_author',
           'request_received','hour_request_received','rec_processing_time','app_version', 'app_lang','user_os','user_timezone','country_by_ip',
           'timezone_by_ip','local_hour_of_request','recommendation_algorithm_id_used', 'algorithm_class', 'cbf_parser',
           'search_title', 'search_keywords','search_abstract','clicks','set_clicked']]


    # query_word_count
    # query_char_count
    # query_detected_language
#     df['query_detected_language'].fillna(df['query_detected_language'].mode()[0], inplace=True)


    # query_document_id
    df['query_doc_id_present']=df.query_document_id.isna()*1
    df.drop(columns='query_document_id',inplace=True)

    # year_published

    df.drop(df[df.year_published >2019].index, inplace=True)
    # df['yr']=df.year_published.map(lambda x: '1950' if x<1950 else '1950-1980' if x<1980 else '1980-2000' if x<2000 else '2000-2010' if x<2010 else '2010-2019' if x<2019 else 0)
    # df.groupby('yr').set_clicked.mean()
    # df.drop(columns='yr',inplace=True)
#     df.year_published.fillna(df.year_published.min(),inplace=True) # because pos correlation between year and clicks

    # number_of_authors
#     df.number_of_authors.fillna(10,inplace=True) #nulls have same mean of set_clicked as 10 authors

    # abstract_word_count
#     df.abstract_word_count.fillna(df.abstract_word_count.median(),inplace=True)
    # abstract_char_count
#     df.abstract_char_count.fillna(df.abstract_char_count.median(),inplace=True)

# first_author_id
    
    # num_pubs_by_first_author
#     df.num_pubs_by_first_author.fillna(df.local_hour_of_request.median(),inplace=True) 

    # request_received

#     df['request_received']=df['request_received'].astype('datetime64[D]')
    df['day_of_week'] = df['request_received'].dt.day_name()
    df['month'] = df['request_received'].dt.month
    df['year']=df['request_received'].dt.year
    df['month']=df.month.astype(str)
    df.drop(columns=['request_received'],inplace=True)

    # hour_request_received
#     df['xhr_req_recvd'] = df.hour_request_received.map(lambda x:math.sin(2.0*math.pi*float(x)/24))
#     df['yhr_req_recvd']  = df.hour_request_received.map(lambda x:math.cos(2.0*math.pi*float(x)/24))
#     df.drop(columns='hour_request_received',inplace=True)
    
#     rec_processing_time
    if run=='train':
        df=df[df.rec_processing_time<25]
    
    df.drop(columns=['rec_processing_time'],inplace=True)



    # app_version
#     df['app_version'].fillna(df['app_version'].mode()[0], inplace=True)

#     app_lang
#     df['app_lang'].fillna(df['app_lang'].mode()[0], inplace=True)

    # user_os
    # df[(df.user_os.isna()==False)&(df.user_os!='Not provided')].groupby('user_os').set_clicked.sum()
#     df['user_os_linux']=df.user_os.map(lambda x: 1 if x=='Linux' else 0)
#     df['user_os_Mac_OS_X']=df.user_os.map(lambda x: 1 if x=='Mac OS X' else 0)
#     df['user_os_Not_provided']=df.user_os.map(lambda x: 1 if x=='Not provided' else 0)
#     df['user_os_Windows_10']=df.user_os.map(lambda x: 1 if x=='Windows 10' else 0)
#     df['user_os_Windows_7']=df.user_os.map(lambda x: 1 if x=='Windows 7' else 0)
    df['user_os_Windows_8_1']=df.user_os.map(lambda x: 1 if x=='Windows 8.1' else 0)
    df['user_os_provided']=df.user_os.map(lambda x: 1 if x else 0)
    df.drop(columns='user_os',inplace=True)


    #     user_timezone
    df['user_timezone_present']=df.user_timezone.map(lambda x:1 if x else 0)
    df['user_timezone_aus']=df.user_timezone.map(lambda x:1 if x=='Australia/Sydney' else 0)
    df.drop(columns='user_timezone',inplace=True)

    # country_by_ip
#     df['country_by_ip'].fillna(df['country_by_ip'].mode()[0], inplace=True) #do after 'other' thingy 

#     timezone_by_ip
    # local_hour_of_request
#     df.local_hour_of_request.fillna(df.local_hour_of_request.mode()[0],inplace=True) 
#     df['xhr_local_req_recvd'] = df.local_hour_of_request.map(lambda x:math.sin(2.0*math.pi*float(x)/24))
#     df['yhr_local_req_recvd']  = df.local_hour_of_request.map(lambda x:math.cos(2.0*math.pi*float(x)/24))
#     df.drop(columns='local_hour_of_request',inplace=True)

    # recommendation_algorithm_id_used
#     df['recommendation_algorithm_id_used']=df['recommendation_algorithm_id_used'].astype(str)
#     df['recommendation_algorithm_id_used'].fillna(df['recommendation_algorithm_id_used'].mode()[0], inplace=True)

    # algorithm_class
    # cbf_parser
#     df['cbf_standard_QP']=df.cbf_parser.map(lambda x:1 if x=='standard_QP' else 0)
#     df['cbf_edismax_QP']=df.cbf_parser.map(lambda x:1 if x=='cbf_edismax_QP' else 0)
#     df['cbf_mlt_QP']=df.cbf_parser.map(lambda x:1 if x=='cbf_mlt_QP' else 0)
    df['cbf_parser_used']=df.cbf_parser.map(lambda x: 1 if x else 0)
    df.drop(columns=['cbf_parser'],inplace=True)

    # search_title
    # search_keywords
    # set_clicked

    
    def convert_sparse_values(df, cols, threshold, replacement='other'):
        for col in [cols]:
            counts = df[col].value_counts()
            to_convert = counts[counts <= threshold].index.values
            dict_key[cols]=to_convert
            df[col] = df[col].replace(to_convert, replacement)

                
    if run=='train':
        convert_sparse_values(df,cols='query_detected_language', threshold=1000)
        convert_sparse_values(df,cols='app_lang', threshold=500)
        convert_sparse_values(df,cols='country_by_ip', threshold=150)
        convert_sparse_values(df,cols='timezone_by_ip', threshold=500)
        convert_sparse_values(df,cols='app_version', threshold=800)

    else:
        df['query_detected_language']=df.query_detected_language.map(lambda x: x if x in dict_key['query_detected_language'] else 'others')
        df['app_lang']=df.app_lang.map(lambda x: x if x in dict_key['app_lang'] else 'others')
        df['country_by_ip']=df.country_by_ip.map(lambda x: x if x in dict_key['country_by_ip'] else 'others')
        df['timezone_by_ip']=df.timezone_by_ip.map(lambda x: x if x in dict_key['country_by_ip'] else 'others')
        df['app_version']=df.app_version.map(lambda x: x if x in dict_key['app_version'] else 'others')
        
#     df['country_by_ip'].fillna(df['country_by_ip'].mode()[0], inplace=True)
#     df.timezone_by_ip.fillna(df.timezone_by_ip.mode()[0],inplace=True) 

#     df.drop(columns=['year_published','number_of_authors','abstract_word_count', 'abstract_char_count',
#                     'first_author_id','num_pubs_by_first_author'],inplace=True)
    
    df=df.drop(columns=['clicks','set_clicked']).merge(df[['clicks','set_clicked']], 
                                    on=df[['clicks','set_clicked']].index).drop(columns='key_0')
    
    return df,dict_key

In [4]:
df_w_p,d=process(df_w,run='train',dict_key={}) #train

df_kag_p,d=process(df_kag,run='test',dict_key=d) #kaggle


### category encodering 

In [5]:
#  making function to try combinations of all type of encodings

def encode_all(df,df2,encoder_to_use,handle_missing='return_nan'):  #handle_missing='value'/'return_nan'
    encoders_used={}
    for col in encoder_to_use:
        encoders_used[col]={}

        if encoder_to_use[col]['name']=='BackwardDifferenceEncoder':
            encoder=ce.BackwardDifferenceEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            df2=encoder.transform(df2)
            encoders_used[col]['encoder_type']='BackwardDifferenceEncoder'
            encoders_used[col]['encoder_object']=encoder

        if encoder_to_use[col]['name']=='BaseNEncoder':
            encoder=ce.BaseNEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing,base=encoder_to_use[col]['base']) 
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            df2=encoder.transform(df2)
            encoders_used[col]['encoder_type']='BackwardDifferenceEncoder'
            encoders_used[col]['encoder_object']=encoder

        if encoder_to_use[col]['name']=='BinaryEncoder':
            encoder=ce.BinaryEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            df2=encoder.transform(df2)
            encoders_used[col]['encoder_type']='BackwardDifferenceEncoder'
            encoders_used[col]['encoder_object']=encoder

        if encoder_to_use[col]['name']=='CatBoostEncoder':
            encoder=ce.CatBoostEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing,sigma=encoder_to_use[col]['sigma'],a=encoder_to_use[col]['a'])
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            df2=encoder.transform(df2)
            encoders_used[col]['encoder_type']='BackwardDifferenceEncoder'
            encoders_used[col]['encoder_object']=encoder

    #     if encoder_to_use[col]['name']=='HashingEncoder':
    #         encoder=ce.HashingEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing)
    #         encoder.fit(X=df,y=df['set_clicked'])
    #         df=encoder.transform(df)
    #         encoders_used[col]['encoder_type']='BackwardDifferenceEncoder'
#             encoders_used[col]['encoder_object']=encoder

        if encoder_to_use[col]['name']=='HelmertEncoder':
            encoder=ce.HelmertEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            df2=encoder.transform(df2)
            encoders_used[col]['encoder_type']='BackwardDifferenceEncoder'
            encoders_used[col]['encoder_object']=encoder

        if encoder_to_use[col]['name']=='JamesSteinEncoder':
            encoder=ce.JamesSteinEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing, model=encoder_to_use[col]['model'])
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            df2=encoder.transform(df2)
            encoders_used[col]['encoder_type']='BackwardDifferenceEncoder'
            encoders_used[col]['encoder_object']=encoder

        if encoder_to_use[col]['name']=='LeaveOneOutEncoder':
            encoder=ce.LeaveOneOutEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing,sigma=encoder_to_use[col]['sigma'])
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            df2=encoder.transform(df2)
            encoders_used[col]['encoder_type']='BackwardDifferenceEncoder' 
            encoders_used[col]['encoder_object']=encoder

        if encoder_to_use[col]['name']=='MEstimateEncoder':
            encoder=ce.MEstimateEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing,randomized=encoder_to_use[col]['randomized'],sigma=encoder_to_use[col]['sigma'],m=encoder_to_use[col]['m'])
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            df2=encoder.transform(df2)
            encoders_used[col]['encoder_type']='BackwardDifferenceEncoder' 
            encoders_used[col]['encoder_object']=encoder

        if encoder_to_use[col]['name']=='OneHotEncoder':
            encoder=ce.OneHotEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing,use_cat_names=True)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            df2=encoder.transform(df2)
            encoders_used[col]['encoder_type']='BackwardDifferenceEncoder'
            encoders_used[col]['encoder_object']=encoder

        if encoder_to_use[col]['name']=='OrdinalEncoder':
            encoder=ce.OrdinalEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            df2=encoder.transform(df2)
            encoders_used[col]['encoder_type']='BackwardDifferenceEncoder' 
            encoders_used[col]['encoder_object']=encoder

        if encoder_to_use[col]['name']=='SumEncoder':
            encoder=ce.SumEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            df2=encoder.transform(df2)
            encoders_used[col]['encoder_type']='BackwardDifferenceEncoder'
            encoders_used[col]['encoder_object']=encoder

        if encoder_to_use[col]['name']=='PolynomialEncoder':
            encoder=ce.PolynomialEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing)
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            df2=encoder.transform(df2)
            encoders_used[col]['encoder_type']='BackwardDifferenceEncoder'
            encoders_used[col]['encoder_object']=encoder

        if encoder_to_use[col]['name']=='TargetEncoder':
            encoder=ce.TargetEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing,min_samples_leaf=encoder_to_use[col]['min_samples_leaf'], smoothing=encoder_to_use[col]['smoothing'])
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            df2=encoder.transform(df2)
            encoders_used[col]['encoder_type']='BackwardDifferenceEncoder' 
            encoders_used[col]['encoder_object']=encoder


        if encoder_to_use[col]['name']=='WOEEncoder':
            encoder=ce.WOEEncoder(cols=[col],return_df=1,drop_invariant=1,handle_missing=handle_missing,randomized=encoder_to_use[col]['randomized'],sigma=encoder_to_use[col]['sigma'])
            encoder.fit(X=df,y=df['set_clicked'])
            df=encoder.transform(df)
            df2=encoder.transform(df2)
            encoders_used[col]['encoder_type']='BackwardDifferenceEncoder' 
            encoders_used[col]['encoder_object']=encoder
            
    return df,df2

In [7]:
# either use above function to encode different columns differently or encode all using single encoder as below

encoder=ce.WOEEncoder(randomized=True,sigma=0.1)
encoder.fit(X=df_w_p.iloc[:,:-2],y=df_w_p['set_clicked'])
df_w_encoded=encoder.transform(df_w_p.iloc[:,:-2])
df_kag_encoded=encoder.transform(df_kag_p.iloc[:,:-2])

### delete unnecessary columns

In [12]:
df_w_encoded.drop(columns=['algorithm_class','search_title','search_keywords','search_abstract'],inplace=True)
df_kag_encoded.drop(columns=['algorithm_class','search_title','search_keywords','search_abstract'],inplace=True)

### imputation 

In [8]:
est=IterativeImputer(random_state=0, estimator= KNeighborsRegressor(n_neighbors=10),n_nearest_features=6)
est=IterativeImputer(random_state=0, initial_strategy='median')
est.fit(df_w_encoded)
X_w_im=est.transform(df_w_encoded)
X_kag_im=est.transform(df_kag_encoded)

In [9]:
# remove set_clicked and clicks from feature data
X_w=X_w_im[:,:-2]
X_kag=X_kag_im[:,:-2]

y_w=df_w_p.iloc[:,-1]

### train test split 

In [22]:
# X = df_w_p.drop(columns=['set_clicked']).values
# y = df_w_p.loc[:, 'set_clicked'].values

# df_x=df_w_p.drop(columns=['set_clicked'])
# df_y=df_w_p.loc[:, 'set_clicked']


# Splitting the dataset into the Training set and Test set
# X_train, X_test, y_train, y_test = train_test_split(X_w,y_w, test_size = 0.2)



### oversampling 

In [None]:
from imblearn.over_sampling import (RandomOverSampler, 
                                    SMOTE, 
                                    ADASYN)
# RandomOverSampler
  # With over-sampling methods, the number of samples in a class
  # should be greater or equal to the original number of samples.
# sampler = RandomOverSampler(ratio={1: 1927, 0: 300},random_state=0)
# X_rs, y_rs = sampler.fit_sample(X, y)
# print('RandomOverSampler {}'.format(Counter(y_rs)))
# plot_this(X_rs,y_rs)
# SMOTE
# sampler = SMOTE(ratio=0.4)
# X_train, y_train = sampler.fit_sample(X_train, y_train)
# print('SMOTE {}'.format(Counter(y_rs)))
# plot_this(X_rs,y_rs)
# ADASYN;n_neighbors=9,
# sampler = ADASYN(ratio={1: 1927, 0: 300},random_state=0)
# X_rs, y_rs = sampler.fit_sample(X, y)
# print('ADASYN {}'.format(Counter(y_rs)))
# plot_this(X_rs,y_rs)


### catboost 

In [None]:


model = CatBoostClassifier(iterations=1000,learning_rate=.02,depth=4
                                       ,task_type="GPU",devices='0:2'
                          )
# Fit model
model.fit(X_train, y_train,verbose=True,eval_set=(X_test,y_test))

preds_class = model.predict(X_test)
# Get predicted probabilities for each class
preds_proba = model.predict_proba(X_test)

print("")
# print("rate:",rate/100,"iters:",iters,"depth:",depth)
acc=sum(y_test==preds_class)/len(y_test)
print("accuracy: ",acc)
print("f1: ",f1_score(y_test, preds_class))
print("cm:",confusion_matrix(y_test, preds_class))

print("kaggle: ",sum(model.predict(kag_del)))

In [None]:
pd.Series(clf.predict(kag_del)).to_csv('aa.csv')

### xgboost 

In [None]:

paramGrid = {"subsample" : [1, 0.9, 0.5],
             'depth'         : [3,5,8],
                  'learning_rate' : [.2,.5,.9],
                  'iterations'    : [500,1000,1500]
                 
            }


model = xgb.XGBClassifier()

gridsearch = GridSearchCV(model, paramGrid, verbose=True,             
         cv=TimeSeriesSplit(n_splits=3).get_n_splits([X_train, y_train]))

gridsearch.fit(X_train, y_train,verbose=True)

In [None]:
from sklearn.metrics import classification_report
y_true, y_pred = y_test, gridsearch.predict(X_test)
print(classification_report(y_true, y_pred))

In [None]:
confusion_matrix(y_test, y_pred)

In [None]:
# preds=gridsearch.predict(X_kag)
sum(gridsearch.predict(X_kag_p1))

### logistic 

In [None]:
from sklearn.linear_model import LogisticRegression

In [None]:
clf = LogisticRegression(random_state=0, solver='lbfgs',multi_class='multinomial').fit(X_train, y_train)


In [None]:
preds_class = clf.predict(X_test)
acc=sum(y_test==preds_class)/len(y_test)
print("accuracy: ",acc)
print("f1: ",f1_score(y_test, preds_class))
print("cm:",confusion_matrix(y_test, preds_class))

print("kaggle: ",sum(clf.predict(X_kag)))

### RF 

In [18]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
clf = RandomForestClassifier(n_estimators=100, max_depth=5,random_state=0).fit(X_train, y_train)
preds_class = model.predict(X_test)
acc=sum(y_test==preds_class)/len(y_test)
print("accuracy: ",acc)
print("f1: ",f1_score(y_test, preds_class))
print("cm:",confusion_matrix(y_test, preds_class))

print("kaggle: ",sum(clf.predict(kag_del)))

### decision tree 

In [None]:

ff=[]
kag=[]
for i in range(2,30,1):
    clf = tree.DecisionTreeClassifier(max_depth=i,).fit(X_train, y_train)

    preds_class = clf.predict(X_test)
    acc=sum(y_test==preds_class)/len(y_test)
    print(i)
    print("accuracy: ",acc)
    print("f1: ",f1_score(y_test, preds_class))
    print("cm:",confusion_matrix(y_test, preds_class))

    print("kaggle: ",sum(clf.predict(X_kag)))
    ff.append(f1_score(y_test, preds_class))
    kag.append(sum(clf.predict(X_kag)))

### neural net 

In [None]:
import keras
from keras.models import Sequential
from keras.layers import Dense
# Neural network
model = Sequential()
model.add(Dense(5, input_dim=52, activation='relu'))
# model.add(Dense(12, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(X_train, y_train,validation_data = (X_test,y_test), epochs=10, batch_size=100)


In [None]:
preds_class = model.predict(X_test).ravel()
acc=sum(y_test==preds_class)/len(y_test)
print(i)
print("accuracy: ",acc)
print("f1: ",f1_score(y_test, preds_class))
print("cm:",confusion_matrix(y_test, preds_class))

print("kaggle: ",sum(clf.predict(X_kag)))
ff.append(f1_score(y_test, preds_class))
kag.append(sum(clf.predict(X_kag)))

### trying different combination of encodings 

In [10]:
# {'name':'BackwardDifferenceEncoder'},
# {'name':'LeaveOneOutEncoder','sigma':.3}, #optimal value is commonly between 0.05 and 0.6
# {'name':'BaseNEncoder','base':2},
# {'name':'JamesSteinEncoder','model':'binary'},
# {'name':'TargetEncoder','min_samples_leaf':10, 'smoothing':4.0},
# {'name':'WOEEncoder','randomized':True, 'sigma':0.1},
# {'name':'MEstimateEncoder','randomized':True, 'sigma':0.1,'m':2}, #default m=1
# {'name':'CatBoostEncoder','sigma':None,'a':2}, #default m=1


encoders_to_use=[
{
'query_detected_language':{'name':'TargetEncoder','min_samples_leaf':10, 'smoothing':4.0},
'first_author_id':{'name':'TargetEncoder','min_samples_leaf':10, 'smoothing':4.0},
'hour_request_received':{'name':'BaseNEncoder','base':2},
'app_version':{'name':'TargetEncoder','min_samples_leaf':10, 'smoothing':4.0},
'app_lang':{'name':'TargetEncoder','min_samples_leaf':10, 'smoothing':4.0},
'country_by_ip':{'name':'TargetEncoder','min_samples_leaf':10, 'smoothing':4.0},
'timezone_by_ip':{'name':'TargetEncoder','min_samples_leaf':10, 'smoothing':4.0},
'local_hour_of_request':{'name':'BaseNEncoder','base':2},
'recommendation_algorithm_id_used':{'name':'BaseNEncoder','base':2},
'day_of_week':{'name':'BaseNEncoder','base':2},
'month':{'name':'BaseNEncoder','base':2}
},
    {
'query_detected_language':{'name':'MEstimateEncoder','randomized':True, 'sigma':0.1,'m':2},
'first_author_id':{'name':'MEstimateEncoder','randomized':True, 'sigma':0.1,'m':2},
'hour_request_received':{'name':'BaseNEncoder','base':2},
'app_version':{'name':'MEstimateEncoder','randomized':True, 'sigma':0.1,'m':2},
'app_lang':{'name':'MEstimateEncoder','randomized':True, 'sigma':0.1,'m':2},
'country_by_ip':{'name':'MEstimateEncoder','randomized':True, 'sigma':0.1,'m':2},
'timezone_by_ip':{'name':'MEstimateEncoder','randomized':True, 'sigma':0.1,'m':2},
'local_hour_of_request':{'name':'BaseNEncoder','base':2},
'recommendation_algorithm_id_used':{'name':'BaseNEncoder','base':2},
'day_of_week':{'name':'BaseNEncoder','base':2},
'month':{'name':'BaseNEncoder','base':2}
},
    {
'query_detected_language':{'name':'LeaveOneOutEncoder','sigma':.3},
'first_author_id':{'name':'LeaveOneOutEncoder','sigma':.3},
'hour_request_received':{'name':'BaseNEncoder','base':2},
'app_version':{'name':'LeaveOneOutEncoder','sigma':.3},
'app_lang':{'name':'LeaveOneOutEncoder','sigma':.3},
'country_by_ip':{'name':'LeaveOneOutEncoder','sigma':.3},
'timezone_by_ip':{'name':'LeaveOneOutEncoder','sigma':.3},
'local_hour_of_request':{'name':'BaseNEncoder','base':2},
'recommendation_algorithm_id_used':{'name':'BaseNEncoder','base':2},
'day_of_week':{'name':'BaseNEncoder','base':2},
'month':{'name':'BaseNEncoder','base':2}
},
    {
'query_detected_language':{'name':'LeaveOneOutEncoder','sigma':.1},
'first_author_id':{'name':'LeaveOneOutEncoder','sigma':.1},
'hour_request_received':{'name':'BaseNEncoder','base':2},
'app_version':{'name':'LeaveOneOutEncoder','sigma':.1},
'app_lang':{'name':'LeaveOneOutEncoder','sigma':.1},
'country_by_ip':{'name':'LeaveOneOutEncoder','sigma':.1},
'timezone_by_ip':{'name':'LeaveOneOutEncoder','sigma':.1},
'local_hour_of_request':{'name':'BaseNEncoder','base':2},
'recommendation_algorithm_id_used':{'name':'BaseNEncoder','base':2},
'day_of_week':{'name':'BaseNEncoder','base':2},
'month':{'name':'BaseNEncoder','base':2}
},
    {
'query_detected_language':{'name':'WOEEncoder','randomized':True, 'sigma':0.1},
'first_author_id':{'name':'WOEEncoder','randomized':True, 'sigma':0.1},
'hour_request_received':{'name':'BaseNEncoder','base':2},
'app_version':{'name':'WOEEncoder','randomized':True, 'sigma':0.1},
'app_lang':{'name':'WOEEncoder','randomized':True, 'sigma':0.1},
'country_by_ip':{'name':'WOEEncoder','randomized':True, 'sigma':0.1},
'timezone_by_ip':{'name':'WOEEncoder','randomized':True, 'sigma':0.1},
'local_hour_of_request':{'name':'BaseNEncoder','base':2},
'recommendation_algorithm_id_used':{'name':'BaseNEncoder','base':2},
'day_of_week':{'name':'BaseNEncoder','base':2},
'month':{'name':'BaseNEncoder','base':2}
},
    {
'query_detected_language':{'name':'WOEEncoder','randomized':True, 'sigma':0.3},
'first_author_id':{'name':'WOEEncoder','randomized':True, 'sigma':0.3},
'hour_request_received':{'name':'BaseNEncoder','base':2},
'app_version':{'name':'WOEEncoder','randomized':True, 'sigma':0.3},
'app_lang':{'name':'WOEEncoder','randomized':True, 'sigma':0.3},
'country_by_ip':{'name':'WOEEncoder','randomized':True, 'sigma':0.3},
'timezone_by_ip':{'name':'WOEEncoder','randomized':True, 'sigma':0.3},
'local_hour_of_request':{'name':'BaseNEncoder','base':2},
'recommendation_algorithm_id_used':{'name':'BaseNEncoder','base':2},
'day_of_week':{'name':'BaseNEncoder','base':2},
'month':{'name':'BaseNEncoder','base':2}
}
]
    
encoders_used={}

In [None]:
ff=[]
kag=[]

for xx,encoder_to_use in enumerate(encoders_to_use):
    print("encoding: ",xx)
    df_w_encoded, df_kag_encoded=encode_all(df_w_p.copy(), df_kag_p.copy(),encoder_to_use)

    df_w_encoded.drop(columns=['algorithm_class','search_title','search_keywords','search_abstract','year_published',
                                'number_of_authors',
                                'abstract_word_count',
                                'abstract_char_count',
                                'first_author_id',
                                'num_pubs_by_first_author'],inplace=True)
    df_kag_encoded.drop(columns=['algorithm_class','search_title','search_keywords','search_abstract','year_published',
                                'number_of_authors',
                                'abstract_word_count',
                                'abstract_char_count',
                                'first_author_id',
                                'num_pubs_by_first_author'],inplace=True)
    print("cols: ",df_w_encoded.shape)
    for i in range(5):
        print("i: ",i)
        print("imputing...")

        est=IterativeImputer(random_state=0, estimator= None,initial_strategy='mean')
        est.fit(df_w_encoded)
        X_w_im=est.transform(df_w_encoded)
        X_kag_im=est.transform(df_kag_encoded)

        X_w=X_w_im[:,:-2]
        X_kag=X_kag_im[:,:-2]
        y_w=df_w_p.iloc[:,-1]

        
        X_train, X_test, y_train, y_test = train_test_split(X_w,y_w, test_size = random.randrange(10,30)/100)
            

    #     DTREE
        print("dtree...")
        clf = tree.DecisionTreeClassifier(max_depth=random.randrange(4,5)).fit(X_train, y_train)

        preds_class = clf.predict(X_test)
#         acc=sum(y_test==preds_class)/len(y_test)
#         print(i)
    #         print("accuracy: ",acc)
        print("f1: ",f1_score(y_test, preds_class))
    #         print("cm:",confusion_matrix(y_test, preds_class))

#         print("kaggle: ",sum(clf.predict(X_kag)))
#         ff.append(f1_score(y_test, preds_class))
        kag.append(list(clf.predict(X_kag)))


    # CAT
        print("cat...")
        model = CatBoostClassifier(iterations=2000,learning_rate=.02,depth=random.randrange(4,5)
                                               ,task_type="GPU",devices='0:2'
                                  )
        # Fit model
        model.fit(X_train, y_train,verbose=False,eval_set=(X_test,y_test))

        preds_class = model.predict(X_test)
#         print("")
        # print("rate:",rate/100,"iters:",iters,"depth:",depth)
    #     acc=sum(y_test==preds_class)/len(y_test)
    #     print("accuracy: ",acc)
        print("f1: ",f1_score(y_test, preds_class))
    #     print("cm:",confusion_matrix(y_test, preds_class))

#         print("kaggle: ",sum(model.predict(X_kag)))
#         ff.append(f1_score(y_test, preds_class))
        kag.append(list(clf.predict(X_kag)))

#     RF
        print('RF...')
        clf = RandomForestClassifier(n_estimators=500, max_depth=random.randrange(4,5)).fit(X_train, y_train)
        preds_class = clf.predict(X_test)
#         acc=sum(y_test==preds_class)/len(y_test)
#         print("accuracy: ",acc)
        print("f1: ",f1_score(y_test, preds_class))
#         print("cm:",confusion_matrix(y_test, preds_class))

#         print("kaggle: ",sum(clf.predict(kag_del)))
        kag.append(list(clf.predict(X_kag)))

pd.DataFrame(kag).to_csv('aa.csv')

encoding:  0
cols:  (267537, 33)
i:  0
imputing...
dtree...
f1:  0.006134969325153374
cat...
f1:  0.01676829268292683
RF...
f1:  0.0
i:  1
imputing...
dtree...
f1:  0.003427592116538132
cat...


### stacking 

In [11]:
x_train, x_test, y_train, y_test = train_test_split(X_w, y_w, test_size=0.4 )
sampler = SMOTE(ratio=0.4)
x_train, y_train = sampler.fit_sample(x_train, y_train)
# x_train=pd.DataFrame(x_train).reset_index().iloc[:,1:].values
# x_test=pd.DataFrame(x_test).reset_index().iloc[:,1:].values
# y_train=pd.DataFrame(y_train).reset_index().iloc[:,1].values
# y_test=pd.DataFrame(y_test).reset_index().iloc[:,1].values

test_preds=[]
kag_preds=[]

print("training model1...")
model1 = tree.DecisionTreeClassifier(max_depth=4)
model1.fit(x_train,y_train)
train_pred1=model1.predict(x_train)
test_pred1=model1.predict(x_test)
kag_pred1=model1.predict(X_kag)
test_preds.append(test_pred1)
kag_preds.append(kag_pred1)

print("training model2...")
model2 = CatBoostClassifier(iterations=1500,learning_rate=.02,depth=random.randrange(4,5)
                                               ,task_type="GPU",devices='0:2'
                                  )
model2.fit(x_train,y_train, eval_set=(x_test,y_test),verbose=False)
train_pred2=model2.predict(x_train)
test_pred2=model2.predict(x_test)
kag_pred2=model2.predict(X_kag)
test_preds.append(test_pred2)
kag_preds.append(kag_pred2)

print("training model3...")
model3 = RandomForestClassifier(n_estimators=1000)
model3.fit(x_train,y_train)
train_pred3=model3.predict(x_train)
test_pred3=model3.predict(x_test)
kag_pred3=model3.predict(X_kag)
test_preds.append(test_pred3)
kag_preds.append(kag_pred3)

print("training model4...")
model4 = tree.DecisionTreeClassifier(max_depth=5)
model4.fit(x_train,y_train)
train_pred4=model4.predict(x_train)
test_pred4=model4.predict(x_test)
kag_pred4=model4.predict(X_kag)
test_preds.append(test_pred4)
kag_preds.append(kag_pred4)

# stack
test_df=pd.DataFrame(test_preds).T
kag_df=pd.DataFrame(kag_preds).T

model = LogisticRegression(random_state=1)
model.fit(test_df,y_test)
kag_preds=model.predict(kag_df)

kag_preds.sum()

training model1...
training model2...
training model3...
training model4...




43.0

In [209]:
pd.Series(kag_preds).to_csv('aa.csv')

  """Entry point for launching an IPython kernel.
