# Question 8

## Pre-processing:

In [1]:
!pip install xlearn
import numpy as np
import pandas as pd
import glob

Collecting xlearn
[?25l  Downloading https://files.pythonhosted.org/packages/ab/1b/9ad0093cac05d6f95d3d768bc855804b18723c72120ce45cd930bd303587/xlearn-0.40a1.tar.gz (4.9MB)
[K     |████████████████████████████████| 4.9MB 1.4MB/s eta 0:00:01
[?25hBuilding wheels for collected packages: xlearn
  Building wheel for xlearn (setup.py) ... [?25ldone
[?25h  Created wheel for xlearn: filename=xlearn-0.40a1-cp36-none-any.whl size=226076 sha256=cc3be4ed85b941bec73dac9a8b3eb885e7f53fab6d4562b3667931858a00fc24
  Stored in directory: /root/.cache/pip/wheels/59/7d/71/699578f3cb69f16a2e5f648d978259dba959c92a5a6eca9451
Successfully built xlearn
Installing collected packages: xlearn
Successfully installed xlearn-0.40a1


In [2]:
TAXONOMY_WITH_LESS_THAN_100 = set([
    'BUSINESS~PERSONAL-FINANCE', 
    'ENTERTAINMENT~MOVIES',
    'LIFE~ASTROLOGY',
    'LIFE~FITNESS',
    'SPORTS~CRICKET',
    'TECH~WEB',
    'TECH~SCIENCE~SPACE'
])

In [3]:
dtypes = {
        'page_view_start_time':                            'int16',
        'user_id_hash':                                         'category',
        'target_id_hash':                                       'category',
        'syndicator_id_hash':                                'category',
        'campaign_id_hash':                                 'category',
        'empiric_calibrated_recs':                          'float16',
        'empiric_clicks':                                          'float16',
        'target_item_taxonomy':                             'category',
        'placement_id_hash':                                 'category',
        'user_recs':                                                'int16',
        'user_clicks':                                               'int16',
        'user_target_recs':                                      'int16',
        'publisher_id_hash':                                   'category',
        'source_id_hash':                                        'category',
        'source_item_type':                                   'category',
        'browser_platform':                                    'category',
        'os_family':                                                 'category',
        'country_code':                                         'category',
        'region':                                                     'category',
        'day_of_week':                                           'int8',
        'time_of_day':                                             'int8',
        'gmt_offset':                                                'int8',
    
#         'empiric_ctr':                                             'int8',
        }

In [5]:
import os
from collections import defaultdict
from csv import DictReader
import math

num_cols = ['page_view_start_time', 
                    'empiric_calibrated_recs',
                    'empiric_clicks',
                    'user_recs',
                    'user_clicks',
                    'user_target_recs',
                    'day_of_week',
                    'time_of_day',
                    ]

too_many_vals = []

publisher_848 =  '848b127dbd0f3a647581f4a95325f5204f4577ad7956f6ccaee6bf1eec0c399e6b02015827a95741ae041adc2438fbd494076ece7796a5f6cf4eab29e1079cf3'
publisher_c29 = 'c29a980222a2a97b74ffbd067f2a4ce760e3e8d8f024944fbd55d0ecbec178d4fc94c38d88a56a177a1c302f47ac1293dbbed50638455d50b1769daedbd33aaf'
publisher_f8a = 'f8a7ba9b7c9b05464ee98daac522b3f3d2376453c70e57de45b65e65a06b3b45fedd82ec18291bb4e432f3a172893424a81bab47dad1b4355b68552cda00583a'


def create_train_test_files(max_files=68, validation_files = 8):
    dont_use = ['page_view_start_time', 'gmt_offset']
    categories = [k for k, v in dtypes.items() if k not in dont_use]
    categories_index = dict(zip(categories, range(len(categories))))
    field_features = defaultdict()

    max_val = 1
    for index, file in enumerate(glob.glob('./data/train/*.csv')):
        if index > max_files:
            break
        
        main_libffm = 'train.libffm'
        if index > max_files - validation_files:
            main_libffm = 'valid.libffm'

        with open(main_libffm, 'a') as the_file:
            for t, row in enumerate(DictReader(open(file))):
                if t % 100000 == 0:
                    print(t, len(field_features), max_val)

                label = [row['is_click']]
                ffeatures = []
                
                row['empiric_calibrated_recs'] = round(float(row['empiric_calibrated_recs']) / 1000) 
                if row['target_item_taxonomy'] in TAXONOMY_WITH_LESS_THAN_100:
                    row['target_item_taxonomy'] = row['target_item_taxonomy'][:row['target_item_taxonomy'].rfind('~')]
#                 if float(row['user_recs']) != 0:
#                     row['user_ctr'] = float(row['user_clicks']) / float(row['user_recs'])
#                 else:
#                     row['user_ctr'] = -1
                
                for field in categories:
                    if field == 'is_click':
                        continue
                    feature = row[field]

                    if feature == '':
                        feature = "unk"
                    if field not in num_cols:
                        ff = field + '_____' + feature
                    else:
                        if feature == "unk" or float(feature) == -1:
                            ff = field + '_____' + str(0)
                        else:
                            if field in too_many_vals:
                                ff = field + '_____' + str(int(round(math.log(1 + float(feature)))))
                            else:
                                ff = field + '_____' + str(int(round(float(feature))))
                    if ff not in field_features:
                        if len(field_features) == 0:
                            field_features[ff] = 1
                            max_val += 1
                        else:
                            field_features[ff] = max_val + 1
                            max_val += 1

                    fnum = field_features[ff]
                    ffeatures.append('{}:{}:1'.format(categories_index[field], fnum))

                
                line = label + ffeatures
                the_file.write('{}\n'.format(' '.join(line)))

                
    train_path = './data/test_file_v3/test_file.csv'
    with open('test.libffm', 'a') as the_file:
        for t, row in enumerate(DictReader(open(train_path))):
            if t % 100000 == 0:
                print(t, len(field_features), max_val)

            label = ['0']
            ffeatures = []

            row['empiric_calibrated_recs'] = round(float(row['empiric_calibrated_recs']) / 1000)
            if row['target_item_taxonomy'] in TAXONOMY_WITH_LESS_THAN_100:
                row['target_item_taxonomy'] = row['target_item_taxonomy'][:row['target_item_taxonomy'].rfind('~')]
                    
            for field in categories:
                if field == 'is_click':
                    continue
                feature = row[field]
                if feature == '':
                    feature = "unk"
                if field not in num_cols:
                    ff = field + '_____' + feature
                else:
                    if feature == "unk" or float(feature) == -1:
                        ff = field + '_____' + str(0)
                    else:
                        if field in too_many_vals:
                            ff = field + '_____' + str(int(round(math.log(1 + float(feature)))))
                        else:
                            ff = field + '_____' + str(int(round(float(feature))))
                if ff not in field_features:
                    if len(field_features) == 0:
                        field_features[ff] = 1
                        max_val += 1
                    else:
                        field_features[ff] = max_val + 1
                        max_val += 1

                fnum = field_features[ff]

                ffeatures.append('{}:{}:1'.format(categories_index[field], fnum))
            line = label + ffeatures
            the_file.write('{}\n'.format(' '.join(line)))



In [6]:
create_train_test_files(validation_files=5)
# create_files_for_publisher(publisher_c29)
# create_files_for_publisher(publisher_f8a)

0 0 1
100000 169846 169847
200000 291707 291708
300000 405119 405120
400000 513581 513582
0 580069 580070
100000 683662 683663
200000 785183 785184
300000 884634 884635
400000 982609 982610
0 1014645 1014646
100000 1110859 1110860
200000 1205666 1205667
300000 1299551 1299552
400000 1392071 1392072
0 1410622 1410623
100000 1501870 1501871
200000 1592348 1592349
300000 1681717 1681718
400000 1770440 1770441
0 1814399 1814400
100000 1901368 1901369
200000 1987852 1987853
300000 2073502 2073503
400000 2158438 2158439
0 2174487 2174488
100000 2258819 2258820
200000 2342164 2342165
300000 2424959 2424960
400000 2507147 2507148
0 2541261 2541262
100000 2622408 2622409
200000 2703068 2703069
300000 2782945 2782946
400000 2862074 2862075
0 2898437 2898438
100000 2976946 2976947
200000 3055022 3055023
300000 3132554 3132555
400000 3209514 3209515
0 3268651 3268652
100000 3344727 3344728
200000 3420326 3420327
300000 3495269 3495270
400000 3570152 3570153
0 3601972 3601973
100000 3676254 3676255

In [2]:
import xlearn as xl
import pandas as pd
import os

%env  USER = 'Test'
os.environ['USER'] = 'test'
# create ffm model
ffm_model = xl.create_ffm() 

# set training and validation data
ffm_model.setTrain("train.libffm")
ffm_model.setValidate("valid.libffm")

ffm_model.setOnDisk()

# define params
param = {'task':'binary', 
         'lr':0.2, 
         'k':4,
                 'lambda':0.0002, 
                 'metric':'auc', 
                 'epoch': 100}

# train the model
ffm_model.fit(param, 'xl.out')

# set the test data
ffm_model.setTest("test.libffm")
ffm_model.setSigmoid()

# # make predictions
ffm_model.predict("xl.out", "output.txt")

# create submission file
# sample = pd.read_csv('sample_submission.csv')
# output = pd.read_csv('output.txt', header=None)[0].values
# sample.HasDetections = output
# sample.to_csv('submission.csv', index=False)

env: USER='Test'


In [2]:
# set the test data
ffm_model.setTest("test.libffm")
ffm_model.setSigmoid()

# make predictions
ffm_model.predict("xl.out", "output.txt")

In [4]:
pd.read_csv('./for_submission.csv', header=None, names=['Predicted']).to_csv('./submission.csv', index_label='Id')

## Loading the data

In [10]:
import pandas as pd
import glob

FILES_LIMIT = 1

all_files = glob.glob('./data/train/*.csv')

li = []

for index, filename in enumerate(all_files):
    df = pd.read_csv(filename)#, usecols=['user_id_hash', 'target_id_hash', 'is_click'])
    li.append(df)
    
    if index > FILES_LIMIT:
        break

train = pd.concat(li, axis=0, ignore_index=True)

# train = pd.concat(map(pd.read_csv, glob.glob('./data/train/*.csv')))
# train = pd.read_csv('./data/train1/part-00000.csv')

test = pd.read_csv('./data/test_file_v3/test_file.csv')
# train.dropna(inplace=True) # This is important

## Surprise model: SVD

In [4]:
svd = SurpriseSVDPlusPlus(train, test)
algo = svd.train()

In [5]:
print(algo)

<surprise.prediction_algorithms.matrix_factorization.SVDpp object at 0x7f7202d0dcf8>


In [6]:
estimations = np.zeros((len(test)))

for index, row in enumerate(test.itertuples()):
    estimations[index] = algo.predict(row.user_id_hash, row.target_id_hash).est
    if (index % 100000) == 0 :
        print(index)

0
100000
200000
300000
400000
500000
600000
700000
800000
900000


In [7]:
test['Predicted'] = estimations

In [8]:
test[['Predicted']].to_csv('./submission.csv', index_label='Id')

## Surprise Model: SVD++

In [6]:
svd = SurpriseSVDPlusPlus(train, test)
svd.train()
svd.predictRating()

RMSE: 1.2774
MAE:  1.0178


1.0178469771834977

# Ensembling
Take all the former models and try to average all the scores in order to create a more robust model

In [20]:
import glob
df = None
for x in glob.glob('./**/submission.csv', recursive=True):
    if df is None:
        df = pd.read_csv(x)
    else:
        df = pd.merge(df, pd.read_csv(x), on = "Id")

feature_vector = df.drop(['Id'], axis='columns')

In [21]:
feature_vector['Predicted'] = feature_vector.mean(axis=1)

In [23]:
feature_vector['Predicted'].to_csv('./submission.csv', index_label='Id', %%writefileader=True)

# Per publisher

In [15]:
train_path = './data/test_file_v3/test_file.csv'
test_df  = pd.read_csv(train_path, usecols=['publisher_id_hash'])

publisher_848 =  '848b127dbd0f3a647581f4a95325f5204f4577ad7956f6ccaee6bf1eec0c399e6b02015827a95741ae041adc2438fbd494076ece7796a5f6cf4eab29e1079cf3'
publisher_c29 = 'c29a980222a2a97b74ffbd067f2a4ce760e3e8d8f024944fbd55d0ecbec178d4fc94c38d88a56a177a1c302f47ac1293dbbed50638455d50b1769daedbd33aaf'
publisher_f8a = 'f8a7ba9b7c9b05464ee98daac522b3f3d2376453c70e57de45b65e65a06b3b45fedd82ec18291bb4e432f3a172893424a81bab47dad1b4355b68552cda00583a'

In [17]:
best_df = pd.read_csv('./models/on_65_files_85819/for_submission.csv', header=None, names=['Predicted'])
pub_top = pd.read_csv('./models/top_pub/for_submission.csv', header=None, names=['Predicted'])
pub_f8a7b = pd.read_csv('./for_submission_f8a7b.csv', header=None, names=['Predicted'])
pub_c29 = pd.read_csv('./for_submission_c29a9.csv', header=None, names=['Predicted'])

0         False
1          True
2          True
3         False
4         False
          ...  
999523     True
999524    False
999525    False
999526    False
999527    False
Name: publisher_id_hash, Length: 999528, dtype: bool

In [18]:
best_df.loc[test_df['publisher_id_hash'] == publisher_f8a,:] = pub_f8a7b[test_df['publisher_id_hash'] == publisher_f8a].values

In [19]:
best_df.loc[test_df['publisher_id_hash'] == publisher_848,:] = pub_top[test_df['publisher_id_hash'] == publisher_848].values

In [20]:
best_df.to_csv('./submission.csv', index_label='Id')

In [4]:
test_df

Unnamed: 0,publisher_id_hash
0,c29a980222a2a97b74ffbd067f2a4ce760e3e8d8f02494...
1,f8a7ba9b7c9b05464ee98daac522b3f3d2376453c70e57...
2,f8a7ba9b7c9b05464ee98daac522b3f3d2376453c70e57...
3,c29a980222a2a97b74ffbd067f2a4ce760e3e8d8f02494...
4,848b127dbd0f3a647581f4a95325f5204f4577ad7956f6...
...,...
999523,f8a7ba9b7c9b05464ee98daac522b3f3d2376453c70e57...
999524,848b127dbd0f3a647581f4a95325f5204f4577ad7956f6...
999525,c29a980222a2a97b74ffbd067f2a4ce760e3e8d8f02494...
999526,c29a980222a2a97b74ffbd067f2a4ce760e3e8d8f02494...


In [1]:
import dask.dataframe as dd

In [34]:
category = 'BUSINESS~PERSONAL~FINANCE'
category[:category.rfind('~')]

'BUSINESS~PERSONAL'

In [9]:
from dask_ml.xgboost import XGBClassifier
from distributed import Client

client = Client()

train, test = df.random_split([0.8, 0.2])

train_labels = train['is_click']
test_labels = test['is_click']

del train['is_click']  # remove informative column from data
del test['is_click']  # remove informative column from data

est = XGBClassifier(num_class=2, verbose_eval=True, silent=False)
est.fit(train, train_labels)

prediction = est.predict(test)

Port 8787 is already in use. 
Perhaps you already have a cluster running?
Hosting the diagnostics dashboard on a random port instead.


ValueError: DataFrame.dtypes for data must be int, float or bool.
                Did not expect the data types in fields user_id_hash, target_id_hash, syndicator_id_hash, campaign_id_hash, target_item_taxonomy, placement_id_hash, publisher_id_hash, source_id_hash, source_item_type, browser_platform, country_code, region

In [7]:
np.multiply([0.12312, 0.2312, 0.431], 100)

array([12.312, 23.12 , 43.1  ])

In [5]:
import numpy as np
def bucket_ctr_feature(feature):
    buckets = np.linspace(0, 5, 100)
    

array([0.        , 0.05050505, 0.1010101 , 0.15151515, 0.2020202 ,
       0.25252525, 0.3030303 , 0.35353535, 0.4040404 , 0.45454545,
       0.50505051, 0.55555556, 0.60606061, 0.65656566, 0.70707071,
       0.75757576, 0.80808081, 0.85858586, 0.90909091, 0.95959596,
       1.01010101, 1.06060606, 1.11111111, 1.16161616, 1.21212121,
       1.26262626, 1.31313131, 1.36363636, 1.41414141, 1.46464646,
       1.51515152, 1.56565657, 1.61616162, 1.66666667, 1.71717172,
       1.76767677, 1.81818182, 1.86868687, 1.91919192, 1.96969697,
       2.02020202, 2.07070707, 2.12121212, 2.17171717, 2.22222222,
       2.27272727, 2.32323232, 2.37373737, 2.42424242, 2.47474747,
       2.52525253, 2.57575758, 2.62626263, 2.67676768, 2.72727273,
       2.77777778, 2.82828283, 2.87878788, 2.92929293, 2.97979798,
       3.03030303, 3.08080808, 3.13131313, 3.18181818, 3.23232323,
       3.28282828, 3.33333333, 3.38383838, 3.43434343, 3.48484848,
       3.53535354, 3.58585859, 3.63636364, 3.68686869, 3.73737

In [None]:
def create_files_for_publisher(publisher, how_many_files=None):
    dont_use = ['page_view_start_time', 'gmt_offset', 'publisher_id_hash']
    categories = [k for k, v in dtypes.items() if k not in dont_use]
    categories_index = dict(zip(categories, range(len(categories))))
    field_features = defaultdict()

    max_val = 1
    for index, file in enumerate(glob.glob('./data/train/*.csv')):
        main_libffm = 'train_%s.libffm' % publisher[:5]
        if index > 60:
            main_libffm = 'valid_%s.libffm' % publisher[:5]

        with open(main_libffm, 'a') as the_file:
            for t, row in enumerate(DictReader(open(file))):
                if t % 100000 == 0:
                    print(t, len(field_features), max_val)

                if row['publisher_id_hash'] !=  publisher:
                    continue

                label = [row['is_click']]
                ffeatures = []
                print(row)
                for field in categories:
                    if field == 'is_click':
                        continue
                    feature = row[field]

                    if feature == '':
                        feature = "unk"
                    if field not in num_cols:
                        ff = field + '_____' + feature
                    else:
                        if feature == "unk" or float(feature) == -1:
                            ff = field + '_____' + str(0)
                        else:
                            if field in too_many_vals:
                                ff = field + '_____' + str(int(round(math.log(1 + float(feature)))))
                            else:
                                ff = field + '_____' + str(int(round(float(feature))))
                    if ff not in field_features:
                        if len(field_features) == 0:
                            field_features[ff] = 1
                            max_val += 1
                        else:
                            field_features[ff] = max_val + 1
                            max_val += 1

                    fnum = field_features[ff]
                    ffeatures.append('{}:{}:1'.format(categories_index[field], fnum))

                
                line = label + ffeatures
                print(line)
                print('{}\n'.format(' '.join(line)))
                print(field_features)
                break
                the_file.write('{}\n'.format(' '.join(line)))
            break
        break

                
    train_path = './data/test_file_v3/test_file.csv'
    with open('test_%s.libffm' % publisher[:5], 'a') as the_file:
        for t, row in enumerate(DictReader(open(train_path))):
            if t % 100000 == 0:
                print(t, len(field_features), max_val)

            label = ['0']
            ffeatures = []

            for field in categories:
                if field == 'is_click':
                    continue
                feature = row[field]
                if feature == '':
                    feature = "unk"
                if field not in num_cols:
                    ff = field + '_____' + feature
                else:
                    if feature == "unk" or float(feature) == -1:
                        ff = field + '_____' + str(0)
                    else:
                        if field in too_many_vals:
                            ff = field + '_____' + str(int(round(math.log(1 + float(feature)))))
                        else:
                            ff = field + '_____' + str(int(round(float(feature))))
                if ff not in field_features:
                    if len(field_features) == 0:
                        field_features[ff] = 1
                        max_val += 1
                    else:
                        field_features[ff] = max_val + 1
                        max_val += 1

                fnum = field_features[ff]

                ffeatures.append('{}:{}:1'.format(categories_index[field], fnum))
            line = label + ffeatures
            the_file.write('{}\n'.format(' '.join(line)))