# Importing libraries and loading data

In [0]:
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import datetime
import os
import gc
import random
import dask.dataframe as dd
import sys
import pickle
sns.set()

In [0]:
#connecting to kaggle and importing datasets
from google.colab import files
files.upload()
!mkdir -p ~/.kaggle
!mv kaggle.json ~/.kaggle/
!chmod 600 /root/.kaggle/kaggle.json

!kaggle competitions download -c talkingdata-adtracking-fraud-detection
!unzip train.csv.zip

In [0]:
#loading train data as dask dataframe
dtypes = {
        'ip'            : 'uint32',
        'app'           : 'uint16',
        'device'        : 'uint16',
        'os'            : 'uint16',
        'channel'       : 'uint16',
        'click_id'      : 'uint32'
        }

train = dd.read_csv('mnt/ssd/kaggle-talkingdata2/competition_files/train.csv', dtype=dtypes, usecols=(['ip', 'app', 'device', 'os', 'channel']))

Want to use all of the is_attributed clicks when training dataset. Computing them from dd and pickling for later use. 

In [0]:
#creating is_attributed dataset
is_attributed=train[train['is_attributed']==1]
is_attributed=is_attributed.compute()

outfile=open('/gdrive/My Drive/kaggle/pickles/is_attributed', 'wb')
pickle.dump(is_attributed, outfile)
outfile.close()
gc.collect()

There are 456,846 is_attributed clicks,. For the negative outcome clicks, I'm going to randomly sample them from the training data. I'm going to
downsample the data to 2-1. Using a method to randomly sample the dask object that I picked up from Stack Overflow. This method will sample across partitions which is something I was struggling with earlier. When I would try to randomly sample, it would only sample within a partition which was a problem as the training dataset is ordered by date-time.

In [0]:
#getting random sample of dask rows
a, b = train.random_split([0.995, 0.005], random_state=123)
train_random=b.compute()

del a, b
gc.collect

not_attributed=train_random[train_random.is_attributed==0]
outfile=open('/gdrive/My Drive/kaggle/pickles/random_not_attributed', 'wb')
pickle.dump(not_attributed, outfile)
outfile.close()
gc.collect()

In [0]:
#creating training_dataset of is_attributed and not_attributed. Pickling for later use
train_df = is_attributed.append(not_attributed)
display(len(train_df))
display(train_df['is_attributed'].value_counts())
display(train_df.columns)

del is_attributed, not_attributed 
gc.collect()

outfile = open('/gdrive/My Drive/kaggle/pickles/train_comb', 'wb')
pickle.dump(train_df, outfile)
outfile.close()

Since the test dataset is day 10, hours 4, 5, 9, 10, 13, 14. We want the validation dataset to be day 9 (with the theory that day 9 will be similar to day 10 than the other days) and the same hours. Here we're subsetting the train data to create a validation dataset with these specifications

In [0]:
validation=train[(train.day==9) & (train.hour.isin([4, 5, 9, 10, 13, 14]))]
validation=validation.compute()

In [0]:
display(len(validation))
display(validation['hour'].value_counts())
display(validation.is_attributed.value_counts())
display(validation.head(5))

#creating smaller validation dataset, pickling for later use
val_is_attributed = validation[validation['is_attributed']==1]
val_sample = validation[validation['is_attributed']==0].sample(n=55000)
val_sample = val_sample.append(val_is_attributed)

outfile=open('/gdrive/My Drive/kaggle/pickles/val_data', 'wb')
pickle.dump(val_sample, outfile)
outfile.close()

20895641

4     4032691
5     3671741
13    3457523
14    3443283
10    3304199
9     2986204
Name: hour, dtype: int64

0    20843421
1       52220
Name: is_attributed, dtype: int64

Unnamed: 0,ip,app,device,os,channel,click_time,attributed_time,is_attributed,hour,day
333747,272385,35,1,18,21,2017-11-09 04:00:00,2017-11-09 12:57:38,1,4,9
333799,159422,12,1,13,178,2017-11-09 04:00:00,2017-11-09 04:10:23,1,4,9
334217,314192,15,1,40,315,2017-11-09 04:00:00,2017-11-09 04:00:28,1,4,9
334809,70921,35,1,19,21,2017-11-09 04:00:01,2017-11-09 04:39:22,1,4,9
336173,38950,19,0,21,213,2017-11-09 04:00:02,2017-11-09 07:45:21,1,4,9
