In [None]:
import datetime as dt
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

### Load Dataset

In [None]:
# Read the semicolon-separated training set into the working frame `df`.
df = pd.read_csv(
    'training_data.csv',
    sep=';',
)

### Basic Data Exploration
1. Find out the dimensions of the dataset.
2. Find out the number of missing values.
3. Find out the optimal strategy to deal with the missing values.

In [None]:
# Show the column labels of the loaded frame.
df.columns

In [None]:
# Show the frame's dimensions as (rows, columns).
df.shape

In [None]:
# Count how many rows carry each distinct value of the `install` column.
df.install.value_counts()

In [None]:
# Report the number of missing values in every column, one line per column.
print("NaN Counts")
missing_per_column = df.isna().sum()
for col_name, n_missing in missing_per_column.items():
    print("{}: {}".format(col_name, n_missing))

## Preprocess Data
### 1. Remove rows with NaNs or impute the data

In [None]:
# Keep only rows where both 'lastStart' and 'country' are present;
# rebinding `df` leaves the name pointing at the filtered frame.
df = df.dropna(subset=['lastStart', 'country'])
df.tail()

In [None]:
# Row counts per class before and after the NaN filter, as used in the
# percentage formulas below.
# NOTE(review): these are hard-coded snapshots (presumably copied from earlier
# value_counts() outputs) -- re-check them if the input data changes.
NON_INSTALL_ROWS_BEFORE, NON_INSTALL_ROWS_AFTER = 2180493, 2020048
INSTALL_ROWS_BEFORE, INSTALL_ROWS_AFTER = 26497, 23675


def _pct_removed(before, after):
    """Return the percentage of the `before` rows that are missing from `after`."""
    return 100.0 * (before - after) / before


print("Non-install row removed: {:.1f}%".format(
    _pct_removed(NON_INSTALL_ROWS_BEFORE, NON_INSTALL_ROWS_AFTER)))
# The '%' suffix was missing here, making the two lines inconsistent.
print("Install row removed: {:.1f}%".format(
    _pct_removed(INSTALL_ROWS_BEFORE, INSTALL_ROWS_AFTER)))

# Last expression of the cell so the post-filter class counts are displayed.
df['install'].value_counts()

This analysis shows that roughly similar percentages of the install and non-install rows (about 10.7% and 7.4%, respectively) are removed when filtering out NaN values in the `country` and `lastStart` fields, so dropping these rows does not substantially change the balance between the two classes. Imputing the missing values instead would require a careful analysis, which is best left for a later stage.

### 2. Use timestamp and lastStart to create timeSinceLastStart feature

In [None]:
def datetime_parser(datetime_str):
    """Convert a 'YYYY-MM-DDTHH:MM:SS...' string into a naive datetime.

    Only the first eight characters of the part after 'T' are used, so any
    trailing text (e.g. fractional seconds or a zone suffix) is ignored.

    Raises:
        ValueError: if the string does not split into exactly one date and one
            time part on 'T', if either part does not have exactly three
            fields, or if a field is not a valid integer / calendar value.
    """
    date_part, clock_part = datetime_str.split('T')
    year, month, day = date_part.split("-")
    hour, minute, second = clock_part[:8].split(":")

    fields = (year, month, day, hour, minute, second)
    return dt.datetime(*(int(field) for field in fields))

In [None]:
def time_diff_in_minutes(dt_0, dt_1):
    """Return the time from `dt_0` to `dt_1` in minutes, rounded to 0 decimals.

    The result is negative when `dt_1` is earlier than `dt_0`.
    """
    elapsed = dt_1 - dt_0
    minutes = elapsed.total_seconds() / 60.0
    return np.round(minutes, 0)

In [None]:
# Replace the raw strings in both time columns with parsed datetimes.
for ts_col in ('timestamp', 'lastStart'):
    df[ts_col] = df[ts_col].apply(datetime_parser)

In [None]:
# Minutes elapsed from `lastStart` to `timestamp`, rounded to whole minutes.
# Computed column-wise instead of with a row-wise df.apply(..., axis=1), which
# calls a Python function once per row; the arithmetic is the same as
# time_diff_in_minutes (total seconds / 60, rounded to 0 decimals).
# pd.to_timedelta normalizes the difference to a timedelta dtype so that the
# .dt accessor is available.
elapsed = pd.to_timedelta(df['timestamp'] - df['lastStart'])
df['timeSinceLastStart'] = (elapsed.dt.total_seconds() / 60.0).round(0)
df.info()
df.head()

### 3. Study Range Of Values For Each Feature

In [None]:
# Keep only the text before the first '.' of each softwareVersion value.
# The vectorized .str accessor replaces a row-wise apply and passes missing
# values through as NaN instead of raising on them.
df['softwareVersion'] = df['softwareVersion'].str.split('.').str[0]

In [None]:
# Categorical columns to inspect. 'softwareVersion' is listed once only, so
# its value counts are not printed twice.
categorical_nominal_feat = [
    'campaignId', 'sourceGameId', 'country', 'platform',
    'softwareVersion', 'connectionType', 'deviceType',
]
# Print the frequency of every distinct value, one section per column.
for feat in categorical_nominal_feat:
    print(feat)
    print("========")
    print(df[feat].value_counts())
    print("        ")

In [None]:
# Names of the columns treated as numerical in this notebook.
numerical_feat = [
    'startCount',
    'viewCount',
    'clickCount',
    'installCount',
    'startCount1d',
    'startCount7d',
    'install',
    'timeSinceLastStart',
]

### 4. Drop Columns

In [None]:
# DataFrame.drop returns a new frame unless inplace=True, so the result must
# be bound back to `df` for the columns to actually be removed.
df = df.drop(columns=['id', 'lastStart'])
df.head()

### 5. Convert timestamp to an ordinal

In [None]:
# Replace each timestamp with the integer returned by its toordinal() method.
df['timestamp'] = df['timestamp'].apply(lambda ts: ts.toordinal())

In [None]:
# DataFrame.reset_index returns a new frame unless inplace=True, so the result
# must be bound back to `df`; drop=True discards the old index rather than
# keeping it as a column.
df = df.reset_index(drop=True)
df.head()

In [None]:
# Final count of rows per distinct `install` value after preprocessing.
df.install.value_counts()