# Base table creation
# Setup

In [1]:
# dependencies
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
%matplotlib inline 

import seaborn as sns

import missingno as msno

# Get data

In [2]:
df = pd.read_csv('data/clean_table.csv')

# Data Exploration
## Standard analysis

In [3]:
df.shape

(3386, 8)

In [4]:
df.dtypes

Unnamed: 0              int64
activity               object
category               object
start_date[ms]          int64
start_date             object
end_date[ms]            int64
end_date               object
activityDuration[m]     int64
dtype: object

In [5]:
df.head(5)

Unnamed: 0.1,Unnamed: 0,activity,category,start_date[ms],start_date,end_date[ms],end_date,activityDuration[m]
0,0,Trello,Personal Adjusting,1540159273005,Mon Oct 22 00:01:13 GMT+02:00 2018,1540159869559,Mon Oct 22 00:11:09 GMT+02:00 2018,9
1,1,Series / Docu,Entertainment,1540159869559,Mon Oct 22 00:11:09 GMT+02:00 2018,1540162820068,Mon Oct 22 01:00:20 GMT+02:00 2018,49
2,2,Sleep,Refresh,1540162820068,Mon Oct 22 01:00:20 GMT+02:00 2018,1540189458018,Mon Oct 22 08:24:18 GMT+02:00 2018,443
3,3,Moving - youtube,Transport,1540189458018,Mon Oct 22 08:24:18 GMT+02:00 2018,1540189949037,Mon Oct 22 08:32:29 GMT+02:00 2018,8
4,4,Trello,Personal Adjusting,1540189949037,Mon Oct 22 08:32:29 GMT+02:00 2018,1540190444165,Mon Oct 22 08:40:44 GMT+02:00 2018,8


# Base table creation
Goal is to create a cleaned data, selected features and augmented table.

## Feature engineering 

Let's take only relevant features into our base table:
- activity, 
- category, 
- start_date_timestamp, 
- start_date,
- end_date_timestamp,
- end_date,
- activity_duration[m]   # minutes

In [6]:
base_table = pd.DataFrame()

Lets transform activitiy duration in minutes. Because minutes is a more relevant metric than milliseconds for our activitiy analysis.

In [7]:
base_table.dtypes

Series([], dtype: object)

In [8]:
base_table.head()

## Generating Insights

### Displaying unique categories

In [9]:
categories = base_table['category'].unique()
for category in categories:
    print(category)

KeyError: 'category'

### Displaying unique activities

In [None]:
activities = base_table['activity'].unique()
for activity in activities:
    print(activity)

### Distributions of numerical features

Lets plot a histogram of activity duration (numeric feature) to see the distribution of time spend on all activities. 

In [None]:
def get_most_asctivity_per_category(series):
    series_by_category = base_table.groupby('category')
    return series_by_category.count().sort_values('activity'
                                                  ,ascending=False) 

In [None]:
get_most_asctivity_per_category(series)

In [None]:
series = base_table['activityDuration[m]']

In [None]:
series.describe()

In [None]:
minutes_to_inspect = 60
bins = int(minutes_to_inspect / 1) # try out values from 1 to 30
bins_in_minutes = minutes_to_inspect / bins
inspection_range = (0,minutes_to_inspect)
print("Observing {} minutes - bin size: {} minutes".format(
    minutes_to_inspect,bins_in_minutes))

# Plottting
print("")
axes = series.hist(xrot=-45, figsize=(8,8),
                                       bins=bins, range=inspection_range)
axes.set_xlabel("minutes")
axes.set_ylabel("activity occurences")
plt.show()

TOP 5 for minutes of activity (minutes | occurence of activity) 

In [None]:
series.value_counts()[:5]

# Save clean table

In [None]:
base_table.to_csv('data/base_table.csv')