In [4]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import scale

# You can scale your data or normalize

In [5]:
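# "scaling" usually means min-max rescaling: conforming each feature to a range from 0 to 1
# however sklearn's scale() actually STANDARDIZES each feature (zero mean, unit variance);
# that is less sensitive to outliers than min-max scaling, but still affected by them
# (RobustScaler is the outlier-robust option)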
# Read the daily-activities / happiness table from the working directory.
HAPPINESS_CSV = 'daily_activities_and_happiness.csv'
df = pd.read_csv(filepath_or_buffer=HAPPINESS_CSV)

In [6]:
# Preview the first rows of the loaded CSV (minutes per activity plus happiness_rating).
df.head()

Unnamed: 0,daily_minutes_hobby,daily_minutes_exercise,daily_minutes_grooming,daily_minutes_commuting,daily_minutes_tv,daily_minutes_talking_to_friend,happiness_rating
0,19,16,14,20,95,22,2
1,17,23,8,64,72,14,1
2,20,21,18,53,30,18,2
3,23,21,20,65,22,15,2
4,28,7,19,38,36,14,2


In [10]:
# IPython-only introspection: a trailing `??` shows the source/docstring of sklearn's scale().
# NOTE(review): this is not valid plain-Python syntax; drop it before sharing the notebook.
scale??

# Think about extracting data from irregular
# and heterogeneous time series

In [23]:
from numpy.random import poisson, randint, choice

In [18]:
# Draw 10 samples from a Poisson distribution with mean (lam) 5.
# No seed is set in the visible cells, so the values differ from run to run.
poisson(lam = 5, size = 10)

array([7, 4, 4, 6, 3, 5, 4, 3, 3, 3])

In [36]:
# Simulate irregular usage logs: each user reports on a random subset of days
# within a shared 10-day window, with a Poisson-distributed value per report.
N_USERS = 10
N_DAYS = 10
SEED = 42

# Dedicated, seeded generator so that Restart & Run All reproduces the same
# table, without touching numpy's global random state.
rng = np.random.RandomState(SEED)

# The candidate calendar is the same for every user, so build it once
# instead of on every loop iteration.
dates = pd.date_range('2017-06-11', periods=N_DAYS, freq='D')
possible_indices = list(range(N_DAYS))

user_dates = []
user_vals = []
for _ in range(N_USERS):
    # randint's upper bound is exclusive: each user logs on 1 .. N_DAYS - 1 days.
    num_indices = rng.randint(low=1, high=N_DAYS)
    # Sample distinct day positions, then sort so each user's dates are chronological.
    indices = rng.choice(a=possible_indices, size=num_indices, replace=False)
    use_dates = dates[np.sort(indices)]
    use_vals = rng.poisson(lam=10, size=num_indices)
    user_dates.append(use_dates)
    user_vals.append(use_vals)

# One row per user; each cell holds that user's whole (ragged) series.
df = pd.DataFrame({'dates': user_dates,
                   'vals': user_vals})

In [37]:
# Display the simulated frame: one row per user, holding that user's dates and values.
df

Unnamed: 0,dates,vals
0,"DatetimeIndex(['2017-06-12', '2017-06-14', '20...","[6, 10, 7, 7, 5]"
1,"DatetimeIndex(['2017-06-18'], dtype='datetime6...",[11]
2,"DatetimeIndex(['2017-06-12', '2017-06-13', '20...","[5, 12, 10, 10, 10, 10, 12, 10, 6]"
3,"DatetimeIndex(['2017-06-11', '2017-06-12', '20...","[14, 8, 5, 4, 10, 7, 12, 14, 6]"
4,"DatetimeIndex(['2017-06-11', '2017-06-12', '20...","[11, 7, 13, 12, 6, 11, 14, 14, 10]"
5,"DatetimeIndex(['2017-06-11', '2017-06-13', '20...","[19, 6, 15, 15, 4]"
6,"DatetimeIndex(['2017-06-11', '2017-06-12', '20...","[6, 16, 4, 11, 14, 6, 7, 9]"
7,"DatetimeIndex(['2017-06-15', '2017-06-18'], dt...","[13, 10]"
8,"DatetimeIndex(['2017-06-11', '2017-06-12', '20...","[12, 11, 13, 10, 10, 9, 9, 11, 11]"
9,"DatetimeIndex(['2017-06-13', '2017-06-15', '20...","[14, 10, 10, 14]"


In [40]:
# Look at the dates logged by the user in row 2 (single row/column lookup).
df.loc[2, 'dates']

DatetimeIndex(['2017-06-12', '2017-06-13', '2017-06-14', '2017-06-15',
               '2017-06-16', '2017-06-17', '2017-06-18', '2017-06-19',
               '2017-06-20'],
              dtype='datetime64[ns]', freq=None)

# Think about how you might characterize these individuals

In [41]:
# High-use vs. low-use individuals: count how many values each user logged.
df['usage_val'] = df['vals'].map(len)

In [42]:
# Preview the frame with the new usage_val column.
df.head()

Unnamed: 0,dates,vals,usage_val
0,"DatetimeIndex(['2017-06-12', '2017-06-14', '20...","[6, 10, 7, 7, 5]",5
1,"DatetimeIndex(['2017-06-18'], dtype='datetime6...",[11],1
2,"DatetimeIndex(['2017-06-12', '2017-06-13', '20...","[5, 12, 10, 10, 10, 10, 12, 10, 6]",9
3,"DatetimeIndex(['2017-06-11', '2017-06-12', '20...","[14, 8, 5, 4, 10, 7, 12, 14, 6]",9
4,"DatetimeIndex(['2017-06-11', '2017-06-12', '20...","[11, 7, 13, 12, 6, 11, 14, 14, 10]",9


In [43]:
# 'User lifetime': elapsed time between a user's first and last logged date.
def _first_to_last(stamps):
    """Return the span between the earliest and latest timestamp in `stamps`."""
    return max(stamps) - min(stamps)


df['user_lifetime'] = df['dates'].apply(_first_to_last)

In [44]:
# Preview the frame with the new user_lifetime column.
df.head()

Unnamed: 0,dates,vals,usage_val,user_lifetime
0,"DatetimeIndex(['2017-06-12', '2017-06-14', '20...","[6, 10, 7, 7, 5]",5,6 days
1,"DatetimeIndex(['2017-06-18'], dtype='datetime6...",[11],1,0 days
2,"DatetimeIndex(['2017-06-12', '2017-06-13', '20...","[5, 12, 10, 10, 10, 10, 12, 10, 6]",9,8 days
3,"DatetimeIndex(['2017-06-11', '2017-06-12', '20...","[14, 8, 5, 4, 10, 7, 12, 14, 6]",9,9 days
4,"DatetimeIndex(['2017-06-11', '2017-06-12', '20...","[11, 7, 13, 12, 6, 11, 14, 14, 10]",9,9 days


In [46]:
# Spread of each user's inputs: largest logged value minus the smallest.
df['range_values'] = df['vals'].apply(lambda logged: max(logged) - min(logged))

In [47]:
# Preview the frame with the new range_values column.
df.head()

Unnamed: 0,dates,vals,usage_val,user_lifetime,range_values
0,"DatetimeIndex(['2017-06-12', '2017-06-14', '20...","[6, 10, 7, 7, 5]",5,6 days,5
1,"DatetimeIndex(['2017-06-18'], dtype='datetime6...",[11],1,0 days,0
2,"DatetimeIndex(['2017-06-12', '2017-06-13', '20...","[5, 12, 10, 10, 10, 10, 12, 10, 6]",9,8 days,7
3,"DatetimeIndex(['2017-06-11', '2017-06-12', '20...","[14, 8, 5, 4, 10, 7, 12, 14, 6]",9,9 days,10
4,"DatetimeIndex(['2017-06-11', '2017-06-12', '20...","[11, 7, 13, 12, 6, 11, 14, 14, 10]",9,9 days,8


In [59]:
# Identify users who provided input on a certain day
# (maybe it's revealing that they did log something on Father's Day).
# The simulated data covers June 2017, when Father's Day fell on 2017-06-18.
FATHERS_DAY = pd.Timestamp('2017-06-18')

# Compare full calendar dates (year-month-day), not just the day-of-month:
# matching on `.day` alone would also flag the 18th of any other month or year.
# normalize() drops any time-of-day component before the comparison.
df.dates.apply(lambda x: bool((x.normalize() == FATHERS_DAY).any()))


0     True
1     True
2     True
3     True
4     True
5    False
6     True
7     True
8     True
9     True
Name: dates, dtype: bool