In [None]:
# You need the pickled `inst_h_d` file and `instagram.csv` to fully use this notebook.

In [None]:
import os
import pickle
import pprint

import matplotlib.pyplot as plt
import scikit_posthocs as sp
import seaborn as sns
from pandas.plotting import register_matplotlib_converters
from scipy.stats import kruskal, spearmanr
from sklearn.pipeline import Pipeline

from load_inst import *
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Register pandas' date/time converters with matplotlib.
register_matplotlib_converters()
# Several cells below open one figure per loop iteration; 0 disables the
# "too many open figures" warning.
plt.rcParams.update({'figure.max_open_warning': 0})
# Weekday names, Monday first; passed as `my_key` to reindex_ph in the day-of-week tests.
days = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']

# Basic analysis

In [None]:
# Use this line if you want to load the whole data set.
instagram = load_instagram()

In [None]:
# Preview the first five rows of the loaded data.
instagram.head(5)

In [None]:
# Number of distinct values in the ID column.
len(instagram.ID.unique())

In [None]:
# Likes over time, one coloured line per ID.
fig, ax = plt.subplots(figsize=(30, 20))
sns.lineplot(data=instagram, x='Time', y='Likes', hue='ID', ax=ax);

In [None]:
# Building the time-shifted frame takes a lot of time, so it is only done on demand:
# set RECOMPUTE_INST_H_D to True after updating the data or when different time
# shifts are wanted. The frame is also built when no cached 'inst_h_d' pickle exists
# yet, so that the loading cell that follows works on a fresh checkout.
RECOMPUTE_INST_H_D = False
if RECOMPUTE_INST_H_D or not os.path.exists('inst_h_d'):
    get_rows = rows_choose(days=range(1,31))
    inst_h_d = get_rows.transform(instagram)
    # Cache the result so later runs can skip the expensive transform.
    with open('inst_h_d', 'wb') as save_pickle:
        pickle.dump(inst_h_d, save_pickle)

In [None]:
# Load the data frame computed with the default time shifts.
# NOTE: pickle.load executes arbitrary code - only load an 'inst_h_d' file you created yourself.
with open('inst_h_d', 'rb') as cached_file:
    inst_h_d = pickle.load(cached_file)

In [None]:
# Preview the rows stored under the '1h' index label.
inst_h_d.loc['1h'].head(5)

In [None]:
# Hours passed to hours_interval as the boundaries between the time intervals.
dividing_point = (4, 7, 10, 16)
# Default pipeline: it adds two columns - the hour of first appearance as an integer
# (the time is first rounded to a full hour) and the interval of the time of first
# appearance.
inst_h_d_pipe = Pipeline([('hours_to_int', get_hours()),
                          ('hours_intervals', hours_interval(dividing_points=dividing_point))])
inst_h_d_to_use = inst_h_d_pipe.fit_transform(inst_h_d)
# Preview only the first rows (as the other preview cells do) instead of dumping the whole frame.
inst_h_d_to_use.loc['1h'].head(5)

## Plots

In [None]:
# Category order for the x axis of the interval plots.
my_order = create_time_intervals(dividing_point)

# Index labels ('1h' ... '23h', '1d' ... '30d') and the matching human-readable
# names used in the plot titles.
time_shift_h_list = ['%sh' % hour for hour in range(1, 24)]
hours_names = ['1 hour'] + ['%s hours' % hour for hour in range(2, 24)]
time_shift_d_list = ['%sd' % day for day in range(1, 31)]
days_names = ['1 day'] + ['%s days' % day for day in range(2, 31)]

In [None]:
# One figure per hourly time shift: box plot of likes per time interval with the
# individual observations overlaid as black points.
for time_shift, m_hours in zip(time_shift_h_list, hours_names):
    plt.figure(figsize=(10, 15))
    shift_frame = inst_h_d_to_use.loc[time_shift]
    sns.boxplot(x="Time_intervals", y="Likes", data=shift_frame, order=my_order)
    points_ax = sns.stripplot(x="Time_intervals", y="Likes", data=shift_frame,
                              jitter=True, color='black', order=my_order)
    points_ax.set(title='Likes after ' + m_hours, xlabel='Time intervals')
        

In [None]:
# Likes against the integer hour column for three hourly time shifts,
# each with its own colour and regression line, drawn on one figure.
plt.figure(figsize=(15, 10))
for shift, colour in (('1h', 'black'), ('6h', 'red'), ('12h', 'blue')):
    sns.regplot(x='Int_hour', y='Likes', data=inst_h_d_to_use.loc[shift], color=colour,
                label=shift, x_jitter=0.2, fit_reg=True)
plt.legend();

In [None]:
# One figure per daily time shift: box plot of likes per time interval with the
# individual observations overlaid as black points.
for time_shift, m_day in zip(time_shift_d_list, days_names):
    plt.figure(figsize=(10, 15))
    shift_frame = inst_h_d_to_use.loc[time_shift]
    sns.boxplot(x="Time_intervals", y="Likes", data=shift_frame, order=my_order)
    points_ax = sns.stripplot(x="Time_intervals", y="Likes", data=shift_frame,
                              jitter=True, color='black', order=my_order)
    points_ax.set(title='Likes after ' + m_day, xlabel='Time intervals')

In [None]:
# Likes against the integer hour column for three daily time shifts,
# each with its own colour and regression line, drawn on one figure.
plt.figure(figsize=(15, 10))
for shift, colour in (('1d', 'black'), ('6d', 'red'), ('12d', 'blue')):
    sns.regplot(x='Int_hour', y='Likes', data=inst_h_d_to_use.loc[shift], color=colour,
                label=shift, x_jitter=0.2, fit_reg=True)
plt.legend();

## Statistics

In [None]:
# Descriptive statistics of Likes per time interval for the '1h' shift.
(
    inst_h_d_to_use.loc['1h'][['Likes', 'Time_intervals']]
    .groupby(['Time_intervals'])
    .describe()
)

In [None]:
# Descriptive statistics of Likes per time interval for the '6h' shift.
(
    inst_h_d_to_use.loc['6h'][['Likes', 'Time_intervals']]
    .groupby(['Time_intervals'])
    .describe()
)

In [None]:
# Descriptive statistics of Likes per time interval for the '12h' shift.
(
    inst_h_d_to_use.loc['12h'][['Likes', 'Time_intervals']]
    .groupby(['Time_intervals'])
    .describe()
)

In [None]:
# Descriptive statistics of Likes per time interval for the '1d' shift.
(
    inst_h_d_to_use.loc['1d'][['Likes', 'Time_intervals']]
    .groupby(['Time_intervals'])
    .describe()
)

In [None]:
# Descriptive statistics of Likes per time interval for the '6d' shift.
(
    inst_h_d_to_use.loc['6d'][['Likes', 'Time_intervals']]
    .groupby(['Time_intervals'])
    .describe()
)

In [None]:
# Descriptive statistics of Likes per time interval for the '12d' shift.
(
    inst_h_d_to_use.loc['12d'][['Likes', 'Time_intervals']]
    .groupby(['Time_intervals'])
    .describe()
)

### Correlation

In [None]:
# Correlation (Series.corr with its default method) between Likes and Int_hour
# for a selection of time shifts.
ind_names = ('1h', '6h', '12h', '1d', '6d', '12d')
corr_dict = {}
for ind in ind_names:
    shift_frame = inst_h_d_to_use.loc[ind]
    corr_dict['%s' % ind] = shift_frame['Likes'].corr(shift_frame['Int_hour'])
for name, corr in corr_dict.items():
    print('%s: %s\n' % (name, corr))

In [None]:
# Spearman rank correlation between Likes and Int_hour for the same time shifts.
corr_dict2 = {}
for ind in ind_names:
    shift_frame = inst_h_d_to_use.loc[ind]
    corr_dict2['%s' % ind] = spearmanr(shift_frame['Likes'], shift_frame['Int_hour'])
for name, corr in corr_dict2.items():
    print('%s: %s\n' % (name, corr.correlation))

## Tests

In [None]:
# Kruskal-Wallis H-test of Likes across the time intervals for the '1h' shift.
kruskal(*[
    interval_rows["Likes"].values
    for _, interval_rows in inst_h_d_to_use.loc['1h'][['Likes', 'Time_intervals']].groupby(['Time_intervals'])
])

In [None]:
# Kruskal-Wallis H-test of Likes across the time intervals for the '6h' shift.
kruskal(*[
    interval_rows["Likes"].values
    for _, interval_rows in inst_h_d_to_use.loc['6h'][['Likes', 'Time_intervals']].groupby(['Time_intervals'])
])

In [None]:
# Kruskal-Wallis H-test of Likes across the time intervals for the '12h' shift.
kruskal(*[
    interval_rows["Likes"].values
    for _, interval_rows in inst_h_d_to_use.loc['12h'][['Likes', 'Time_intervals']].groupby(['Time_intervals'])
])

In [None]:
# Dunn's post-hoc test (Holm-adjusted p-values) between the time intervals for the '12h' shift.
sp.posthoc_dunn(
    inst_h_d_to_use.loc['12h'][['Likes', 'Time_intervals']],
    val_col='Likes',
    group_col='Time_intervals',
    p_adjust='holm',
)

In [None]:
# Kruskal-Wallis H-test of Likes across the time intervals for the '1d' shift.
kruskal(*[
    interval_rows["Likes"].values
    for _, interval_rows in inst_h_d_to_use.loc['1d'][['Likes', 'Time_intervals']].groupby(['Time_intervals'])
])

In [None]:
# Kruskal-Wallis H-test of Likes across the time intervals for the '6d' shift.
kruskal(*[
    interval_rows["Likes"].values
    for _, interval_rows in inst_h_d_to_use.loc['6d'][['Likes', 'Time_intervals']].groupby(['Time_intervals'])
])

In [None]:
# Kruskal-Wallis H-test of Likes across the time intervals for the '12d' shift.
kruskal(*[
    interval_rows["Likes"].values
    for _, interval_rows in inst_h_d_to_use.loc['12d'][['Likes', 'Time_intervals']].groupby(['Time_intervals'])
])

In [None]:
# NOTE(review): this cell repeats the '12d' Kruskal-Wallis test of the preceding cell verbatim.
# Possibly a post-hoc test was intended here (as is done after the '12h' test) - confirm.
kruskal(*[group["Likes"].values for _, group in inst_h_d_to_use.loc['12d'][['Likes', 'Time_intervals']].groupby(['Time_intervals'])])

## Number of likes in time intervals

### Each photo as single observation

In [None]:
# Pipeline producing the per-photo Diff_likes frame: a select_by_time step with a
# '3h' frequency, then two hours_interval steps with dividing points every 3 hours -
# one on the 'Time' column and one on 'First_app' (stored as 'First_app_int').
diff_steps = [
    ('diff_maker', select_by_time(time_freq='3h')),
    ('add_intervals', hours_interval(dividing_points=range(0, 25, 3),
                                     low_int=True,
                                     int_time_col='Time',
                                     full_day=True)),
    ('first_app_int', hours_interval(dividing_points=range(0, 25, 3),
                                     low_int=False,
                                     int_time_col='First_app',
                                     full_day=True,
                                     new_col_name='First_app_int')),
]
diff_pipe = Pipeline(diff_steps)
inst_diff_likes = diff_pipe.transform(instagram)

In [None]:
# Keep only the rows whose Diff_likes is non-zero.
nonzero_diff = inst_diff_likes.Diff_likes != 0
inst_no0_diff_likes = inst_diff_likes.loc[nonzero_diff]

#### Plots

In [None]:
# Hue order for the strip plots of this section, built from the same
# dividing points (every 3 hours) as the hours_interval steps of diff_pipe.
hue_order_time_int = create_time_intervals(range(0,25,3))

In [None]:
# Diff_likes per time interval: box plot (outliers hidden) overlaid with the
# individual observations coloured by the First_app_int column.
shared_axes = dict(x="Time_intervals", y="Diff_likes", data=inst_diff_likes)
plt.figure(figsize=(15, 15))
sns.boxplot(showfliers=False, **shared_axes)
sns.stripplot(jitter=True, hue='First_app_int', hue_order=hue_order_time_int, **shared_axes)
plt.legend(title='Time interval of upload');

In [None]:
# Same plot as for the full frame, restricted to rows with non-zero Diff_likes.
shared_axes = dict(x="Time_intervals", y="Diff_likes", data=inst_no0_diff_likes)
plt.figure(figsize=(15, 15))
sns.boxplot(showfliers=False, **shared_axes)
sns.stripplot(jitter=True, hue='First_app_int', hue_order=hue_order_time_int, **shared_axes)
plt.legend(title='Time interval of upload');

In [None]:
# Time intervals present in the non-zero frame; reused by the histogram loops further down.
time_int_hist = inst_no0_diff_likes.Time_intervals.unique()
# One distribution plot of Diff_likes per time interval (all rows, zeros included).
for time_int in time_int_hist:
    plt.figure(figsize=(10, 5))
    in_interval = inst_diff_likes.Time_intervals == time_int
    hist_ax = sns.distplot(inst_diff_likes.loc[in_interval]['Diff_likes'], color="black")
    hist_ax.set_title(time_int)

In [None]:
# One distribution plot of Diff_likes per time interval (rows with non-zero Diff_likes only).
for time_int in time_int_hist:
    plt.figure(figsize=(10, 5))
    in_interval = inst_no0_diff_likes.Time_intervals == time_int
    hist_ax = sns.distplot(inst_no0_diff_likes.loc[in_interval]['Diff_likes'], color="black")
    hist_ax.set_title(time_int)

#### Statistics

In [None]:
# Descriptive statistics of Diff_likes per time interval (groups kept in order of appearance).
(
    inst_diff_likes[['Diff_likes', 'Time_intervals']]
    .groupby(['Time_intervals'], sort=False)
    .describe()
)

In [None]:
# Descriptive statistics of non-zero Diff_likes per time interval (groups kept in order of appearance).
(
    inst_no0_diff_likes[['Diff_likes', 'Time_intervals']]
    .groupby(['Time_intervals'], sort=False)
    .describe()
)

#### Tests

In [None]:
# Kruskal-Wallis H-test of Diff_likes across the time intervals.
kruskal(*[
    interval_rows["Diff_likes"].values
    for _, interval_rows in inst_diff_likes[['Diff_likes', 'Time_intervals']].groupby(['Time_intervals'])
])

In [None]:
# Dunn's post-hoc test with Holm-Sidak adjustment; any -1 entries in the result are replaced by 1.
time_int_photo_ph = (
    sp.posthoc_dunn(
        inst_diff_likes[['Diff_likes', 'Time_intervals']],
        val_col='Diff_likes',
        group_col='Time_intervals',
        p_adjust='holm-sidak',
    )
    .replace(-1, 1)
)
reindex_ph(time_int_photo_ph).style.background_gradient(cmap='coolwarm_r')

In [None]:
# Kruskal-Wallis H-test of non-zero Diff_likes across the time intervals.
kruskal(*[
    interval_rows["Diff_likes"].values
    for _, interval_rows in inst_no0_diff_likes[['Diff_likes', 'Time_intervals']].groupby(['Time_intervals'])
])

In [None]:
# Dunn's post-hoc test with Holm-Sidak adjustment on the non-zero frame;
# any -1 entries in the result are replaced by 1.
time_int_photo_ph_no0 = (
    sp.posthoc_dunn(
        inst_no0_diff_likes[['Diff_likes', 'Time_intervals']],
        val_col='Diff_likes',
        group_col='Time_intervals',
        p_adjust='holm-sidak',
    )
    .replace(-1, 1)
)
reindex_ph(time_int_photo_ph_no0).style.background_gradient(cmap='coolwarm_r')

### Each day as single observation

In [None]:
# Pipeline for the "each day as single observation" view: select_by_time ('3h'),
# hours_interval on the 'Time' column, then group_by_days.
diff_group_steps = [
    ('diff_maker', select_by_time(time_freq='3h')),
    ('add_intervals', hours_interval(dividing_points=range(0, 25, 3),
                                     low_int=True,
                                     int_time_col='Time',
                                     full_day=True)),
    ('group_int', group_by_days()),
]
diff_group_pipe = Pipeline(diff_group_steps)
inst_int_group_likes = diff_group_pipe.transform(instagram)
# Variant without the rows whose Diff_likes is zero.
nonzero_group_diff = inst_int_group_likes['Diff_likes'] != 0
inst_int_no0_group_likes = inst_int_group_likes.loc[nonzero_group_diff]

#### Plots

In [None]:
# Diff_likes per time interval: box plot (outliers hidden) with the observations as black points.
fig, ax = plt.subplots(figsize=(15, 15))
sns.boxplot(x="Time_intervals", y="Diff_likes", data=inst_int_group_likes, showfliers=False, ax=ax)
sns.stripplot(x="Time_intervals", y="Diff_likes", data=inst_int_group_likes, jitter=True,
              color='black', ax=ax);

In [None]:
# Same plot restricted to rows with non-zero Diff_likes.
fig, ax = plt.subplots(figsize=(15, 15))
sns.boxplot(x="Time_intervals", y="Diff_likes", data=inst_int_no0_group_likes, showfliers=False, ax=ax)
sns.stripplot(x="Time_intervals", y="Diff_likes", data=inst_int_no0_group_likes, jitter=True,
              color='black', ax=ax);

In [None]:
# One distribution plot of Diff_likes per time interval for the grouped frame.
for time_int in time_int_hist:
    plt.figure(figsize=(10, 5))
    in_interval = inst_int_group_likes.Time_intervals == time_int
    hist_ax = sns.distplot(inst_int_group_likes.loc[in_interval]['Diff_likes'], color="black")
    hist_ax.set_title(time_int)

In [None]:
# One distribution plot of Diff_likes per time interval for the grouped frame without zeros.
for time_int in time_int_hist:
    plt.figure(figsize=(10, 5))
    in_interval = inst_int_no0_group_likes.Time_intervals == time_int
    hist_ax = sns.distplot(inst_int_no0_group_likes.loc[in_interval]['Diff_likes'], color="black")
    hist_ax.set_title(time_int)

#### Statistics

In [None]:
# Descriptive statistics per time interval for the grouped frame.
(
    inst_int_group_likes
    .groupby('Time_intervals', sort=False)
    .describe()
)

In [None]:
# Descriptive statistics per time interval for the grouped frame without zeros.
(
    inst_int_no0_group_likes
    .groupby('Time_intervals', sort=False)
    .describe()
)

#### Tests

In [None]:
# Kruskal-Wallis H-test of Diff_likes across the time intervals (grouped frame).
kruskal(*[
    interval_rows["Diff_likes"].values
    for _, interval_rows in inst_int_group_likes[['Diff_likes', 'Time_intervals']].groupby(['Time_intervals'])
])

In [None]:
# Dunn's post-hoc test with Holm-Sidak adjustment on the grouped frame;
# any -1 entries in the result are replaced by 1.
time_int_day_ph = (
    sp.posthoc_dunn(
        inst_int_group_likes[['Diff_likes', 'Time_intervals']],
        val_col='Diff_likes',
        group_col='Time_intervals',
        p_adjust='holm-sidak',
    )
    .replace(-1, 1)
)
reindex_ph(time_int_day_ph).style.background_gradient(cmap='coolwarm_r')

In [None]:
# Kruskal-Wallis H-test of non-zero Diff_likes across the time intervals (grouped frame).
kruskal(*[
    interval_rows["Diff_likes"].values
    for _, interval_rows in inst_int_no0_group_likes[['Diff_likes', 'Time_intervals']].groupby(['Time_intervals'])
])

In [None]:
# Dunn's post-hoc test with Holm-Sidak adjustment on the grouped frame without zeros;
# any -1 entries in the result are replaced by 1.
time_int_day_ph_no0 = (
    sp.posthoc_dunn(
        inst_int_no0_group_likes[['Diff_likes', 'Time_intervals']],
        val_col='Diff_likes',
        group_col='Time_intervals',
        p_adjust='holm-sidak',
    )
    .replace(-1, 1)
)
reindex_ph(time_int_day_ph_no0).style.background_gradient(cmap='coolwarm_r')

# Days of week

### Each photo as single observation

In [None]:
# Per-photo day-of-week pipeline: select_by_time with a '1D' frequency, then two
# get_day_of_week steps - one with the default columns and one on 'First_app'
# writing to 'Day_of_first_app' / 'Day_of_first_app_num'.
day_of_week_steps = [
    ('Time_select', select_by_time(time_freq='1D')),
    ('Day_of_week', get_day_of_week(prev_day=True, sort_by_num=True)),
    ('Day_of_week_f_app', get_day_of_week(prev_day=False,
                                          time_col='First_app',
                                          day_col_name='Day_of_first_app',
                                          day_col_num='Day_of_first_app_num',
                                          sort_by_num=False)),
]
time_days_likes_diff_day_group = Pipeline(day_of_week_steps)
photo_day_diff_likes = time_days_likes_diff_day_group.transform(instagram)

In [None]:
# Rows with non-zero Diff_likes ...
nonzero_day_diff = photo_day_diff_likes.Diff_likes != 0
photo_day_diff_likes_no0 = photo_day_diff_likes.loc[nonzero_day_diff]
# ... and, of those, the rows below the hard-coded cutoff of 80 (the "no_out" variant).
below_cutoff = photo_day_diff_likes_no0.Diff_likes < 80
photo_day_diff_likes_no_out = photo_day_diff_likes_no0[below_cutoff]

In [None]:
# Hue order for the strip plots of this section: the Day_of_week values in order of appearance.
hue_order_days = photo_day_diff_likes.Day_of_week.unique()

In [None]:
# Diff_likes per day of week: box plot (outliers hidden) overlaid with the
# individual observations coloured by the Day_of_first_app column.
shared_axes = dict(x="Day_of_week", y="Diff_likes", data=photo_day_diff_likes)
plt.figure(figsize=(15, 15))
sns.boxplot(showfliers=False, **shared_axes)
sns.stripplot(jitter=True, hue='Day_of_first_app', hue_order=hue_order_days, **shared_axes)
plt.legend(title='Day of upload');

In [None]:
# Same plot restricted to rows with non-zero Diff_likes.
shared_axes = dict(x="Day_of_week", y="Diff_likes", data=photo_day_diff_likes_no0)
plt.figure(figsize=(15, 15))
sns.boxplot(showfliers=False, **shared_axes)
sns.stripplot(jitter=True, hue='Day_of_first_app', hue_order=hue_order_days, **shared_axes)
plt.legend(title='Day of upload');

In [None]:
# Same plot for the "no_out" frame (non-zero Diff_likes below the cutoff).
shared_axes = dict(x="Day_of_week", y="Diff_likes", data=photo_day_diff_likes_no_out)
plt.figure(figsize=(15, 15))
sns.boxplot(showfliers=False, **shared_axes)
sns.stripplot(jitter=True, hue='Day_of_first_app', hue_order=hue_order_days, **shared_axes)
plt.legend(title='Day of upload');

In [None]:
# Day_of_week values (in order of appearance) iterated over by the histogram loops of this section.
week_days = photo_day_diff_likes['Day_of_week'].unique()

In [None]:
# One distribution plot of Diff_likes per day of week (all rows).
for week_day in week_days:
    plt.figure(figsize=(10, 5))
    on_day = photo_day_diff_likes.Day_of_week == week_day
    hist_ax = sns.distplot(photo_day_diff_likes.loc[on_day]['Diff_likes'], color="black")
    hist_ax.set_title(week_day)

In [None]:
# One distribution plot of Diff_likes per day of week (non-zero rows only).
for week_day in week_days:
    plt.figure(figsize=(10, 5))
    on_day = photo_day_diff_likes_no0.Day_of_week == week_day
    hist_ax = sns.distplot(photo_day_diff_likes_no0.loc[on_day]['Diff_likes'], color="black")
    hist_ax.set_title(week_day)

In [None]:
# One distribution plot of Diff_likes per day of week ("no_out" frame).
for week_day in week_days:
    plt.figure(figsize=(10, 5))
    on_day = photo_day_diff_likes_no_out.Day_of_week == week_day
    hist_ax = sns.distplot(photo_day_diff_likes_no_out.loc[on_day]['Diff_likes'], color="black")
    hist_ax.set_title(week_day)

#### Statistics

In [None]:
# Descriptive statistics of Diff_likes per day of week (groups kept in order of appearance).
(
    photo_day_diff_likes[['Diff_likes', 'Day_of_week']]
    .groupby('Day_of_week', sort=False)
    .describe()
)

In [None]:
# Descriptive statistics of non-zero Diff_likes per day of week.
(
    photo_day_diff_likes_no0[['Diff_likes', 'Day_of_week']]
    .groupby('Day_of_week', sort=False)
    .describe()
)

In [None]:
# Descriptive statistics of Diff_likes per day of week for the "no_out" frame.
(
    photo_day_diff_likes_no_out[['Diff_likes', 'Day_of_week']]
    .groupby('Day_of_week', sort=False)
    .describe()
)

#### Tests

In [None]:
# Kruskal-Wallis H-test of Diff_likes across the days of the week.
kruskal(*[
    day_rows["Diff_likes"].values
    for _, day_rows in photo_day_diff_likes[['Diff_likes', 'Day_of_week']].groupby(['Day_of_week'])
])

In [None]:
# Dunn's post-hoc test with Holm-Sidak adjustment between the days of the week;
# any -1 entries in the result are replaced by 1.
photo_day_diff_likes_ph = (
    sp.posthoc_dunn(
        photo_day_diff_likes[['Diff_likes', 'Day_of_week']],
        val_col='Diff_likes',
        group_col='Day_of_week',
        p_adjust='holm-sidak',
    )
    .replace(-1, 1)
)
reindex_ph(photo_day_diff_likes_ph, my_key=days, sort=False).style.applymap(two_colors)

In [None]:
# Kruskal-Wallis H-test of non-zero Diff_likes across the days of the week.
kruskal(*[
    day_rows["Diff_likes"].values
    for _, day_rows in photo_day_diff_likes_no0[['Diff_likes', 'Day_of_week']].groupby(['Day_of_week'])
])

In [None]:
# Dunn's post-hoc test with Holm-Sidak adjustment on the non-zero frame;
# any -1 entries in the result are replaced by 1.
photo_day_diff_likes_no0_ph = (
    sp.posthoc_dunn(
        photo_day_diff_likes_no0[['Diff_likes', 'Day_of_week']],
        val_col='Diff_likes',
        group_col='Day_of_week',
        p_adjust='holm-sidak',
    )
    .replace(-1, 1)
)
reindex_ph(photo_day_diff_likes_no0_ph, my_key=days, sort=False).style.applymap(two_colors)

### Each day as single observation

In [None]:
# Day-of-week pipeline for the "each day as single observation" view:
# select_by_time ('1D'), get_day_of_week, then group_by_days keyed on 'Day_of_week'.
day_group_steps = [
    ('Time_select', select_by_time(time_freq='1D')),
    ('Day_of_week', get_day_of_week(prev_day=True)),
    ('Group_by_days_ID', group_by_days(time_int_col='Day_of_week')),
]
time_days_likes_diff_ID_group = Pipeline(day_group_steps)
photo_day_ID_diff_likes = time_days_likes_diff_ID_group.transform(instagram)

In [None]:
# Keep only the rows whose Diff_likes is non-zero.
nonzero_day_group_diff = photo_day_ID_diff_likes.Diff_likes != 0
photo_day_ID_diff_likes_no0 = photo_day_ID_diff_likes.loc[nonzero_day_group_diff]

In [None]:
# Diff_likes per day of week: box plot (outliers hidden) with the observations as black points.
fig, ax = plt.subplots(figsize=(15, 15))
sns.boxplot(x="Day_of_week", y="Diff_likes", data=photo_day_ID_diff_likes, showfliers=False, ax=ax)
sns.stripplot(x="Day_of_week", y="Diff_likes", data=photo_day_ID_diff_likes, jitter=True,
              color='black', ax=ax);

In [None]:
# Same plot restricted to rows with non-zero Diff_likes.
fig, ax = plt.subplots(figsize=(15, 15))
sns.boxplot(x="Day_of_week", y="Diff_likes", data=photo_day_ID_diff_likes_no0, showfliers=False, ax=ax)
sns.stripplot(x="Day_of_week", y="Diff_likes", data=photo_day_ID_diff_likes_no0, jitter=True,
              color='black', ax=ax);

In [None]:
# Day_of_week values for the histogram loops of this section.
# NOTE(review): taken from the per-photo frame photo_day_diff_likes rather than from
# photo_day_ID_diff_likes - confirm this is intended.
week_days = photo_day_diff_likes['Day_of_week'].unique()

In [None]:
# One distribution plot of Diff_likes per day of week for the grouped frame.
for week_day in week_days:
    plt.figure(figsize=(10, 5))
    on_day = photo_day_ID_diff_likes.Day_of_week == week_day
    hist_ax = sns.distplot(photo_day_ID_diff_likes.loc[on_day]['Diff_likes'], color="black")
    hist_ax.set_title(week_day)

In [None]:
# One distribution plot of Diff_likes per day of week for the grouped frame without zeros.
for week_day in week_days:
    plt.figure(figsize=(10, 5))
    on_day = photo_day_ID_diff_likes_no0.Day_of_week == week_day
    hist_ax = sns.distplot(photo_day_ID_diff_likes_no0.loc[on_day]['Diff_likes'], color="black")
    hist_ax.set_title(week_day)

#### Statistics

In [None]:
# Descriptive statistics per day of week for the grouped frame.
(
    photo_day_ID_diff_likes
    .groupby('Day_of_week', sort=False)
    .describe()
)

In [None]:
# Descriptive statistics per day of week for the grouped frame without zeros.
(
    photo_day_ID_diff_likes_no0
    .groupby('Day_of_week', sort=False)
    .describe()
)

#### Tests

In [None]:
# Kruskal-Wallis H-test of Diff_likes across the days of the week (grouped frame).
kruskal(*[
    day_rows["Diff_likes"].values
    for _, day_rows in photo_day_ID_diff_likes[['Diff_likes', 'Day_of_week']].groupby(['Day_of_week'])
])

In [None]:
# Dunn's post-hoc test with Holm-Sidak adjustment on the grouped frame;
# any -1 entries in the result are replaced by 1.
photo_day_ID_diff_likes_ph = (
    sp.posthoc_dunn(
        photo_day_ID_diff_likes[['Diff_likes', 'Day_of_week']],
        val_col='Diff_likes',
        group_col='Day_of_week',
        p_adjust='holm-sidak',
    )
    .replace(-1, 1)
)
reindex_ph(photo_day_ID_diff_likes_ph, my_key=days, sort=False).style.applymap(two_colors)

In [None]:
# Kruskal-Wallis H-test of non-zero Diff_likes across the days of the week (grouped frame).
kruskal(*[
    day_rows["Diff_likes"].values
    for _, day_rows in photo_day_ID_diff_likes_no0[['Diff_likes', 'Day_of_week']].groupby(['Day_of_week'])
])

In [None]:
# Dunn's post-hoc test with Holm-Sidak adjustment on the grouped frame without zeros;
# any -1 entries in the result are replaced by 1.
photo_day_ID_diff_likes_no0_ph = (
    sp.posthoc_dunn(
        photo_day_ID_diff_likes_no0[['Diff_likes', 'Day_of_week']],
        val_col='Diff_likes',
        group_col='Day_of_week',
        p_adjust='holm-sidak',
    )
    .replace(-1, 1)
)
reindex_ph(photo_day_ID_diff_likes_no0_ph, my_key=days, sort=False).style.applymap(two_colors)