# Imports

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import datetime
import math

# Load data

In [None]:
# CSV log to analyse; later cells read its `name`, `message` and
# `datetime` columns.
DATA_FILE = 'log.csv'
df = pd.read_csv(filepath_or_buffer=DATA_FILE)

# Preview data

In [None]:
# Show the first ten rows as a quick look at the loaded frame.
print(df.iloc[:10])

# Stats

## List participants ordered by decreasing message counts

In [None]:
# Per-participant non-null counts of every column, ordered by the
# `message` count (largest first); `datetime` is left out of the table.
name_totals = (
    df.groupby(['name'])
    .count()
    .sort_values(by=['message'], ascending=False)
    .drop(columns=["datetime"])
)
print(name_totals)

## Top n longest messages

In [None]:
n = 10
# Character length of every message, stored on the frame itself
# (the media-count cell later drops this column by name).
df['msg_len'] = df['message'].str.len()
longest_messages = df.sort_values(by=['msg_len'], ascending=False)
print(longest_messages.iloc[:n])

## Most media sent by someone

In [None]:
n = 10
# Flag rows whose text is exactly the '<Media omitted>' placeholder.
df['is_media'] = df['message'] == '<Media omitted>'
# Count the flagged rows per participant, most media first.
media_counts = df[df['is_media']].groupby(['name']).count()
media_counts = media_counts.sort_values(['message'], ascending=False)
# Drop the helper columns from the displayed table. 'msg_len' only exists
# if the longest-messages cell has been run, so missing labels are ignored
# instead of raising KeyError when cells are executed out of order.
media_counts = media_counts.drop(
    columns=["datetime", "message", "msg_len"], errors='ignore'
)
print(media_counts.head(n))

## Word frequencies
Ref: https://stackoverflow.com/questions/46786211/counting-the-frequency-of-words-in-a-pandas-data-frame

TODO: optimize this

In [None]:
n = 100
# Split every message on whitespace and flatten to one word per row.
# explode() avoids the rows x longest-message table that
# split(expand=True) materialises before stacking; missing or empty
# messages become NaN, which value_counts() skips by default.
words = df['message'].str.split().explode()
# explode() keeps the 'message' name; clear it so the printed result is
# labelled the same way as the unnamed stacked Series was.
words.name = None
word_freqs = words.value_counts()
print(word_freqs.head(n))

## Activity graph
Plot message frequency with respect to time, to analyze when each individual is most active

### Get number of days of activity

In [None]:
# Timestamps of the first and last rows, both taken by position.
# Assumes the log is in chronological order - TODO confirm.
first_message_datetime = df['datetime'].iloc[0]
last_message_datetime = df['datetime'].iloc[-1]
print(last_message_datetime)

# Number of one-day bins, anchored at the first message, needed to cover
# the last message. The bins built further down are left-closed, so a span
# that is an exact multiple of 24 hours (including a zero span) needs one
# more bin than ceil() would give; floor + 1 covers every case.
activity_span = pd.Timestamp(last_message_datetime) - pd.Timestamp(first_message_datetime)
time_delta = activity_span // pd.Timedelta(days=1) + 1
print(time_delta)

### Create timeseries of required period

In [None]:
# Daily bin edges starting at the first message: one more edge than
# there are bins.
dti = pd.date_range(
    start=first_message_datetime,
    periods=time_delta + 1,
    freq='D',
)
dti

### Digitize datetime using timeseries

In [None]:
# Pair consecutive edges into left-closed one-day intervals
# [edge_i, edge_i+1).
bins = pd.IntervalIndex.from_arrays(
    left=dti[:-1],
    right=dti[1:],
    closed='left',
)
bins

In [None]:
# Parse the message timestamps and assign each one to its daily interval.
timestamps = pd.to_datetime(df['datetime'].values)
cat_obj = pd.cut(timestamps, bins)
# Count messages per interval in bin order (sort=False). The top-level
# pd.value_counts() is deprecated, so the Series method is used instead.
vals = pd.Series(cat_obj).value_counts(sort=False)
vals.keys()

### Create graph
- [ ] Check whether the `bins` argument of `value_counts()` can be used instead, after converting `df['datetime']`
- [x] Fix messy x-axis display

In [None]:
# Title and axis labels first; the plot call stays last so the cell's
# displayed value is unchanged.
plt.suptitle('Group Activity Graph')
plt.ylabel('Message frequency')
plt.xlabel('Date')
# One point per daily bin, placed at the bin's left edge.
plt.plot(dti[:-1], vals.values)