# Imports

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import math

# Load data

In [None]:
# Path of the CSV file holding the message log analysed by this notebook.
DATA_FILE = 'data/test.csv'
# Load the whole file into a DataFrame; later cells read its 'name',
# 'datetime', 'message' and 'message_type' columns.
df = pd.read_csv(DATA_FILE)

# Preview data

In [None]:
# Print the first 10 rows as a quick look at the loaded data.
print(df.head(10))

# Stats

## List participants ordered by decreasing message counts

In [None]:
# Per-participant row counts, largest 'message' count first. The 'datetime'
# count column is dropped before printing.
name_totals = (
    df.groupby(['name'])
    .count()
    .sort_values(['message'], ascending=False)
    .drop(columns="datetime")
)
print(name_totals)

## Top-n longest messages

In [None]:
n = 10


def _message_length(column):
    """Return the per-row character count of a string column (sort key)."""
    return column.str.len()


# Order all rows by the length of their 'message' text, longest first.
longest_messages = df.sort_values(by=['message'], key=_message_length, ascending=False)
print(longest_messages.head(n))

## Most media sent by someone

In [None]:
n = 10

# Count the 'media_omitted' rows per participant. The 'datetime' and 'message'
# count columns are dropped, so the remaining 'message_type' column carries
# the per-person total.
media_counts = (
    df[df['message_type'] == 'media_omitted']
    .groupby(['name'])
    .count()
    .drop(columns=['datetime', 'message'])
    # Sort by the count so that head(n) shows the top media senders rather
    # than the first n names in group (alphabetical) order.
    .sort_values(by=['message_type'], ascending=False)
)
print(media_counts.head(n))

## Various statistics

- Most group name changes
- Number of birthdays
- Number of group exits

### Most group name changes

In [None]:
n = 10

# Keep only the rows typed as a group subject change.
subject_changes = df[df['message_type'] == 'subject_name_change']
# Count rows per participant and discard the count columns that are not shown.
changes_per_person = subject_changes.groupby(['name']).count()
changes_per_person = changes_per_person.drop(columns=['datetime', 'message'])
# Most frequent renamers first.
name_change_counts = changes_per_person.sort_values(by=['message_type'], ascending=False)
print(name_change_counts.head(n))

### Number of birthdays

In [None]:
def is_birthday_message(message):
    """Return True if *message* contains a birthday keyword.

    The value is converted with ``str()`` first, so non-string cells (for
    example NaN for a missing message) are handled and simply do not match.
    Matching is a case-insensitive substring search, so "Happy Birthday" and
    "HBD" are recognised as well as their lower-case forms.
    """
    # 'happy birthday' needs no separate entry: it already contains 'birthday'.
    keywords = ('birthday', 'hbd')
    text = str(message).casefold()
    return any(keyword in text for keyword in keywords)

In [None]:
# Select subject-change rows whose text matches is_birthday_message().
# NOTE(review): only 'subject_name_change' rows are considered, not ordinary
# messages -- confirm this is the intended way to detect birthdays.
is_subject_change = df['message_type'] == 'subject_name_change'
mentions_birthday = df['message'].apply(is_birthday_message)
birthday_messages = df[is_subject_change & mentions_birthday]
# Bare expression: the notebook renders the selected rows as the cell output.
birthday_messages

### Number of group exits

In [None]:
# Both 'left' and 'you_left' rows are counted as a group exit.
exit_types = ['left', 'you_left']
group_exits = df[df['message_type'].isin(exit_types)]
num_group_exits = group_exits.shape[0]
print(num_group_exits)

## Word frequencies
Ref: https://stackoverflow.com/questions/46786211/counting-the-frequency-of-words-in-a-pandas-data-frame

In [None]:
n = 100

# Split every 'message'-type row on whitespace and count each token.
# explode() produces one row per word directly, which avoids the dense
# (rows x longest-message-word-count) table that split(expand=True) would
# allocate. Rows without text become NaN and are ignored by value_counts().
text_messages = df.loc[df['message_type'] == 'message', 'message']
word_freqs = text_messages.str.split().explode().value_counts()
print(word_freqs.head(n))

# Word Cloud

Ref: https://www.kaggle.com/code/olgaberezovsky/word-cloud-using-python-pandas/notebook

In [None]:
from wordcloud import WordCloud, STOPWORDS
import re

# Apply matplotlib's 'bmh' style sheet to the figures drawn from here on.
plt.style.use('bmh')

In [None]:
# Join the text of every 'message'-type row into one string for the word cloud.
text_rows = df[df['message_type'] == 'message']['message']
all_messages = " ".join(text_rows.astype(str))
# Replace URLs with a placeholder token so links do not dominate the cloud.
all_messages = re.sub(r'http\S+', '<LINK>', all_messages)
# set.add() works in place and returns None, so assigning its result would
# leave `stopwords` as None. Build a new set with a union instead, which also
# leaves the library-wide STOPWORDS set unmodified.
stopwords = set(STOPWORDS) | {'LINK'}

In [None]:
# Word cloud settings: white background, at most 100 words, 800x600 canvas,
# with the prepared stop words excluded.
cloud_options = dict(
    background_color='white',
    stopwords=stopwords,
    max_words=100,
    width=800,
    height=600,
)
wc = WordCloud(**cloud_options)
wc.generate(all_messages)

# Draw the cloud as an image and hide the axes, which carry no meaning here.
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')

## Activity graph
Plot message frequency with respect to time, to analyze when each individual is most active

### Get number of hours of activity

In [None]:
# Take the first and last rows positionally so the lookup does not depend on
# the index labels.
# NOTE(review): assumes the rows are in chronological order -- confirm.
first_message_datetime = df['datetime'].iloc[0]
last_message_datetime = df['datetime'].iloc[-1]
print(last_message_datetime)

# Elapsed time between the first and last message, expressed in hours.
elapsed = pd.Timestamp(last_message_datetime) - pd.Timestamp(first_message_datetime)
elapsed_hours = elapsed / np.timedelta64(1, 'h')
# Number of one-hour bins needed to cover every message. floor(...) + 1
# (rather than ceil) guarantees the last message lies strictly inside the
# final bin even when the span is a whole number of hours (including the
# single-message case, where the span is 0); the hourly bins built from this
# value are closed on the left only, so a message exactly on the right edge
# would otherwise be dropped.
time_delta = math.floor(elapsed_hours) + 1
print(time_delta)

### Create timeseries of required period

In [None]:
# Hourly timestamps starting at the first message; time_delta + 1 points
# delimit time_delta consecutive one-hour spans.
dti = pd.date_range(first_message_datetime, periods=time_delta+1, freq='h')
# Bare expression: the notebook renders the index as the cell output.
dti

### Digitize datetime using timeseries

In [None]:
# Turn each pair of consecutive hourly timestamps into a half-open interval
# [start, end): the left edge is included, the right edge is not.
bins = pd.IntervalIndex.from_breaks(dti, closed='left')
# Bare expression: the notebook renders the intervals as the cell output.
bins

In [None]:
# Parse the raw 'datetime' column and assign every message to its hourly bin.
timestamps = pd.to_datetime(df['datetime'].values)
cat_obj = pd.cut(timestamps, bins)
# Count messages per bin. The top-level pd.value_counts() is deprecated, so
# wrap the categorical in a Series and use the method instead. sort=False
# keeps the bins in their original (chronological) order, and bins without
# any message are reported with a count of 0, so the result lines up with
# the bin edges when plotted.
vals = pd.Series(cat_obj).value_counts(sort=False)
# Bare expression: the notebook renders the bin labels as the cell output.
vals.keys()

### Create graph
- [ ] Check whether the `bins` argument of `value_counts()` can be used instead, after converting `df['datetime']`

In [None]:
# One point per hourly bin, placed at the bin's left edge.
plt.plot(dti[:-1], vals.values)

# Title and axis labels.
plt.suptitle('Group Activity Graph')
plt.xlabel('Date')
plt.ylabel('Message frequency per hour')

# Rotate the date tick labels so they do not overlap.
plt.xticks(rotation=90)