# Exploratory data analysis (EDA)

In [None]:
from src.data import *

import matplotlib.pyplot as plt

# Load the processed articles, then derive the feature columns used below.
# `binner` is kept because its bin edges are inspected in a later cell.
dataset = process_raw_data(drop_dupes=False, year_start=2017)
dataset, binner = create_features(dataset, n_bins=5, clip_max=50000)
# Display the rows ordered from most to least clapped.
dataset.sort_values(by='claps', ascending=False)

## Clap statistics

In [None]:
# Summary statistics of the raw clap counts.
dataset.loc[:, 'claps'].describe()

## Distribution of claps

In [None]:
# Histogram of the raw clap counts (100 bins).
dataset.plot.hist(y='claps', bins=100)

## Distribution of clipped claps

In [None]:
# Histogram of the 'clip-claps' column (100 bins).
dataset.plot.hist(y='clip-claps', bins=100)

## Distribution of log claps

In [None]:
# Histogram of the 'log-claps' column (100 bins).
dataset.plot.hist(y='log-claps', bins=100)

In [None]:
# Side-by-side histograms of the two headline-length columns.
f, axes = plt.subplots(ncols=2, figsize=(15, 5))
for ax, column in zip(axes, ('n-characters', 'n-words')):
    dataset.plot(ax=ax, y=column, kind='hist', bins=100)

## Binning the claps

In [None]:
# Number of non-null headlines falling into each binned class.
target = dataset.groupby('binned-class')['headline'].count()
target

In [None]:
# Bar chart of the per-class counts.
target.plot.bar()

In [None]:
# Print the upper edge of every bin; the first edge is skipped because it is
# the lower bound of bin 0 rather than the end of any bin.
edges = binner.bin_edges_
upper_edges = edges[0].flatten().tolist()[1:]
for label, edge in enumerate(upper_edges):
    print(f'label {label} ends at {int(edge)}')

## How do claps change over time?

In [None]:
# Median and mean claps per year; aggregating the selected column with a list
# of functions yields flat 'median' / 'mean' column names directly.
years = dataset.groupby('year')['claps'].agg(['median', 'mean'])
years.plot.bar(y='median')

In [None]:
# Median and mean claps per month; only the median is plotted.
months = dataset.groupby('month')['claps'].agg(['median', 'mean'])
months.plot.bar(y='median')

## By site

In [None]:
# Median, mean and row count of claps per site, shown ordered by median.
sites = dataset.groupby('site_id')['claps'].agg(['median', 'mean', 'count'])
sites.sort_values(by='median')

## By site and year

In [None]:
from math import ceil

# One panel per site showing the median claps per year.
# Site ids are taken in order of first appearance: iterating a `set` gave an
# arbitrary panel order that could differ from one kernel session to the next.
# NOTE: this rebinds `sites`, which an earlier cell uses for the per-site
# summary table.
sites = dataset['site_id'].unique().tolist()
f, axes = plt.subplots(ncols=2, nrows=ceil(len(sites) / 2), sharex=True, sharey=True, figsize=(15, 6))

for site, ax in zip(sites, axes.flatten()):
    subset = dataset.loc[dataset['site_id'] == site, :]
    grp = subset.groupby('year').agg({'claps': 'median'})
    # Suppress the legend up front instead of creating and then removing it.
    grp.plot(ax=ax, kind='bar', legend=False)
    # The site id serves as an in-panel label.
    ax.set_title(site, loc='right', y=0.7)

## Effect of headline length (characters and words)

In [None]:
# Claps against headline length in characters.
dataset.plot.scatter(x='n-characters', y='claps')

In [None]:
# Claps against headline length in words.
dataset.plot.scatter(x='n-words', y='claps')

## Headline features

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer

# Token-count representation of the headlines: one row per headline, one
# column per vocabulary term.
vec = CountVectorizer()
# Keep the sparse matrix returned by `fit_transform`. Converting it with
# `.todense()` allocates rows x vocabulary cells, and the cells below only
# need `.shape` and a column sum, both of which work on the sparse form.
x = vec.fit_transform(dataset['headline'])
print(x.shape)

In [None]:
# Imported explicitly: the notebook never imports numpy itself, so `np` was
# presumably only in scope via the wildcard import from `src.data`.
import numpy as np

# Vocabulary terms, aligned with the columns of `x`.
# `get_feature_names` was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older releases.
if hasattr(vec, 'get_feature_names_out'):
    names = np.asarray(vec.get_feature_names_out())
else:
    names = np.array(vec.get_feature_names())
# Total occurrences of each term across all headlines. `ravel` always gives a
# 1-D array, whereas `squeeze` would collapse a one-term vocabulary to 0-d.
counts = np.asarray(x.sum(axis=0)).ravel()

In [None]:
# Peek at the totals for the first ten vocabulary terms.
counts[0:10]

In [None]:
# Indices that order `counts` from least to most frequent. `argsort` already
# returns a plain index array, so the former extra copy and `squeeze` were
# redundant; `squeeze` would also have collapsed a one-element index to 0-d
# and broken the slicing in the following cells.
sort_idx = counts.argsort()

In [None]:
# Totals of the 32 most frequent terms, in ascending order.
counts[sort_idx[-32:]]

In [None]:
# The 32 most frequent terms themselves, in the same ascending order.
names[sort_idx[-32:]]