# Manual labeling of randomly sampled hosts
Choose 100 hosts at random, and for each host, check whether its articles contain only relevant videos, meaning no "recommended videos" or similar content in the sidebar.

In [195]:
import pandas as pd
import numpy as np
import os
import pickle
import sqlite3
from urllib.parse import urlparse
%load_ext autoreload
%autoreload 2

import matplotlib.pyplot as plt

from src import util

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [49]:
# Read from sqlite, extract and group by host, filter, aggregate, save to file.
# This should be greatly simplified once we work with the whole dataset.
import time  # used for the timestamp suffix of the sample file written below

conn = sqlite3.connect(os.environ["DATA_PATH"] + '/interim/GDELT_20180731.db')
# None disables truncation of long cell contents (e.g. URLs); the former -1
# sentinel is deprecated since pandas 1.0 and rejected by pandas 2.x.
pd.set_option('display.max_colwidth', None)
# One row per article URL together with the number of videos found on it.
query = "SELECT website_url, count(website_url) AS video_count FROM found_videos GROUP BY website_url;"
df = pd.read_sql_query(query, conn)
df['hostname'] = [urlparse(url).hostname for url in df['website_url']]
# Aggregate only the numeric video_count column. Aggregating the whole frame
# would also try to take mean/std of the website_url strings and would yield
# MultiIndex columns, so the "count" lookup below would not find its column.
# Resulting columns: hostname, count (articles per host), sum, mean, std.
analysis = df.groupby('hostname')['video_count'].agg(['count', 'sum', 'mean', 'std']).reset_index()
# Keep only hosts with more than one article, then draw 100 of them at random.
sample = analysis[analysis["count"] > 1].sample(100) # This leaves only 416/1148 TODO how many in the whole dataset?
# The Unix timestamp in the filename keeps earlier samples from being overwritten.
sample.to_csv(os.environ["DATA_PATH"] + '/interim/sample100_%d.csv' % time.time(), index=False)

In [204]:
# Load a previously saved host sample from the interim data directory.
filename = 'sample100_1535986839.csv'
sample_path = os.environ["DATA_PATH"] + '/interim/%s' % filename
sample = pd.read_csv(sample_path)
# Show the first rows as a quick sanity check of the loaded file.
sample.head()

Unnamed: 0,hostname,count,sum,mean,std
0,patch.com,3,3,1.0,0.0
1,townhall.com,2,2,1.0,0.0
2,thepointsguy.com,2,2,1.0,0.0
3,www.breitbart.com,7,7,1.0,0.0
4,www.cairoscene.com,2,10,5.0,5.656854


In [198]:
# Classify
# Manual labels, one per sampled host (100 in total). True presumably means
# the host's articles showed only relevant videos; entries are assumed to be
# in the row order of `sample` -- TODO confirm against the labeling procedure.
videos_relevant = [
    True, True, True, True, True, True, True, True, True, False,
    True, False, True, True, True, True, False, True, True, True,
    True, True, True, True, True, False, False, False, True, False,
    False, False, False, True, True, True, True, True, False, True,
    True, True, False, False, False, True, False, False, True, True,
    False, True, True, False, False, True, False, False, True, True,
    False, False, True, False, True, True, True, True, True, True,
    False, False, True, False, False, True, False, False, False, True,
    True, True, True, True, False, True, False, False, True, True,
    True, True, False, True, False, False, False, True, False, True,
]

# Row position (0-based) in `sample` of the host to inspect next.
index = 99
# Look the hostname up by column name: a positional [0] returns the wrong
# field whenever the CSV carries a leading index column ("Unnamed: 0").
host = sample.iloc[index]["hostname"]
# List every article URL of that host with its video count. The row label is
# discarded so that it does not overwrite `index` above.
for _, row in df[df["hostname"] == host].iterrows():
    print("%02d: %s" % (row['video_count'], row['website_url']))

02: http://leaderpost.com/news/local-news/they-speak-the-truth-allies-show-support-at-protest-camp-powwow
01: http://leaderpost.com/news/stephen-harper-criticized-for-speaking-at-anti-iran-event-hosted-by-cult-former-terrorist-group/wcm/e343c486-507b-4067-a67b-782bc91526bd
01: http://leaderpost.com/news/world/cruise-ship-worker-rescued-22-hours-after-going-overboard-by-another-cruise-ship/wcm/aad26adf-4469-4323-8e17-8c82def086ed


In [201]:
# Save the labels to file
# The labels are pickled next to the sample CSV as "<sample filename>.labels".
labels_path = os.environ["DATA_PATH"] + '/interim/%s.labels' % filename
# The context manager flushes and closes the handle even if pickling fails;
# the bare open() call used before never closed the file.
with open(labels_path, "wb") as labels_file:
    pickle.dump(videos_relevant, labels_file)

## TODO
- Make this an application
- Safety Validation of manual labels to reveal mistakes made
- Use integers instead of bools so that the reason for a label can be distinguished (e.g. the site allows comments containing videos, whose relevance cannot be guaranteed, or videos appear in the sidebar)
- Use whole dataset (not just those that have count > 1 from the subset of 1148)