# From youtube to tree

In [1]:
# NLP words source:
# title, tags("taga"|"tagb"), description

# hierarchical sources:
# channel_title, category, tags

# boolean statistical columns
# existence of tags/words, comments_disabled, ratings_disabled, video_error_or_removed

# also possible - ancestral reconstruction for continuous variables. These bring into play:
# views, likes, dislikes, comment_count

# implemented in R here: http://www.phytools.org/eqg/Exercise_5.2/
# BayesTraits has BayesContinuous also
# This looks very promising:
# https://github.com/michaelgruenstaeudl/WARACS

In [2]:
# possible questions:

# which tags are in the same semantic field?
# which tag is most correlated with views/comments/likes/(like/dislike ratio) etc.?
# which tags are associated with the like/dislike ratio (or a binarized version of the ratio)?

## Workplan
* clean tag and description text, and run it through the tokenizer pipeline
* create sparse "token over vid" matrix - (include - categories columns !!!)
* hierarchical clustering based on sparse one-hot matrix (cat, tag, channel)
* ancestral reconstruction of traits: using MaxParsimony for sparse tokens
* Bonus: ancestral reconstruction of continuous traits - using BayesContinuous on the 5 continuous columns

In [40]:
import numpy as np
import pandas as pd
from scipy.optimize import curve_fit
import seaborn as sns

import matplotlib.pyplot as plt
import matplotlib.colors as colors
%matplotlib inline

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import plotly.figure_factory as ff

import tqdm
import pickle
import os

# Reconstruction libraries
import scipy.sparse as sp
import ete3
from Analytical_solution_test_parallel import SetHomoplasy

# nlp libraries
from process_wikipedia import remove_special_chars, remove_html_tags, clean_string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from string import punctuation

In [5]:
# Load the US videos table from its parquet file
us_videos_first = pd.read_parquet('youtube_dataset/USfirst.parq')

In [6]:
# Videos with zero likes or zero dislikes are considered odd: keep only rows
# where both counters are strictly positive.
has_likes = us_videos_first["likes"] > 0
has_dislikes = us_videos_first["dislikes"] > 0
us_videos_first = us_videos_first[has_likes & has_dislikes]

In [7]:
# Name the current index so that reset_index() keeps the old row labels in an
# 'original_index' column while the rows are renumbered from 0.
us_videos_first.index.name = 'original_index'
us_videos_first = us_videos_first.reset_index()

## Cleaning tags

In [8]:
def text_cleanup(text):
    """Run ``text`` through the three project cleaning helpers and return the result.

    The steps are applied in this order:
    1. ``remove_html_tags`` on the raw text;
    2. ``remove_special_chars`` with newline, the en dash and every character
       of ``string.punctuation``;
    3. ``clean_string`` with the NLTK English stop-word set.
    """
    chars_to_strip = ['\n', '–'] + list(punctuation)
    without_html = remove_html_tags(text)
    without_specials = remove_special_chars(without_html, chars_to_strip)
    english_stopwords = set(stopwords.words('english'))
    return clean_string(without_specials, english_stopwords)

stemm = PorterStemmer()
lemma = WordNetLemmatizer()
def tokanizer(text, ret_trans=True, unique=False):
    """Tokenize ``text`` and map every word to its lemmatized-then-stemmed token.

    Parameters
    ----------
    text : str
        Text to split with ``nltk.word_tokenize``.
    ret_trans : bool, default True
        When true, also return the word -> token dictionary built for this text.
    unique : bool, default False
        When true, return the distinct tokens in sorted order; otherwise return
        one token per input word, in the original word order.

    Returns
    -------
    list of str
        The tokens (only this when ``ret_trans`` is false).
    (list of str, dict)
        The tokens and the word -> token mapping when ``ret_trans`` is true.
        The mapping is keyed by the distinct words in sorted order.
    """
    raw_words = nltk.word_tokenize(text)

    # Plain Python strings are used on purpose. A NumPy string array has a fixed
    # item width (the longest word of the text), so writing a longer token back
    # into it silently truncates the token and makes the returned tokens
    # disagree with the returned mapping.
    word_trans_ = {}
    for word in sorted(set(raw_words)):
        word_trans_[word] = stemm.stem(lemma.lemmatize(word))

    if unique:
        tokens = sorted(set(word_trans_.values()))
    else:
        tokens = [word_trans_[word] for word in raw_words]

    if ret_trans:
        return (tokens, word_trans_)
    else:
        return tokens

In [9]:
from gensim.corpora import Dictionary
from gensim import matutils
def SetWordDataDictTable(table, no_below=3, full_rec=False):
    """Tokenize the ``tags`` column of ``table`` and build a sparse video-by-token matrix.

    Parameters
    ----------
    table : pandas.DataFrame
        Must contain the columns ``tags`` ('|'-separated tag string) and
        ``channel_title``. It is modified in place: a ``tokens`` column
        (space-joined tokens per row) is added and the attributes
        ``token_corp``, ``word_to_token`` and ``token_to_word`` are attached.
    no_below : int, default 3
        Forwarded to ``Dictionary.filter_extremes``.
    full_rec : bool, default False
        Currently unused; kept for interface compatibility.

    Returns
    -------
    (pandas.DataFrame, scipy.sparse.csr_matrix)
        The same ``table`` object and a ``len(table) x len(token_corp)`` matrix
        holding 1 where a row uses a retained token.
    """
    token_corp = Dictionary()
    word_to_token = dict()
    token_channels = dict()  # token -> set of channel titles that use it
    token_lists = []         # per-row token lists, in table row order

    # collect tokens and translations
    for _, row in tqdm.tqdm_notebook(table.iterrows(), total=len(table), desc='Collecting vid Tokens'):
        # cleanup
        text = text_cleanup(row['tags'].replace('|', ' '))
        tokens, trans = tokanizer(text, unique=True, ret_trans=True)
        # Drop empty tokens up front so that a row without usable tags
        # contributes an empty document instead of a spurious '' token.
        tokens = [token for token in tokens if token != '']
        # diverse track: a set per token avoids rebuilding a list on every hit
        for token in tokens:
            token_channels.setdefault(token, set()).add(row['channel_title'])
        token_lists.append(tokens)
        word_to_token.update(trans)

    # placement: one column assignment instead of one .loc write per row
    table['tokens'] = [' '.join(tokens) for tokens in token_lists]
    token_corp.add_documents(token_lists)

    # filter token space to relevant tokens
    token_corp.filter_extremes(no_below=no_below)

    # filter token space to diverse tokens only:
    # at least 2 channels using a token... - we have "channel title" for the rest.
    tag_diverse_series = pd.Series(
        {token: len(channels) for token, channels in token_channels.items()},
        dtype='int64',
    )
    un_usable_tags = tag_diverse_series[tag_diverse_series < 2].index.tolist()
    # doc2idx reports -1 for tokens already removed by filter_extremes; skip those
    bad_ids = [idx for idx in token_corp.doc2idx(un_usable_tags) if idx > -1]
    token_corp.filter_tokens(bad_ids)

    # token_to_word inv map
    dat_size = len(token_corp)
    token_to_word = dict()
    for word, token in word_to_token.items():
        token_to_word.setdefault(token, []).append(word)
    # NOTE: these are plain Python attributes on the DataFrame (pandas may warn
    # about attribute assignment); callers read them back as table.<name>.
    table.token_corp = token_corp
    table.word_to_token = word_to_token
    table.token_to_word = token_to_word

    # word2vec mapping: gather all coordinates, then build the CSR matrix once
    row_idx = []
    col_idx = []
    for r, tokens in enumerate(tqdm.tqdm_notebook(token_lists, total=len(token_lists), desc='Token2Vec')):
        ids = [idx for idx in token_corp.doc2idx(tokens) if idx > -1]
        row_idx.extend([r] * len(ids))
        col_idx.extend(ids)
    sparse_tokens = sp.csr_matrix(
        (
            np.ones(len(col_idx), dtype=np.int64),
            (np.array(row_idx, dtype=np.int64), np.array(col_idx, dtype=np.int64)),
        ),
        shape=(len(token_lists), dat_size),
    )
    return table, sparse_tokens


## sparse rep

In [10]:
us_videos_first, sparse_tokens = SetWordDataDictTable(us_videos_first)

HBox(children=(IntProgress(value=0, description='Collecting vid Tokens', max=6244, style=ProgressStyle(descrip…





Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access


Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access


Pandas doesn't allow columns to be created via a new attribute name - see https://pandas.pydata.org/pandas-docs/stable/indexing.html#attribute-access



HBox(children=(IntProgress(value=0, description='Token2Vec', max=6244, style=ProgressStyle(description_width='…




In [11]:
# One-hot encode the channel title, then keep only the channel columns that
# are set for more than one video.
channel_dummies = pd.get_dummies(us_videos_first['channel_title'], sparse=True)
sparse_channels = sp.csr_matrix(channel_dummies.values)
videos_per_channel = sparse_channels.sum(axis=0)
repeated_channel_cols = np.argwhere(videos_per_channel > 1)[:, 1]
sparse_channels_rep = sparse_channels[:, repeated_channel_cols]

In [12]:
# One-hot encode category_id and store the indicator columns as a CSR matrix
sparse_cat = sp.csr_matrix(pd.get_dummies(us_videos_first['category_id']).values)

In [128]:
# Feature matrices used to build the trees (columns stacked side by side):
quality_map = sp.hstack([sparse_cat, sparse_channels_rep, sparse_tokens]) # category + repeated channels + tag tokens
quality_map_no_tag = sp.hstack([sparse_cat, sparse_channels_rep]) # same features, deliberately without the tag tokens

## Tree reconstruction

In [129]:
from sklearn.metrics import pairwise
# Pairwise cosine distances between the rows of each feature matrix
# (one with the tag tokens, one without)
dist_mat = pairwise.cosine_distances(quality_map)
dist_mat_no_tags = pairwise.cosine_distances(quality_map_no_tag)

In [130]:
from TreeOperations import TreeFromDist
# Build one tree per distance matrix. colnames receives the row labels of
# us_videos_first (presumably used as tip names - confirm in TreeOperations).
recon_tree = TreeFromDist(dist_mat, colnames=us_videos_first.index.tolist())
recon_tree_no_tags = TreeFromDist(dist_mat_no_tags, colnames=us_videos_first.index.tolist())

## State reconstruction

In [134]:
def MaxParsimony(X, Tree, tip_to_row, random_state=None):
    """Reconstruct binary ancestral states on ``Tree`` with a two-pass parsimony sweep.

    States are 0 and 1; the value 2 is used internally for the ambiguous set {0, 1}.

    First pass (postorder): every leaf gets its row of ``X``; every internal
    node combines its two children position by position. If one child is
    ambiguous the other child's value is taken; if both are determined and
    differ the position becomes ambiguous; otherwise the common value is kept.

    Second pass (preorder): ambiguous positions at the root are resolved by a
    random 0/1 draw; every other internal node takes its parent's resolved
    value wherever it is still ambiguous.

    Parameters
    ----------
    X : scipy sparse matrix supporting row indexing
        One row of 0/1 traits per tip.
    Tree : tree object
        Must provide ``traverse``, and its nodes ``is_leaf()``, ``children``,
        ``up`` and ``name``. Internal nodes must have exactly two children.
    tip_to_row : callable
        Maps a leaf ``name`` to its row index in ``X``.
    random_state : int or None, default None
        Seed for the root tie-breaking draw. ``None`` keeps the previous
        behaviour of drawing from NumPy's global random state.

    Returns
    -------
    The same ``Tree``, with these attributes set:
    ``Tree.size`` (node count), ``node.dat`` (1 x n sparse state row) and
    ``node.random`` (1 x n sparse mask; on internal nodes it marks positions
    that were ambiguous on the node and on every ancestor up to the root,
    on leaves it is all zeros).

    Raises
    ------
    ValueError
        If an internal node does not have exactly two children.
    """
    UNION = 2  # marker for the ambiguous state set {0, 1}

    def to_dense(sp_arr):
        # 1 x n sparse row -> flat int8 array
        return np.array(sp_arr.todense().astype(np.int8))[0]

    def state_of(node):
        # leaves read their row of X, internal nodes their stored state
        return to_dense(X[tip_to_row(node.name)]) if node.is_leaf() else to_dense(node.dat)

    rng = np.random if random_state is None else np.random.RandomState(random_state)

    Tree.size = sum(1 for _ in Tree.traverse())

    for node in tqdm.tqdm(Tree.traverse('postorder'), total=Tree.size, desc='Ancestral Reconstruction: 1st pass'):
        if node.is_leaf():
            node.dat = X[tip_to_row(node.name)]
            continue
        if len(node.children) != 2:
            # The pairwise rule below is only defined for two children; fail
            # loudly instead of silently mis-combining a polytomy.
            raise ValueError(
                'MaxParsimony requires a strictly binary tree; node %r has %d children'
                % (node.name, len(node.children))
            )
        left, right = [state_of(child) for child in node.children]
        left_open = left == UNION
        right_open = right == UNION
        res = left.copy()
        res[left_open] = right[left_open]    # ambiguous child defers to its sibling
        res[right_open] = left[right_open]
        res[~left_open & ~right_open & (left != right)] = UNION  # conflict -> {0,1}
        node.dat = sp.csr_matrix(res)

    nodes = Tree.traverse('preorder')
    root = next(nodes)
    root_state = state_of(root)
    root_open = root_state == UNION
    root.random = sp.csr_matrix(root_open)
    # nothing above the root to inherit from: break ties at random
    root_state[root_open] = rng.choice([1, 0], size=int(root_open.sum()))
    root.dat = sp.csr_matrix(root_state)

    for node in tqdm.tqdm(nodes, total=Tree.size - 1, desc='Ancestral Reconstruction: 2nd pass'):
        if node.is_leaf():
            node.random = sp.csr_matrix((node.up.random) * 0)
            continue
        parent_state = state_of(node.up)  # already resolved: preorder visits parents first
        node_state = state_of(node)
        node_open = node_state == UNION
        # these are unstable positions - will not be counted
        node.random = sp.csr_matrix((node.up.random).multiply(sp.csr_matrix(node_open)))
        node_state[node_open] = parent_state[node_open]
        node.dat = sp.csr_matrix(node_state)

    return Tree

In [132]:
from TreeOperations import *
pickler = Tree_pickler()
# Both lists are passed to pickler.saveTree further down; presumably the
# attribute names to persist and which of the per-node attributes are scipy
# sparse matrices - confirm against TreeOperations.Tree_pickler.
saved_attr = ['token_corp', 'word_to_token', 'token_to_word', 'homoplasy', 'homoplasy_hist', 'size']
sp_list = ['dat', 'random']

### w/tags

In [124]:
# Reconstruct ancestral token states on the tree built with tag features,
# then run the homoplasy step on the result.
recon_tree = SetHomoplasy(MaxParsimony(sparse_tokens, recon_tree, int))
# inherit the vocabulary objects stored on the videos table
for attr_name in ('token_corp', 'word_to_token', 'token_to_word'):
    setattr(recon_tree, attr_name, getattr(us_videos_first, attr_name))

Ancestral Reconstruction: 1st pass: 100%|██████████| 12487/12487 [00:03<00:00, 3428.76it/s]
Ancestral Reconstruction: 2nd pass: 100%|██████████| 12486/12486 [00:05<00:00, 2204.67it/s]
100%|█████████▉| 12486/12488 [00:04<00:00, 2623.87it/s]


In [None]:
# Save the tree built with tag features under youtube_dataset/Saved_tree
pickler.saveTree(recon_tree, 'youtube_dataset/Saved_tree', saved_attr, sp_list)

### w/o tags

In [135]:
# Reconstruct ancestral token states on the tree built without tag features,
# then run the homoplasy step on the result.
recon_tree_no_tags = SetHomoplasy(MaxParsimony(sparse_tokens, recon_tree_no_tags, int))
# inherit the vocabulary objects stored on the videos table
for attr_name in ('token_corp', 'word_to_token', 'token_to_word'):
    setattr(recon_tree_no_tags, attr_name, getattr(us_videos_first, attr_name))

Ancestral Reconstruction: 1st pass: 100%|██████████| 12487/12487 [00:03<00:00, 3393.93it/s]
Ancestral Reconstruction: 2nd pass: 100%|██████████| 12486/12486 [00:05<00:00, 2172.89it/s]
100%|█████████▉| 12486/12488 [00:04<00:00, 2668.32it/s]


In [136]:
# Save the tree built without tag features under youtube_dataset/Saved_tree_no_tags
pickler.saveTree(recon_tree_no_tags, 'youtube_dataset/Saved_tree_no_tags', saved_attr, sp_list)