# Channels' tags network

In [264]:
import os
import shlex
import json
import csv
import nltk
import itertools
import networkx as nx
import pandas as pd
import collections

from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

### Tag network using .json file

In [235]:
i=0
with open(fileName2) as f:
    for line in f:
        #read each json line as a json dict
        json_dict = json.loads(line)
        all_keys=[] # all tag/keywords for each channel

        if (json_dict['statistics']['viewCount']!='0'):
            if 'keywords' in json_dict['brandingSettings']['channel']:
                try:
                    for kw in shlex.split(json_dict['brandingSettings']['channel']['keywords']):
                        kw_id = kw.lower().replace("'", "").replace("´", "").replace("’", "")
                        single_keys=nltk.word_tokenize(kw_id)
                        
                        for key in single_keys:
                            
                            if key not in all_keys:
                                all_keys.append(key)
                                g.add_node(key, type='keyword')

                except:
                    i+=1
                    for kw in json_dict['brandingSettings']['channel']['keywords'].split():
                        kw_id = kw.lower().replace("'", "").replace("´", "").replace("’", "")
                        single_keys=nltk.word_tokenize(kw_id)
                        for key in single_keys:
                            if key not in all_keys:
                                all_keys.append(key)
                                g.add_node(key, type='keyword')

            for key in all_keys:
                if key in stopwords.words('english'):
                    all_keys.remove(key)
            edges= itertools.combinations(all_keys,2)
            g.add_edges_from(edges)
            
        i+=1
        if (i==100):
            break

### Tag network using .csv file

In [373]:
def create_tag_network(fileName, quantile, g, limit):
    j=0
    lemmatizer = WordNetLemmatizer()
    df=pd.read_csv(fileName)
    df = df[df.views>quantile]
    df=df.sort_values(by=['views'],ascending=False)

    for n in range(0,len(df)):
        all_keys=[] # all tag/keywords for each channel without repetition
        try:
            if(df.keywords[n]!=""):

                try:
                    for kw in shlex.split(df.keywords[n]):
                        kw_id = kw.lower().replace("'", "").replace("´", "").replace("’", "")
                        single_keys=nltk.word_tokenize(kw_id)
                        for key in single_keys:
                            key2=lemmatizer.lemmatize(key)
                            if key2 not in all_keys:
                                all_keys.append(key2)

                except:
                    for kw in df.keywords[n].split():
                        kw_id = kw.lower().replace("'", "").replace("´", "").replace("’", "")
                        single_keys=nltk.word_tokenize(kw_id)
                        for key in single_keys:
                            key2=lemmatizer.lemmatize(key)
                            if key2 not in all_keys:
                                all_keys.append(key2)
                
                #remove english stopwords and add to graph
                for key in all_keys:
                    if key in stopwords.words('english'):
                        all_keys.remove(key)
                    else:
                        if g.has_node(key):
                            g.node[key]['weight']+=1
                        else:
                            g.add_node(key, type='keyword', weight=1 )
                        
                #add all edges (all keywords appearing together will be connected)
                edges= itertools.combinations(all_keys,2)
                
                for edge in edges:
                    #if the edge exist add +1 weight
                    if g.has_edge(edge[0],edge[1]):
                        g[edge[0]][edge[1]]['weight'] += 1
                    else:
                        g.add_edge(edge[0],edge[1], weight=1)            
                
                #filter just first limit number of channels
#                 j+=1
#                 if (j==limit):
#                     break
        except:
            continue       

### Get degree quantile to filter information

In [270]:
def get_quantile(q_edge, gf):
    #find maximum and minimum degree 
    degrees = [val for (node, val) in gf.degree()]
    ctr=collections.Counter(degrees)
    #find third quartile in degree
    df2 = pd.DataFrame.from_dict(ctr,orient='index').reset_index()
    q=df2.quantile(q_edge)
    return float(q.values[1])

## Merge nodes 

In [271]:
def merge_nodes(new_node, node_list, gf):
    H =gf.copy()
    gf.add_node(new_node, type='keyword') # Add the 'merged' node   

    for n1,n2,data in H.edges(data=True):
        # For all edges related to one of the nodes to merge,
        # make an edge going to or coming from the `new gene`.
        if n1 in node_list:
            g.add_edge(new_node,n2)
        elif n2 in node_list:
            g.add_edge(n1,new_node)

    for n in node_list: # remove the merged nodes
        gf.remove_node(n)

### Create network using file name

In [374]:
fileNames=['minecraft', 'overwatch','agario', 'LOL', 'callOD','fivenaf','pkgo','roblox', 'gta','happyW']
data_folder_channels='/media/aruiz/data/channels_clean_data/'
fileName2=os.path.join(data_folder_channels,'all_channels_clean.csv')
g = nx.Graph()
create_tag_network(fileName2, 42261, g, 400)       

In [333]:
H =g.copy()
q=get_quantile(0.75,g)
for node in H.nodes():
    if g.degree(node)< q:
        g.remove_node(node)

In [375]:
print(len(g))
print(q)

23373
20.0


In [291]:
#minecraft --- node merging
merge_nodes('lets play',['let','play'],g)
merge_nodes('gta V',['gta','v'],g)
merge_nodes('call of duty',['call','duty'],g)

In [292]:
nx.write_gexf(g, "testG.gexf")

In [383]:
english_vocab = set(w.lower() for w in nltk.corpus.words.words())
spanish_vocab=set(w.lower() for w in nltk.corpus.cess_esp.words())
esp=[]
eng=[]
other=[]
w=0
c=0
for node, d in g.nodes(data=True):
    try:
        if d['weight']>2:
            if node in english_vocab:
                eng.append(node)
            elif node in spanish_vocab:
                esp.append(node)
            else:
                other.append(node)
    except:
        continue

In [400]:
print(other)

['(', 'com', 'минекрафт', 'dr', '4d', 'ゆっくり実況', 'pb', 'youtubeur', '2b2t', 'soundtrack', '1.10.2', 'mmorpg', 'frags', 'iv', 'brony', 'ep', 'bigfoot', 'przygody', 'modded', '2015', 'thepals', 'dicas', 'deutsche', 'spiele', 'glp', 'arten', 'teso', 'fnaf', 'kunst', 'rp', 'miley', 'montaż', 'hardstyle', 'fps', 'retrogaming', 'deviantart', 'howto', 'gopro', 'blackops3', 'мини', 'rockstar', 'kong', 'teamfortress2', 'игр', 'học', '720', 'aphmau', 'montreal', 'iballisticsquid', 'graciosos', '4k', 'crossfire', 'suisse', 'garrysmod', 'academia', '當個創世神', 'го', 'постройки', 'ftb', 'ssp', 'elrubius', 'skyblock', '2016', 'pvr', 'astuce', 'игры', 'bmx', 'loquendo', 'nhl', 'laptop', 'ogn', 'ireland', 'приколы', 'マインクラフト', 'makeup', 'jogo', 'warhammer', 'servidores', 'ماينكرافت', 'pe', 'マリオ', 'vid', '教學', 'gameplayer', 'deutschland', 'survivalgames', 'hry', 'ssbm', 'teenager', 'ty', 'летсплей', 'tecnico', '1.7', 'fallout', 'xbl', 'oopsclub', 'saicopvp', 'titanfall', 'pikachu', 'activision', 'tuto', 't

In [399]:
g.node['directo']

{'type': 'keyword', 'weight': 5}