## Finding 10 similar Youtube channels

In [1]:
from pandas import Series, DataFrame
import pandas as pd
import numpy as np
import os
import matplotlib.pylab as plt
from numpy.random import RandomState
import copy
import csv

from sklearn.neighbors import NearestNeighbors
#from sklearn.neighbors import LSHForest

In [16]:
data_train_act = pd.read_csv(r'C:\Users\user\Anaconda3\activities.csv.gz', 
                        sep = ',',
                       )
data_channels = pd.read_csv(r'C:\Users\user\Anaconda3\channels.csv.gz', 
                        sep = ',',
                       )

### 1st model - subscribed on > 10 channels (similarity by audience)

In [3]:
# number of users and channels
n_users = data_train_act.author_id.unique().shape[0]
n_items = data_train_act.channel_id.unique().shape[0]

In [5]:
# type of action
type_s = 'subscribed'
type_c = 'commented'

In [6]:
def grouped_by_user(data, act_type):    
    return data[data.activity == act_type].groupby('author_id').agg({'created_at': pd.Series.nunique,
                        'channel_id': pd.Series.tolist})

In [29]:
def add_channels_as_features(data_grouped_by_user, max_num_actions):
    data_grouped_by_user = data_grouped_by_user[data_grouped_by_user.created_at > max_num_actions]
    for channel in channels:
        data_grouped_by_user[channel] = [channel in a for a in data_grouped_by_user.channel_id]    
    print(data_grouped_by_user.shape)
    return data_grouped_by_user

In [30]:
def create_channel_by_user_matrix(data_with_channels):
    return data_with_channels[channels].transpose()

In [11]:
def train_nearest_neigbours_model(data_to_train, params):
    nn = NearestNeighbors(**params)
    nn = nn.fit(data_to_train)
    result_neighbors = nn.kneighbors(data_to_train, return_distance = True)
    return result_neighbors

In [110]:
def add_k_closest(data_channels, train_channels, model_results, k, prefix):
    for nghbr_num in range(1, k):
        name = prefix+str(nghbr_num)
        name_id = name + '_id'
        names = list()
        name_ids = list()
        for channel_id in list(data_channels.YT_id):
            i = train_channels.index(channel_id)
            nn_ix = model_results[1][i][nghbr_num]
            names.append(data_channels.ix[nn_ix].YT_Title)
            name_ids.append(data_channels.ix[nn_ix].YT_id)
        data_channels[name] = names
        data_channels[name_id] = name_ids
    return data_channels

In [None]:
### Find neigbors with subscribed dataset:

In [19]:
channels = list(data_channels.YT_id.values)
print(len(channels))

1000


In [26]:
data_act_grouped = grouped_by_user(data_train_act, type_s)
print(data_act_grouped.shape)
print('index:', data_act_grouped.index.names, ' columns:',data_act_grouped.columns.values)

(8864137, 2)
index: ['author_id']  columns: ['created_at' 'channel_id']


In [27]:
sum(data_act_grouped.created_at > 10)

20858

In [32]:
data_act_grouped = add_channels_as_features(data_act_grouped, 10)
#print(data_act_grouped.shape)
data_to_train = create_channel_by_user_matrix(data_act_grouped)
print(data_to_train.shape)

(20858, 1002)
(1000, 20858)


In [33]:
# TRAIN MODEL WITH DEFINED SET OF PARAMETERS:
params = {'n_neighbors': 10, 'metric': 'jaccard', 'algorithm': 'brute'}
results_subscribed_jaccard = train_nearest_neigbours_model(data_to_train, params)

In [42]:
def show_neighbors(user_index, model_results):
    nn_ix = model_results[1][user_index]
    print(data_channels.ix[nn_ix][['YT_Title', 'country', 'YT_category', 'YT_Total_Subscribers']])


In [64]:
show_neighbors(0, results_subscribed_jaccard)
show_neighbors(2, results_subscribed_jaccard)
show_neighbors(273, results_subscribed_jaccard)

             YT_Title country YT_category  YT_Total_Subscribers
0    JustinBieberVEVO      US       Music              20974056
35    TaylorSwiftVEVO      US       Music              18952084
39   OneDirectionVEVO      GB       Music              19931142
14        RihannaVEVO      US       Music              20489720
60   ArianaGrandeVevo      US       Music              10961767
74      KatyPerryVEVO      US       Music              18405805
127   SelenaGomezVEVO      US       Music               9119991
38          AdeleVEVO      GB       Music              11500000
58   FifthHarmonyVEVO      US       Music               4505141
110         DrakeVEVO      US       Music               5319146
                 YT_Title country       YT_category  YT_Total_Subscribers
2                     WWE      US            Sports              10964777
49    Movieclips Trailers      US  Film & Animation               9069807
748          PrankvsPrank      US            Comedy               9892516


In [58]:
# add 9 closest friends (1st is same channel - the closest)
data_channels = add_k_closest(data_channels, list(data_to_train.index.values), results_subscribed_jaccard, 10, 'model1_NN_')

In [92]:
# CHECK JACCARD DISTANCE:

def jaccard_distance(data_to_train, index1, index2):
    intersection = sum(np.array(data_to_train.ix[index1]) & np.array(data_to_train.ix[index2]))
    union = sum(np.array(data_to_train.ix[index1]) | np.array(data_to_train.ix[index2]))
    return 1 - intersection / union

print(results_subscribed_jaccard[0][0][1])
print(results_subscribed_jaccard[1][0][1])
print('check:',jaccard_distance(data_to_train, 0, 35))
print('')
print(results_subscribed_jaccard[0][0][9])
print(results_subscribed_jaccard[1][0][9])
print('check:',jaccard_distance(data_to_train, 0, 110))


0.809921811809
35
check: 0.809921811809

0.877620396601
110
check: 0.877620396601


In [61]:
# save results of the 1st model:
import csv
data_channels.to_csv('A:\PROGRAMMING\Tubular_results\First_model_results.csv', 
                                             sep = ',' ,index= False,encoding = 'utf-8')

### 2nd model (similarity by general information)

In [94]:
## Create numerical categories:
category = list(set(data_channels['country']))
data_channels['country_ix'] = [category.index(i) for i in data_channels['country']]
category = list(set(data_channels['YT_category']))
data_channels['YT_category_ix'] = [category.index(i) for i in data_channels['YT_category']]

In [95]:
## Create set of all possible topics from channels dataset:
topics = set()
for topic_set in data_channels.topics:
    if pd.isnull(topic_set):
        pass
    else:        
        topics |= set(topic_set.upper().split(','))
print('First 10 topics', list(topics)[:10])
print('Last 10 topics', list(topics)[-10:])
print(len(list(topics)))

First 10 topics ['DREAMWORKS ANIMATION', 'SPANISH LANGUAGE', 'ATV', 'CARTOON NETWORK', 'WEST COAST HIP HOP', 'FIRST LADY OF THE UNITED STATES', 'BOSTON RED SOX', 'GOOGLE PLAY', 'REAL-LIFE SUPERHERO', 'GOOGLE']
Last 10 topics ['POLLY POCKET', 'APPLE DAILY', 'MINECRAFT', 'JOSEPH GARRETT', 'THE VERGE', 'BINOMIO DE ORO DE AMÉRICA', 'WA', 'PSY', 'GOIÂNIA', 'THE VOICE UK']
1699


In [99]:
def check_word_in_topics(data, word):
    check = list()    
    for i in range(len(data)):
        topics_i = data.topics[i]
        if pd.isnull(topics_i):
            check.append(False)
        else:
            check.append(word in topics_i.upper().split(','))
    return check

In [100]:
### ADD TOPICS AS FEATURES:
for word in list(topics):
    data_channels[word] = check_word_in_topics(data_channels, word)


In [103]:
print('Number of topics added as features:', data_channels[list(topics)].shape[1])

Number of topics added as features: 1699


In [105]:
features_top = list(topics)
data_channels['scaled_subscriptions'] = data_channels.YT_Total_Subscribers / data_channels.YT_Total_Subscribers.max()
features_top.extend(['scaled_subscriptions', 'country_ix'])

params = {'n_neighbors': 10, 'metric': 'minkowski', 'algorithm': 'brute'}
result_topics = train_nearest_neigbours_model(data_channels[features_top], params)

In [108]:
show_neighbors(0, result_topics)
show_neighbors(2, result_topics)
show_neighbors(1, result_topics)

              YT_Title country YT_category  YT_Total_Subscribers
0     JustinBieberVEVO      US       Music              20974056
285               ELLO      US       Music               2176108
17            emimusic      US       Music               5360039
110          DrakeVEVO      US       Music               5319146
540  BlackEyedPeasVEVO      US       Music               2628326
807           NeYoVEVO      US       Music               1661516
333   ChainsmokersVEVO      US       Music                687240
274   NickiMinajAtVEVO      US       Music              10508077
223         Bruno Mars      US       Music              10327921
137        beyonceVEVO      US       Music               9297164
                                 YT_Title country            YT_category  \
2                                     WWE      US                 Sports   
577  UFC - Ultimate Fighting Championship      US                 Sports   
658                  SuperHero Reality TV      US       F

In [111]:
# add 9 closest friends (1st is same channel - the closest)
data_channels = add_k_closest(data_channels, list(data_channels.YT_id), result_topics, 10, 'model2_NN_')

In [112]:
# save results of the 1st AND 2nd models:
import csv
data_channels.to_csv('A:\PROGRAMMING\Tubular_results\First_AND_Second_model_results.csv', 
                                             sep = ',' ,index= False,encoding = 'utf-8')

In [114]:
data_channels[['model1_NN_1', 'model1_NN_2', 'model2_NN_1', 'model2_NN_2']][:3]

Unnamed: 0,model1_NN_1,model1_NN_2,model2_NN_1,model2_NN_2
0,TaylorSwiftVEVO,OneDirectionVEVO,ELLO,emimusic
1,Family Fun Pack,Baby Big Mouth,Kid Studio,Blu Toys Surprise Brinquedos & Juegos
2,Movieclips Trailers,PrankvsPrank,UFC - Ultimate Fighting Championship,SuperHero Reality TV


### 3d model - commented on > x channels (similarity by audience)

In [126]:
data_act_grouped = grouped_by_user(data_train_act, type_c)
print(data_act_grouped.shape)
print('index:', data_act_grouped.index.names, ' columns:',data_act_grouped.columns.values)

(9423299, 2)
index: ['author_id']  columns: ['created_at' 'channel_id']


In [129]:
sum(data_act_grouped.created_at>30)

21625

In [130]:
data_act_grouped = add_channels_as_features(data_act_grouped, 30)
#print(data_act_grouped.shape)
data_to_train = create_channel_by_user_matrix(data_act_grouped)
print(data_to_train.shape)

# TRAIN MODEL WITH DEFINED SET OF PARAMETERS:
params = {'n_neighbors': 10, 'metric': 'jaccard', 'algorithm': 'brute'}
results_commented_jaccard = train_nearest_neigbours_model(data_to_train, params)

(21625, 1002)
(1000, 21625)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [135]:
show_neighbors(0, results_commented_jaccard)
show_neighbors(2, results_commented_jaccard)
show_neighbors(200, results_commented_jaccard)

              YT_Title country YT_category  YT_Total_Subscribers
0     JustinBieberVEVO      US       Music              20974056
14         RihannaVEVO      US       Music              20489720
35     TaylorSwiftVEVO      US       Music              18952084
58    FifthHarmonyVEVO      US       Music               4505141
39    OneDirectionVEVO      GB       Music              19931142
60    ArianaGrandeVevo      US       Music              10961767
38           AdeleVEVO      GB       Music              11500000
128  MeghanTrainorVEVO      US       Music               5356018
66         Wiz Khalifa      US       Music               7786975
338        officialpsy      KR       Music               8980066
                                   YT_Title country    YT_category  \
2                                       WWE      US         Sports   
321                       ComedyShortsGamer      GB  Entertainment   
202                                     IGN      US         Gaming   
27   

In [132]:
# add 9 closest friends (1st is same channel - the closest)
data_channels = add_k_closest(data_channels, list(data_to_train.index.values), results_commented_jaccard, 10, 'model3_NN_')

In [133]:
# save results of the 1st AND 2nd AND 3d models:
import csv
data_channels.to_csv('A:\PROGRAMMING\Tubular_results\First_AND_Second_AND_third_model_results.csv', 
                                             sep = ',' ,index= False,encoding = 'utf-8')

In [169]:
data_channels[['model1_NN_1', 'model1_NN_2', 'model2_NN_1', 'model2_NN_2', 'model3_NN_1', 'model3_NN_2']][:3]

Unnamed: 0,model1_NN_1,model1_NN_2,model2_NN_1,model2_NN_2,model3_NN_1,model3_NN_2
0,TaylorSwiftVEVO,OneDirectionVEVO,ELLO,emimusic,RihannaVEVO,TaylorSwiftVEVO
1,Family Fun Pack,Baby Big Mouth,Kid Studio,Blu Toys Surprise Brinquedos & Juegos,Baby Big Mouth,ToyTrains4u
2,Movieclips Trailers,PrankvsPrank,UFC - Ultimate Fighting Championship,SuperHero Reality TV,ComedyShortsGamer,IGN


### Building final model

In [136]:
def extend_k1_with_unique_k2(list1, k1, list2, k2):
    list3 = list1[:k1]
    uniq_list2 = [f for f in list2 if f not in list3]
    list3.extend(uniq_list2[:k2])
    return list3
print(extend_with_unique_k2([1,2,6,4,2,6,6,7,4,4], 5, [2,6,7,9,8,0], 3))

[1, 2, 6, 4, 2, 7, 9, 8]


In [152]:
nn1_list = list()
nn2_list = list()
nn3_list = list()
for i in range(1,10):
    nn1_list.append('model1_NN_'+str(i))
    nn2_list.append('model2_NN_'+str(i))
    nn3_list.append('model3_NN_'+str(i))
print(nn1_list)

['model1_NN_1', 'model1_NN_2', 'model1_NN_3', 'model1_NN_4', 'model1_NN_5', 'model1_NN_6', 'model1_NN_7', 'model1_NN_8', 'model1_NN_9']


In [153]:
a1 = list(data_channels[nn1_list].ix[0].values)
print(a1)
a2 = list(data_channels[nn2_list].ix[0].values)
print(a2)
a3 = list(data_channels[nn3_list].ix[0].values)
print(a3)
a4 = extend_k1_with_unique_k2(a1,6,a2,2)
print(extend_k1_with_unique_k2(a4,8,a3,2))

['TaylorSwiftVEVO', 'OneDirectionVEVO', 'RihannaVEVO', 'ArianaGrandeVevo', 'KatyPerryVEVO', 'SelenaGomezVEVO', 'AdeleVEVO', 'FifthHarmonyVEVO', 'DrakeVEVO']
['ELLO', 'emimusic', 'DrakeVEVO', 'BlackEyedPeasVEVO', 'NeYoVEVO', 'ChainsmokersVEVO', 'NickiMinajAtVEVO', 'Bruno Mars', 'beyonceVEVO']
['RihannaVEVO', 'TaylorSwiftVEVO', 'FifthHarmonyVEVO', 'OneDirectionVEVO', 'ArianaGrandeVevo', 'AdeleVEVO', 'MeghanTrainorVEVO', 'Wiz Khalifa', 'officialpsy']
['TaylorSwiftVEVO', 'OneDirectionVEVO', 'RihannaVEVO', 'ArianaGrandeVevo', 'KatyPerryVEVO', 'SelenaGomezVEVO', 'ELLO', 'emimusic', 'FifthHarmonyVEVO', 'AdeleVEVO']


In [170]:
def add_final(data, names1, k1, names2, k2, names3, k3, prefix, postfix):
    all_final_unique_neighbors = list()
    for i in range(len(data)):
        a1 = list(data[names1].ix[i].values)
        a2 = list(data[names2].ix[i].values)
        a3 = list(data[names3].ix[i].values)
        a4 = extend_k1_with_unique_k2(a1, k1, a2, k2)
        a4 = extend_k1_with_unique_k2(a4, k1+k2, a3, k3)
        all_final_unique_neighbors.append(a4)
    for i in range(10):
        name = prefix + str(i+1)+postfix
        data[name] = [neighbor[i] for neighbor in all_final_unique_neighbors]
    return data

In [162]:
data_channels = add_final(data_channels, nn1_list, 6, nn2_list, 2, nn3_list, 2, 'Final_Neighbor_', '')

In [176]:
data_channels[[ 'YT_Title','Final_Neighbor_1', 'Final_Neighbor_2', 'Final_Neighbor_3', 
               'Final_Neighbor_6', 'Final_Neighbor_8']]

Unnamed: 0,YT_Title,Final_Neighbor_1,Final_Neighbor_2,Final_Neighbor_3,Final_Neighbor_6,Final_Neighbor_8
0,JustinBieberVEVO,TaylorSwiftVEVO,OneDirectionVEVO,RihannaVEVO,SelenaGomezVEVO,emimusic
1,Ryan ToysReview,Family Fun Pack,Baby Big Mouth,FunToyzCollector,Blu Toys Surprise Brinquedos & Juegos,Hulyan Maya
2,WWE,Movieclips Trailers,PrankvsPrank,JoBlo Movie Trailers,CollegeHumor,SuperHero Reality TV
3,Family Fun Pack,Ryan ToysReview,Toy Freaks,Surprise Collector & Sunshine,Toy Monster,Vat19
4,T-Series,Eros Now,Zee Music Company,SonyMusicIndiaVEVO,SET India,Rajshri
5,LittleBabyBum ®,ChuChu TV Nursery Rhymes,Mother Goose Club,Baby Big Mouth,CVS 3D Rhymes,LittleBabyBum ® Español
6,netd müzik,DokuzSekiz Müzik,OHA diyorum!,YAPYAP,Scorp App,atv
7,Маша и Медведь,Get Movies,Masha and The Bear,ChuChu TV Nursery Rhymes,Маша та Ведмідь,Teremok TV
8,Toy Monster,HeroesIRL,SuperHeroes ForReal,Superheroes Life S. L.,SuperHero VS SuperHero,Acunn.com
9,Get Movies,Маша и Медведь,LittleBabyBum ®,Baby Big Mouth,Mother Goose Club,Teremok TV


In [171]:
## ADD FINAL NEIGHBORS IDS:
nn1_list_id = list()
nn2_list_id = list()
nn3_list_id = list()
for i in range(1,10):
    nn1_list_id.append('model1_NN_'+str(i)+'_id')
    nn2_list_id.append('model2_NN_'+str(i)+'_id')
    nn3_list_id.append('model3_NN_'+str(i)+'_id')
print(nn1_list_id)

data_channels = add_final(data_channels, nn1_list_id, 6, nn2_list_id, 2, nn3_list_id, 2, 'Final_Neighbor_', '_ID')

['model1_NN_1_id', 'model1_NN_2_id', 'model1_NN_3_id', 'model1_NN_4_id', 'model1_NN_5_id', 'model1_NN_6_id', 'model1_NN_7_id', 'model1_NN_8_id', 'model1_NN_9_id']


In [172]:
data_channels[[ 'YT_Title','Final_Neighbor_1', 'Final_Neighbor_1_ID', 'Final_Neighbor_8', 'Final_Neighbor_8_ID']]

Unnamed: 0,YT_Title,Final_Neighbor_1,Final_Neighbor_1_ID,Final_Neighbor_8,Final_Neighbor_8_ID
0,JustinBieberVEVO,TaylorSwiftVEVO,ANLZYMidaCbLQFWXBC95Jg,emimusic,2kTZB_yeYgdAg4wP2tEryA
1,Ryan ToysReview,Family Fun Pack,vthuVsurPaVz2a7_4LepGg,Hulyan Maya,3AgxXIspxGB8Mb-6bO9YlA
2,WWE,Movieclips Trailers,i8e0iOVk1fEOogdfu4YgfA,SuperHero Reality TV,hikza6yUVgrs7L879pzZyw
3,Family Fun Pack,Ryan ToysReview,hGJGhZ9SOOHvBB0Y4DOO_w,Vat19,DRbNGFusqlXX4a5vwi9ouQ
4,T-Series,Eros Now,X52tYZiEh_mHoFja3Veciw,Rajshri,EKWXRsfUHkan-D_ljU8Asw
5,LittleBabyBum ®,ChuChu TV Nursery Rhymes,BnZ16ahKA2DZ_T5W0FPUXg,LittleBabyBum ® Español,HicabXz9rUMWLcdMqBtbxQ
6,netd müzik,DokuzSekiz Müzik,bCFxJq-fCwiEo9CgowI7CQ,atv,UVZ7T_kwkxDOGFcDlFI-hg
7,Маша и Медведь,Get Movies,lZkHt2kNIgyrTTPnSQV3SA,Teremok TV,xWkNmkeAL52VTJyBlIJRXw
8,Toy Monster,HeroesIRL,k_GTlc9ARh6MuU61DUemnQ,Acunn.com,LBwKhRQL0ORY3GB8c2l5cg
9,Get Movies,Маша и Медведь,Rv76wLBC73jiP7LX4C3l8Q,Teremok TV,xWkNmkeAL52VTJyBlIJRXw


In [173]:
# save results of the 1st AND 2nd AND 3d models:
import csv
data_channels.to_csv('A:\PROGRAMMING\Final_results.csv', 
                                             sep = ',' ,index= False,encoding = 'utf-8')