In [51]:
import pandas as pd
import scipy.sparse as sp
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import os

In [52]:
video_id = '2kyS6SvSYSE'

In [53]:
video_data = pd.read_csv('../data/processed.csv')
video_data['title'] = video_data['title'].str.lower()
video_data.head(5)

Unnamed: 0.1,Unnamed: 0,video_id,title,channel_title,publish_time,tags,views,likes,dislikes,comment_count,thumbnail_link,description
0,10,9wRQljFNDW8,dion lewis' 103-yd kick return td vs. denver! ...,NFL,2017-11-13T02:05:26.000Z,"['NFL', 'Football', 'offense', 'defense', 'afc...",81377.0,655.0,25.0,177.0,https://i.ytimg.com/vi/9wRQljFNDW8/default.jpg,New England Patriots returner Dion Lewis blast...
1,36,Om_zGhJLZ5U,tl;dw - every dceu movie before justice league,Screen Junkies,2017-11-12T18:00:03.000Z,"['screenjunkies', 'screen junkies', 'sj news',...",288922.0,7515.0,792.0,2111.0,https://i.ytimg.com/vi/Om_zGhJLZ5U/default.jpg,With Justice League approaching fast we rewatc...
2,41,goP4Z5wyOlM,iraq-iran earthquake: deadly tremor hits borde...,BBC News,2017-11-12T21:16:40.000Z,"['bbc', 'bbc news', 'news', 'iran', 'iran news...",34785.0,308.0,26.0,413.0,https://i.ytimg.com/vi/goP4Z5wyOlM/default.jpg,A strong 7.2-magnitude earthquake has rattled ...
3,55,8NHA23f7LvU,jason momoa wows hugh grant with some dothraki...,The Graham Norton Show,2017-11-10T19:06:23.000Z,"['Graham Norton', 'Graham Norton Show Official...",1496225.0,16116.0,236.0,605.0,https://i.ytimg.com/vi/8NHA23f7LvU/default.jpg,I think Sarah Millican was very excited for th...
4,76,IE-xepGLVt8,mayo clinic's first face transplant patient me...,Mayo Clinic,2017-11-10T12:04:17.000Z,"['Mayo Clinic', 'Health Care (Issue)', 'Health...",237307.0,1896.0,74.0,260.0,https://i.ytimg.com/vi/IE-xepGLVt8/default.jpg,One and a half years after the surgery that tr...


In [67]:
combined_video_data = video_data.drop(columns=['video_id', 'title', 'publish_time', 'views', 'likes', 'dislikes', 'comment_count', 'thumbnail_link', 'description'])
combined_video_data['combined'] = combined_video_data[combined_video_data.columns[1:3]].apply(lambda x: ','.join(x.dropna().astype(str)),axis=1)
combined_video_data = combined_video_data.drop(columns=['tags','channel_title'])
combined_video_data = combined_video_data.iloc[:,1:]
combined_video_data.head(5)

Unnamed: 0,combined
0,"NFL,['NFL', 'Football', 'offense', 'defense', ..."
1,"Screen Junkies,['screenjunkies', 'screen junki..."
2,"BBC News,['bbc', 'bbc news', 'news', 'iran', '..."
3,"The Graham Norton Show,['Graham Norton', 'Grah..."
4,"Mayo Clinic,['Mayo Clinic', 'Health Care (Issu..."


In [68]:
# Create Bag of Words Matrix for Tags
count = CountVectorizer(stop_words='english')
count_matrix = count.fit_transform(combined_video_data['combined'])
count_matrix

<6436x23942 sparse matrix of type '<class 'numpy.int64'>'
	with 139196 stored elements in Compressed Sparse Row format>

In [69]:
# Create TFIDF Matrix for Description
# Use TFIDF since description words appearing less is more
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(video_data['description'].values.astype('U'))
tfidf_matrix

<6436x66329 sparse matrix of type '<class 'numpy.float64'>'
	with 460201 stored elements in Compressed Sparse Row format>

In [70]:
# Run Cosine Similarity on Sparse Matrix
combine_sparse = sp.hstack([count_matrix, tfidf_matrix], format='csr')
cosine_sim = cosine_similarity(combine_sparse, combine_sparse)
cosine_sim

array([[1.00000000e+00, 3.39740286e-04, 3.45938170e-04, ...,
        3.44861649e-04, 2.18475090e-04, 1.36670494e-03],
       [3.39740286e-04, 1.00000000e+00, 6.45338768e-02, ...,
        2.47520136e-04, 6.22415771e-02, 1.15957837e-03],
       [3.45938170e-04, 6.45338768e-02, 1.00000000e+00, ...,
        2.95428492e-04, 1.17995611e-03, 7.96371598e-04],
       ...,
       [3.44861649e-04, 2.47520136e-04, 2.95428492e-04, ...,
        1.00000000e+00, 2.72511611e-04, 1.26398343e-03],
       [2.18475090e-04, 6.22415771e-02, 1.17995611e-03, ...,
        2.72511611e-04, 1.00000000e+00, 5.86890802e-02],
       [1.36670494e-03, 1.15957837e-03, 7.96371598e-04, ...,
        1.26398343e-03, 5.86890802e-02, 1.00000000e+00]])

In [71]:
transform_video_data = cosine_sim

In [72]:
num_recs = 10

# Get Video's Index from Video ID
indices = pd.Series(video_data.index, index = video_data['video_id'])
index = indices[video_id]
index

341

In [73]:
sim_scores = list(enumerate(transform_video_data[index]))
sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)
sim_scores = sim_scores[1:num_recs]
sim_scores

[(256, 0.4706990744550658),
 (1625, 0.4665606722812171),
 (5590, 0.36981194253478966),
 (1159, 0.36310362510504895),
 (3290, 0.35926989874964016),
 (5288, 0.3577589898653427),
 (3493, 0.33564938914488596),
 (2609, 0.2943402133216258),
 (4344, 0.28582084284514575)]

In [74]:
# Get Indices of Similar Scores
recommended_indices = [i[0] for i in sim_scores]

# Grab Data from Indices
video_id = video_data['video_id'].iloc[recommended_indices]
video_title = video_data['title'].iloc[recommended_indices]
video_tags = video_data['tags'].iloc[recommended_indices]
video_channel = video_data['channel_title'].iloc[recommended_indices]
video_time = video_data['publish_time'].iloc[recommended_indices]
video_thumbnail = video_data['thumbnail_link'].iloc[recommended_indices]
video_description = video_data['description'].iloc[recommended_indices]

# Throw Data into DataFrame to return
recommended_data = pd.DataFrame(columns=['id','title','tags', 'channel', 'date', 'thumbnail', 'description'])
recommended_data['id'] = video_id
recommended_data['title'] = video_title
recommended_data['tags'] = video_tags
recommended_data['channel'] = video_channel
recommended_data['date'] = video_time
recommended_data['thumbnail'] = video_thumbnail
recommended_data['description'] = video_description

recommended_data

Unnamed: 0,id,title,tags,channel,date,thumbnail,description
256,vWYyZMH_QjA,casey neistat made a video about me,"['galexy note 8', 'samsung creators', 'neistat...",Shantell Martin,2017-11-13T17:26:03.000Z,https://i.ytimg.com/vi/vWYyZMH_QjA/default.jpg,I've know Casey for years now and we have want...
1625,a7NJ6Gek9v4,all time greatest airplane seat - emirates fir...,['Emirates first class'],CaseyNeistat,2017-12-11T23:45:03.000Z,https://i.ytimg.com/vi/a7NJ6Gek9v4/default.jpg,MY DOPE MERCH - https://shopcaseyneistat.com/\...
5590,og3aFSeEm94,just smack it.,['[none]'],CaseyNeistat,2018-04-20T15:47:24.000Z,https://i.ytimg.com/vi/og3aFSeEm94/default.jpg,DAN! https://www.youtube.com/DanmaceMusic: Bla...
1159,GcbsIv3QdFs,it's here!!! casey neistat merch,['[none]'],CaseyNeistat,2017-12-03T16:40:05.000Z,https://i.ytimg.com/vi/GcbsIv3QdFs/default.jpg,https://shopcaseyneistat.com\nhttps://shopcase...
3290,lfIhBu0TZBs,moving on from beme,['[none]'],CaseyNeistat,2018-01-25T14:58:16.000Z,https://i.ytimg.com/vi/lfIhBu0TZBs/default.jpg,this is hard for me. i rerecorded this video ...
5288,HQiDKE85P3U,some thoughts on the shooting at youtube,['[none]'],CaseyNeistat,2018-04-04T18:48:42.000Z,https://i.ytimg.com/vi/HQiDKE85P3U/default.jpg,https://nyti.ms/2q1IjTV\nhttps://wapo.st/2GvQR...
3493,sXe2fzd8als,"'martin' cast reunites, plays it coy about tv ...","['TMZ', 'Hollywood', 'Celebrity', 'Entertainme...",TMZ,2018-02-02T13:59:44.000Z,https://i.ytimg.com/vi/sXe2fzd8als/default.jpg,The cast of Martin is back together -- just fo...
2609,2dkl0fUKHAM,inside ricky martin's serene los angeles home ...,"['ricky martin', 'jwan yosef', 'ricky martin 2...",Architectural Digest,2018-01-09T13:00:44.000Z,https://i.ytimg.com/vi/2dkl0fUKHAM/default.jpg,Superstar singer Ricky Martin -- who plays Ant...
4344,e-XM8pek6Kg,ricky martin - fiebre (audio),"['Ricky Martin Music', 'Ricky Martin Official ...",RickyMartinVEVO,2018-02-23T05:00:00.000Z,https://i.ytimg.com/vi/e-XM8pek6Kg/default.jpg,Ricky Martin - Fiebre (Audio)\n“Fiebre”is now ...
