In [1]:
# Content based recommendation system
# Cosine similarity - ratings are seen as vectors in n -dimensional space 
# and the similarity is calculated based on the angle between these vectors
import pandas as pd
import numpy as np


In [2]:
data = pd.read_csv('./tedx_dataset.csv')

In [3]:
data.head()

Unnamed: 0,idx,main_speaker,title,details,posted,url,num_views
0,8d2005ec35280deb6a438dc87b225f89,Alexandra Auer,The intangible effects of walls,More barriers exist now than at the end of Wor...,Posted Apr 2020,https://www.ted.com/talks/alexandra_auer_the_i...,
1,b3072cd11f40eb57fd259555264476c6,Elizabeth Gilbert,It's OK to feel overwhelmed. Here's what to do...,If you're feeling anxious or fearful during th...,Posted Apr 2020,https://www.ted.com/talks/elizabeth_gilbert_it...,
2,4adc9fee977fa04c357ed4c9b52aa3cc,Butterscotch,"""Accept Who I Am""","Firing off her formidable beatboxing skills, m...",Posted Apr 2020,https://www.ted.com/talks/butterscotch_accept_...,0.0
3,59c641a72b495d522a7932145d6c02b3,Ethan Lisi,What it's really like to have autism,"""Autism is not a disease; it's just another wa...",Posted Apr 2020,https://www.ted.com/talks/ethan_lisi_what_it_s...,
4,d227f2faf6ec185e54436d86463f499b,Daniel Finkel,Can you solve the sea monster riddle?,"According to legend, once every thousand years...",Posted Apr 2020,https://www.ted.com/talks/daniel_finkel_can_yo...,0.0


In [4]:
data.columns

Index(['idx', 'main_speaker', 'title', 'details', 'posted', 'url',
       'num_views'],
      dtype='object')

In [7]:
df = data[['details', 'url', 'title']].copy()
df.reset_index()
df.head()

Unnamed: 0,details,url,title
0,More barriers exist now than at the end of Wor...,https://www.ted.com/talks/alexandra_auer_the_i...,The intangible effects of walls
1,If you're feeling anxious or fearful during th...,https://www.ted.com/talks/elizabeth_gilbert_it...,It's OK to feel overwhelmed. Here's what to do...
2,"Firing off her formidable beatboxing skills, m...",https://www.ted.com/talks/butterscotch_accept_...,"""Accept Who I Am"""
3,"""Autism is not a disease; it's just another wa...",https://www.ted.com/talks/ethan_lisi_what_it_s...,What it's really like to have autism
4,"According to legend, once every thousand years...",https://www.ted.com/talks/daniel_finkel_can_yo...,Can you solve the sea monster riddle?


In [10]:
'''
TF-IDF vectorizer - "Term Frequency-Inverse Document Frequency"
technique to convert text data into numerical format suitable for machine learning
bigrams (pairs of consecutive words) and unigrams (single words), with English stop words removed

transformed into TF-IDF matrices using the vectorizers - numerical form that captures the importance of words

cosine similarity - measures similarity of two documents based on vectorized representations

resulting similarity matrices (bi_sim and uni_sim) will be used for recommendation
'''

from sklearn.feature_extraction import text

ted_talks = df['details'].to_list()

# TfidfVectorizer - weighs word counts by how often they appear
# ngram_range - n-gram is just a string of n words in a row
# "I AM HERE" contains 2 grams 'I AM' & 'AM HERE', sentence itself is 3 grams

bi_tfidf = text.TfidfVectorizer(stop_words="english", ngram_range=(1,2))
bi_matrix = bi_tfidf.fit_transform(ted_talks)

In [12]:
uni_tfidf = text.TfidfVectorizer(stop_words="english")
uni_matrix = uni_tfidf.fit_transform(ted_talks)

In [13]:
from sklearn.metrics.pairwise import cosine_similarity

bi_sim = cosine_similarity(bi_matrix)
uni_sim = cosine_similarity(uni_matrix)

In [18]:
'''
 similarity vector x and sorts it in descending order
 finds the indices of the TED Talks that are most similar (highest values)
 returns the titles of these recommended talks

 generate recommended TED Talks for each TED Talk in the dataset
 stored under "ted_talks_uni" and "ted_talks_bi" on the dataframe
'''

def recommend_ted_talks(x):
    return ".".join(data["title"].loc[x.argsort()[-5:-1]])

df["ted_talks_uni"] = [recommend_ted_talks(x) for x in uni_sim]
df["ted_talks_bi"] = [recommend_ted_talks(x) for x in bi_sim]

df['ted_talks_uni'].str.replace("_", " ").str.upper().str.strip().str.split("\n")[0]

['MY FATHER THE FORGER.HOW POLICE AND THE PUBLIC CAN CREATE SAFER NEIGHBORHOODS TOGETHER.KIDS CAN TEACH THEMSELVES.THE QUEST TO UNDERSTAND CONSCIOUSNESS']

In [17]:
df.head()

Unnamed: 0,details,url,title,ted_talks_uni,ted_talks_bi
0,More barriers exist now than at the end of Wor...,https://www.ted.com/talks/alexandra_auer_the_i...,The intangible effects of walls,My father the forger.How police and the public...,"Why should you read ""Lord of the Flies"" by Wil..."
1,If you're feeling anxious or fearful during th...,https://www.ted.com/talks/elizabeth_gilbert_it...,It's OK to feel overwhelmed. Here's what to do...,How we must respond to the coronavirus pandemi...,Why sleep matters now more than ever.How to cr...
2,"Firing off her formidable beatboxing skills, m...",https://www.ted.com/talks/butterscotch_accept_...,"""Accept Who I Am""",Three anti-social skills to improve your writi...,Imaginative sculptures that explore how we per...
3,"""Autism is not a disease; it's just another wa...",https://www.ted.com/talks/ethan_lisi_what_it_s...,What it's really like to have autism,A new way to diagnose autism.The world needs a...,A new way to diagnose autism.The world needs a...
4,"According to legend, once every thousand years...",https://www.ted.com/talks/daniel_finkel_can_yo...,Can you solve the sea monster riddle?,How better transportation can reinvigorate a c...,Can you solve the troll's paradox riddle?.Can ...
