# **Importing libraries**

In [None]:
# importing python libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings('ignore')

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import string

# **Load Dataset**

In [None]:
ted_df=pd.read_csv("ted_talks.csv")
movies_df=pd.read_csv("movies_meta.csv")

In [None]:
## check our ted data
ted_df.head()

Unnamed: 0,transcript,url
0,Good morning. How are you?(Laughter)It's been ...,https://www.ted.com/talks/ken_robinson_says_sc...
1,"Thank you so much, Chris. And it's truly a gre...",https://www.ted.com/talks/al_gore_on_averting_...
2,"(Music: ""The Sound of Silence,"" Simon & Garfun...",https://www.ted.com/talks/david_pogue_says_sim...
3,If you're here today — and I'm very happy that...,https://www.ted.com/talks/majora_carter_s_tale...
4,"About 10 years ago, I took on the task to teac...",https://www.ted.com/talks/hans_rosling_shows_t...


# **preprocess movie data**

In [None]:
##Check our movie data
movies_df.head()

Unnamed: 0,adult,belongs_to_collection,budget,genres,homepage,id,imdb_id,original_language,original_title,overview,...,release_date,revenue,runtime,spoken_languages,status,tagline,title,video,vote_average,vote_count
0,False,"{'id': 10194, 'name': 'Toy Story Collection', ...",30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",http://toystory.disney.com/toy-story,862,tt0114709,en,Toy Story,"Led by Woody, Andy's toys live happily in his ...",...,1995-10-30,373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,,Toy Story,False,7.7,5415.0
1,False,,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",,8844,tt0113497,en,Jumanji,When siblings Judy and Peter discover an encha...,...,1995-12-15,262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,Roll the dice and unleash the excitement!,Jumanji,False,6.9,2413.0
2,False,"{'id': 645, 'name': 'James Bond Collection', '...",58000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 28, '...",http://www.mgm.com/view/movie/757/Goldeneye/,710,tt0113189,en,GoldenEye,James Bond must unmask the mysterious head of ...,...,1995-11-16,352194034.0,130.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,No limits. No fears. No substitutes.,GoldenEye,False,6.6,1194.0
3,False,,3600000,"[{'id': 18, 'name': 'Drama'}, {'id': 10749, 'n...",http://www.mgm.com/title_title.do?title_star=L...,451,tt0113627,en,Leaving Las Vegas,"Ben Sanderson, an alcoholic Hollywood screenwr...",...,1995-10-27,49800000.0,112.0,"[{'iso_639_1': 'en', 'name': 'English'}]",Released,I Love You... The Way You Are.,Leaving Las Vegas,False,7.1,365.0
4,False,,29500000,"[{'id': 878, 'name': 'Science Fiction'}, {'id'...",,63,tt0114746,en,Twelve Monkeys,"In the year 2035, convict James Cole reluctant...",...,1995-12-29,168840000.0,129.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",Released,The future is history.,Twelve Monkeys,False,7.4,2470.0


In [None]:
## check the columns of our movie data
movies_df.columns

Index(['adult', 'belongs_to_collection', 'budget', 'genres', 'homepage', 'id',
       'imdb_id', 'original_language', 'original_title', 'overview',
       'popularity', 'poster_path', 'production_companies',
       'production_countries', 'release_date', 'revenue', 'runtime',
       'spoken_languages', 'status', 'tagline', 'title', 'video',
       'vote_average', 'vote_count'],
      dtype='object')

In [None]:
## we need only the data from movies is the title,id and overview column so let's extract that
movie_df=movies_df[['id','original_title', 'overview']]
movie_df.head()

Unnamed: 0,id,original_title,overview
0,862,Toy Story,"Led by Woody, Andy's toys live happily in his ..."
1,8844,Jumanji,When siblings Judy and Peter discover an encha...
2,710,GoldenEye,James Bond must unmask the mysterious head of ...
3,451,Leaving Las Vegas,"Ben Sanderson, an alcoholic Hollywood screenwr..."
4,63,Twelve Monkeys,"In the year 2035, convict James Cole reluctant..."


In [None]:
movie_df.shape

(4690, 3)

In [None]:
##check is there any missing data in our  movies dataset
movie_df.isnull().sum()

id                 0
original_title     0
overview          33
dtype: int64

In [None]:
## let's drop the rows that are missing in our movies dataset
movie_df.dropna(axis=0,inplace=True)

In [None]:
#merge overview and original_title both columns and make single column named as Subtitle
movie_df['Subtitle']=movie_df['original_title']+movie_df['overview']

In [None]:
# Preprocess the movie data
#converting text into lower case
movie_df["Subtitle"] = movie_df["Subtitle"].str.lower()

# **preprocess ted data**

In [None]:
ted_df.head()

Unnamed: 0,transcript,url
0,Good morning. How are you?(Laughter)It's been ...,https://www.ted.com/talks/ken_robinson_says_sc...
1,"Thank you so much, Chris. And it's truly a gre...",https://www.ted.com/talks/al_gore_on_averting_...
2,"(Music: ""The Sound of Silence,"" Simon & Garfun...",https://www.ted.com/talks/david_pogue_says_sim...
3,If you're here today — and I'm very happy that...,https://www.ted.com/talks/majora_carter_s_tale...
4,"About 10 years ago, I took on the task to teac...",https://www.ted.com/talks/hans_rosling_shows_t...


In [None]:
# check if there missing values exit
ted_df.isnull().sum()

transcript    0
url           0
dtype: int64

we can see that dataset has no missing values.

In [None]:
# Preprocess the ted talk data
#converting text into lower case
ted_df["transcript"] = ted_df["transcript"].str.lower()

# **Merge the dataset**

In [None]:
# Combine the movie and TED talk data
combined_df = pd.concat([movie_df[['id',"original_title", "overview", "Subtitle"]], ted_df[["url", "transcript"]]],
                        axis=1, ignore_index=True)
combined_df.columns =['id',"original_title", "overview", "Subtitle","url", "transcript"]

In [None]:
#shape of ted data
ted_df.shape

(2467, 2)

In [None]:
# shape of movie data
movie_df.shape

(4657, 4)

In [None]:
# So we can see that the shape of both dataset was not matched so we concat that dataset so we need to remove null rows
combined_df.dropna(inplace=True)

In [None]:
# shape of our final combined data
combined_df.shape

(2464, 6)

In [None]:
combined_df.head()

Unnamed: 0,id,original_title,overview,Subtitle,url,transcript
0,862.0,Toy Story,"Led by Woody, Andy's toys live happily in his ...","toy storyled by woody, andy's toys live happil...",https://www.ted.com/talks/ken_robinson_says_sc...,good morning. how are you?(laughter)it's been ...
1,8844.0,Jumanji,When siblings Judy and Peter discover an encha...,jumanjiwhen siblings judy and peter discover a...,https://www.ted.com/talks/al_gore_on_averting_...,"thank you so much, chris. and it's truly a gre..."
2,710.0,GoldenEye,James Bond must unmask the mysterious head of ...,goldeneyejames bond must unmask the mysterious...,https://www.ted.com/talks/david_pogue_says_sim...,"(music: ""the sound of silence,"" simon & garfun..."
3,451.0,Leaving Las Vegas,"Ben Sanderson, an alcoholic Hollywood screenwr...","leaving las vegasben sanderson, an alcoholic h...",https://www.ted.com/talks/majora_carter_s_tale...,if you're here today — and i'm very happy that...
4,63.0,Twelve Monkeys,"In the year 2035, convict James Cole reluctant...","twelve monkeysin the year 2035, convict james ...",https://www.ted.com/talks/hans_rosling_shows_t...,"about 10 years ago, i took on the task to teac..."


# **Tokenization of data**

In [None]:
import nltk
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [None]:
import nltk
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


True

In [None]:
# Preprocess the subtitle and transcript data
nltk.download('stopwords')
nltk.download('wordnet')

stop_words = set(stopwords.words("english"))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()

    # Remove punctuation
    text = text.translate(str.maketrans(" ", " ", string.punctuation))

    # Tokenize text
    tokens = nltk.word_tokenize(text)

    # Remove stop words
    tokens = [token for token in tokens if token not in stop_words]

    # Lemmatize words
    tokens = [lemmatizer.lemmatize(token) for token in tokens]

    # Join tokens into a string
    text = " ".join(tokens)

    return text

combined_df["Subtitle"] = combined_df["Subtitle"].apply(preprocess_text)
combined_df["transcript"] = combined_df["transcript"].apply(preprocess_text)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


This above Python code block that contains a function and some code to preprocess subtitle and transcript data using the Natural Language Toolkit (nltk) library.

The code begins by downloading two resources from the nltk library: the stopwords list and the WordNetLemmatizer. Stop words are words that are commonly used in a language but don't carry much meaning, such as "the," "and," and "of." The WordNetLemmatizer is a tool for reducing words to their base or dictionary form, such as "run" to "ran" or "running" to "run".

The main function defined here is preprocess_text, which takes a single text string as input and returns a processed version of it. The function performs several text processing steps to clean and standardize the input text:



*  Convert all characters to lowercase using the .lower() method.
*   Remove all punctuation using the .translate() method and the string.punctuation constant.


*   Tokenize the text into a list of words using nltk.word_tokenize().
*  Remove any stop words from the list of tokens using a list comprehension

* Lemmatize each token in the list using the lemmatizer.lemmatize() method.
*  Join the list of tokens back into a single string using the .join() method.


After defining the preprocess_text function, the code applies it to two columns of a Pandas DataFrame called combined_df: the "Subtitle" and "transcript" columns. The apply() method is used to apply the preprocess_text function to each element of these columns, so that the entire columns are transformed into preprocessed versions of the text data.

# **vectorization and Cosine Matrix**

In [None]:
# Create a TF-IDF vectorizer
vectorizer = TfidfVectorizer()

# Create a TF-IDF matrix
tfidf_matrix = vectorizer.fit_transform(combined_df["Subtitle"])

# Compute the cosine similarity matrix
cosine_sim = cosine_similarity(tfidf_matrix, vectorizer.transform(combined_df["transcript"]))

This above Python code block creates a TF-IDF (Term Frequency-Inverse Document Frequency) vectorizer and matrix, and then uses them to compute the cosine similarity matrix between two text data columns in a Pandas DataFrame.

The TfidfVectorizer() function from the scikit-learn library is used to create a vectorizer that will convert the preprocessed text data in the "Subtitle" column of the combined_df DataFrame into a matrix of TF-IDF weights. TF-IDF is a numerical statistic that reflects the importance of a word in a document, based on how often it appears in the document (term frequency) and how common it is across all documents in a corpus (inverse document frequency). The resulting matrix will have one row for each document (i.e., subtitle) and one column for each unique word in the entire corpus.

The fit_transform() method is then used to fit the vectorizer to the preprocessed "Subtitle" column and transform it into the TF-IDF matrix. The resulting matrix is stored in a variable called tfidf_matrix.

The final step is to compute the cosine similarity matrix between the TF-IDF matrix of the "Subtitle" column and the vectorized "transcript" column of the combined_df DataFrame. Cosine similarity is a measure of the similarity between two non-zero vectors, which is commonly used in natural language processing to compare the similarity of two pieces of text. The cosine_similarity() function from the scikit-learn library is used to compute the cosine similarity matrix, which is stored in a variable called cosine_sim. The resulting matrix will have the same number of rows as the "Subtitle" column and the same number of columns as the "transcript" column, and each element will represent the cosine similarity between the TF-IDF vector of a subtitle and the TF-IDF vector of a transcript.

In [None]:
subtitle=combined_df.loc[movie_df['original_title'] =='Killing Bono', 'Subtitle']

In [None]:
subtitle.values

array([], dtype=object)

# **Recommendation system**

In [None]:
def recommend_transcripts(title, cosine_sim, data, n=10):
    # Get the row index of the given subtitle in the cosine similarity matrix
    subtitle=data.loc[combined_df['original_title'] ==title, 'Subtitle'].values[0]
    idx = data.loc[data["Subtitle"]==subtitle].index[0]

    # Get the cosine similarity scores for all transcripts
    transcript_scores = list(enumerate(cosine_sim[idx]))

    # Sort the transcripts by cosine similarity scores
    transcript_scores = sorted(transcript_scores, key=lambda x: x[1], reverse=True)

    # Get the top N transcripts
    top_transcripts = transcript_scores[:n]

    # Get the URLs and transcripts of the top N transcripts
    urls = []
    transcripts = []
    for transcript_idx, _ in top_transcripts:
        urls.append(data.loc[transcript_idx, "url"])
        transcripts.append(data.loc[transcript_idx, "transcript"])

    # Return the top N transcript URLs and transcripts
    return urls, transcripts

This above Python code block defines a function called recommend_transcripts that takes a subtitle, a cosine similarity matrix, and a Pandas DataFrame containing transcript data as input, and returns the URLs and transcripts of the most similar transcripts to the given subtitle.

The function begins by finding the row index of the given subtitle in the cosine similarity matrix by locating the row in the DataFrame data that has a "Subtitle" column value that matches the given subtitle, and then getting its index value using the .index[0] method.

The function then retrieves the cosine similarity scores between the given subtitle and all the transcripts by creating a list of tuples where each tuple contains an index value for a transcript and its corresponding cosine similarity score, using the enumerate() method and the previously computed idx variable to access the cosine similarity scores in the cosine similarity matrix.

Next, the transcript scores are sorted in descending order using the sorted() method, and the top n transcripts are selected using Python's slice notation.

The function then retrieves the URLs and transcripts of the top transcripts by iterating over the top_transcripts list and using the transcript index values to access the corresponding "url" and "transcript" values in the data DataFrame.

Finally, the function returns two lists: one containing the top n transcript URLs and another containing their transcripts. These lists are created using Python's append() method to add each transcript's URL and transcript to the appropriate list within the for loop.

Overall, the recommend_transcripts function allows users to input a subtitle and obtain a list of the top n transcripts that are most similar to it, along with their corresponding URLs and transcripts.

In [None]:
# Example usage
urls, transcripts=recommend_transcripts("Syn Babilonu", cosine_sim, combined_df, n=2)


In [None]:
print("Recommended TED talk URLs:")
for i in range(0,len(urls)):
  print(urls[i])

Recommended TED talk URLs:
https://www.ted.com/talks/michael_metcalfe_we_need_money_for_aid_so_let_s_print_it

https://www.ted.com/talks/robin_nagle_what_i_discovered_in_new_york_city_trash



In [None]:
print("Recommended TED talk transcripts:")
for i in range(0,len(transcripts)):
  print(transcripts[i])

Recommended TED talk transcripts:
thirteen year ago set goal end poverty success weve hit big hurdle aftermath financial crisis begun hit aid payment fallen two consecutive year question whether lesson learned saving financial system used help u overcome hurdle help million simply print money aidsurely common reaction laughter quick talk others channel john mcenroe seriousnow cant accent serious thanks two child youll learn much heart talk left pia life england two loving parent one standing right dorothy right life rural kenya shes one 13000 orphan vulnerable child assisted charity support believe dorothy like pia deserves best life chance afford give youll agree im sure un agrees overriding aim international aid strive life dignity allbut — here hurdle — afford aid aspiration history suggests 1970 government set target increase overseas aid payment 07 percent national income see big gap open actual aid target come millennium development goal eight ambitious target met 2015 tell one t