In [4]:
import pandas as pd

# Load the cleaned TED Talks dataset
df = pd.read_csv('tedx_dataset.csv')

# Display the first few rows to understand its structure
print(df.head())

# Keep only the metadata columns used by the recommender. Only 'title' is used as
# text content below; the remaining columns are carried along for display.
# Take an explicit copy: columns such as 'title_cleaned' are added to `data`
# later, and writing into a slice of `df` would raise SettingWithCopyWarning.
data = df[['title', 'author', 'date', 'views', 'likes', 'link']].copy()


                                               title                author  \
0      Climate action needs new frontline leadership  Ozawa Bineshi Albert   
1        The dark history of the overthrow of Hawaii         Sydney Iaukea   
2     How play can spark new ideas for your business         Martin Reeves   
3  Why is China appointing judges to combat clima...     James K. Thornton   
4     Cement's carbon problem — and 2 ways to fix it       Mahendra Singhi   

             date   views  likes  \
0   December 2021  404000  12000   
1   February 2022  214000   6400   
2  September 2021  412000  12000   
3    October 2021  427000  12000   
4    October 2021    2400     72   

                                                link  
0  https://ted.com/talks/ozawa_bineshi_albert_cli...  
1  https://ted.com/talks/sydney_iaukea_the_dark_h...  
2  https://ted.com/talks/martin_reeves_how_play_c...  
3  https://ted.com/talks/james_k_thornton_why_is_...  
4  https://ted.com/talks/mahendra_singh

In [5]:
import string
import nltk
from nltk.corpus import stopwords

# Fetch the NLTK 'stopwords' corpus so that stopwords.words('english') can be
# loaded by preprocess_text below. The logged output shows this reporting the
# package as already up-to-date when it is present locally.
nltk.download('stopwords')

# Translation table that deletes every ASCII punctuation character.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

# Stopword sets keyed by language, filled lazily so the corpus is read only once
# instead of once per row when preprocess_text is used with Series.apply.
_STOPWORDS_BY_LANGUAGE = {}


def _stopword_set(language='english'):
    """Return the NLTK stopword list for `language` as a frozenset, loading it once."""
    if language not in _STOPWORDS_BY_LANGUAGE:
        _STOPWORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))
    return _STOPWORDS_BY_LANGUAGE[language]


def preprocess_text(text):
    """Lower-case `text`, drop English stopwords and remove ASCII punctuation.

    Parameters
    ----------
    text : str
        Raw text (e.g. a talk title). Any non-string value, such as the NaN
        pandas uses for a missing cell, is treated as empty text.

    Returns
    -------
    str
        The remaining words joined by single spaces; '' when nothing is left.

    Notes
    -----
    Each whitespace-separated token is checked against the stopword list twice:
    once with only its leading/trailing punctuation trimmed, so contractions
    listed with their apostrophe (e.g. "don't") are recognised, and once after
    all punctuation has been deleted, which matches the previous behaviour
    (e.g. "it's" -> "its").
    """
    if not isinstance(text, str):
        return ''

    stopword_set = _stopword_set('english')
    kept_words = []
    for token in text.lower().split():
        # Check before internal punctuation is deleted, otherwise "don't"
        # would become "dont" and no longer match the stopword list.
        if token.strip(string.punctuation) in stopword_set:
            continue
        word = token.translate(_PUNCTUATION_TABLE)
        # Tokens made only of punctuation collapse to '' and are dropped.
        if word and word not in stopword_set:
            kept_words.append(word)
    return ' '.join(kept_words)

# Preprocess the 'title' column into 'title_cleaned'. `assign` returns a new
# frame rather than writing a column into `data` in place, which avoids
# SettingWithCopyWarning when `data` is a slice of `df` and keeps this cell
# safe to re-run.
data = data.assign(title_cleaned=data['title'].apply(preprocess_text))


[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/iamashu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [6]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit a word-level TF-IDF model on the cleaned titles, capped at 1000 features.
vectorizer = TfidfVectorizer(analyzer='word', max_features=1000)
tfidf_matrix = vectorizer.fit_transform(data['title_cleaned'])

# Optionally, convert tfidf_matrix to a DataFrame for easier manipulation.
# Reuse data's index so every TF-IDF row stays aligned with its talk even when
# `data` does not carry a default 0..n-1 index (e.g. after filtering rows).
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    index=data.index,
    columns=vectorizer.get_feature_names_out(),
)


In [7]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy.stats import pearsonr

def get_similarities(talk_content, tfidf_matrix):
    """Score `talk_content` against every row of `tfidf_matrix`.

    The text is vectorised with the module-level `vectorizer` exactly as given
    (no extra cleaning is applied here), then compared with each row using
    cosine similarity and the Pearson correlation coefficient.

    Parameters
    ----------
    talk_content : str
        Free-text description of the talk to match.
    tfidf_matrix : scipy sparse matrix, shape (n_talks, n_features)
        TF-IDF rows produced by the same `vectorizer`.

    Returns
    -------
    tuple
        ``(cosine_similarities, pearson_correlations)``: a 1-D array and a list,
        each with one entry per row of `tfidf_matrix`. A Pearson entry is NaN
        when the correlation is undefined, i.e. the row or the query vector is
        constant (for example all zeros).
    """
    import numpy as np  # local import: numpy is not imported at file level

    talk_tfidf = vectorizer.transform([talk_content])
    cosine_similarities = cosine_similarity(talk_tfidf, tfidf_matrix)[0]

    # Pearson r for all rows at once, from sparse row moments, instead of one
    # pearsonr call (and one dense copy of the query) per row.
    query = talk_tfidf.toarray().ravel()
    n_features = query.shape[0]
    query_sum = query.sum()
    query_sq_sum = query @ query

    row_sums = np.asarray(tfidf_matrix.sum(axis=1)).ravel()
    row_sq_sums = np.asarray(tfidf_matrix.multiply(tfidf_matrix).sum(axis=1)).ravel()
    dot_products = np.asarray(tfidf_matrix @ query).ravel()

    # Sums of centred products: sum(xy) - sum(x)sum(y)/n, and likewise for squares.
    covariance = dot_products - row_sums * query_sum / n_features
    row_var = row_sq_sums - row_sums ** 2 / n_features
    query_var = query_sq_sum - query_sum ** 2 / n_features

    # Treat variances that are only rounding noise as zero so constant vectors
    # yield NaN rather than an arbitrary ratio.
    tolerance = np.finfo(float).eps * n_features
    row_var = np.where(row_var > tolerance * row_sq_sums, row_var, 0.0)
    if not query_var > tolerance * query_sq_sum:
        query_var = 0.0

    denominator = np.sqrt(row_var * query_var)
    correlations = np.full(denominator.shape, np.nan)
    np.divide(covariance, denominator, out=correlations, where=denominator > 0)
    # Guard against values a hair outside [-1, 1] from floating-point error.
    correlations = np.clip(correlations, -1.0, 1.0)

    return cosine_similarities, correlations.tolist()

def recommend_talks(talk_content, data, tfidf_matrix, top_n=None):
    """Rank the talks in `data` by similarity to `talk_content`.

    Parameters
    ----------
    talk_content : str
        Free-text description of the talk to match.
    data : pandas.DataFrame
        One row per talk, in the same row order as `tfidf_matrix`. Must contain
        the columns 'title', 'author', 'date', 'views', 'likes' and 'link'.
        It is not modified.
    tfidf_matrix : matrix
        TF-IDF rows for the talks, passed through to `get_similarities`.
    top_n : int, optional
        Return only the `top_n` best matches. The default (None) returns every
        talk, as before.

    Returns
    -------
    pandas.DataFrame
        The talk metadata columns, sorted by cosine similarity (descending)
        with Pearson correlation (descending) as the tie-breaker.
    """
    cosine_similarities, pearson_correlations = get_similarities(talk_content, tfidf_matrix)

    # Attach the scores to a new frame so the caller's `data` does not pick up
    # (or have overwritten) score columns on every call.
    scored = data.assign(
        cosine_similarity=cosine_similarities,
        pearson_correlation=pearson_correlations,
    )

    ranked = scored.sort_values(
        by=['cosine_similarity', 'pearson_correlation'],
        ascending=[False, False],
    )
    if top_n is not None:
        ranked = ranked.head(top_n)

    return ranked[['title', 'author', 'date', 'views', 'likes', 'link']]




In [8]:
# Free-text description of the talk we want recommendations for.
talk_content = (
    'Climate change and impact on the health'
    '. How can we change this world by reducing carbon footprints?'
)

# Rank every talk in `data` against the description and show the result.
recommendations = recommend_talks(
    talk_content=talk_content,
    data=data,
    tfidf_matrix=tfidf_matrix,
)
print(recommendations)




                                                  title               author  \
365   3 strategies for effectively talking about cli...        John Marshall   
3552            The emergent patterns of climate change        Gavin Schmidt   
4724        How cognitive surplus will change the world          Clay Shirky   
1519  An app that predicts the impact of global clim...  Sarvesh Subramanian   
3011           How do carbohydrates impact your health?      Richard J. Wood   
...                                                 ...                  ...   
5378                              In praise of slowness          Carl Honoré   
5388                    Dreams from endangered cultures           Wade Davis   
5413                  The freakonomics of crack dealing        Steven Levitt   
5437                                Greening the ghetto        Majora Carter   
5438                                   Simplicity sells          David Pogue   

               date    views   likes  \