In [1]:
# Install Required Libraries
!pip install pandas numpy matplotlib wordcloud nltk scikit-learn




Defaulting to user installation because normal site-packages is not writeable


In [2]:
#Data Preprocessing
#Load and Inspect the Data
import pandas as pd

# Read the TEDx CSV export into a DataFrame and print its first rows as plain text.
tedx_csv_path = "C:/Users/anush/Downloads/tedx_dataset.csv"
df = pd.read_csv(tedx_csv_path)
print(df.head())


                                idx       main_speaker  \
0  8d2005ec35280deb6a438dc87b225f89     Alexandra Auer   
1  b3072cd11f40eb57fd259555264476c6  Elizabeth Gilbert   
2  4adc9fee977fa04c357ed4c9b52aa3cc       Butterscotch   
3  59c641a72b495d522a7932145d6c02b3         Ethan Lisi   
4  d227f2faf6ec185e54436d86463f499b      Daniel Finkel   

                                               title  \
0                    The intangible effects of walls   
1  It's OK to feel overwhelmed. Here's what to do...   
2                                  "Accept Who I Am"   
3               What it's really like to have autism   
4              Can you solve the sea monster riddle?   

                                             details           posted  \
0  More barriers exist now than at the end of Wor...  Posted Apr 2020   
1  If you're feeling anxious or fearful during th...  Posted Apr 2020   
2  Firing off her formidable beatboxing skills, m...  Posted Apr 2020   
3  "Autism is not a di

In [8]:
# Handle Missing Values
# Report how many entries are missing in each column.
print(df.isnull().sum())
# Drop only the rows that lack the text fields the recommender is built from.
# A blanket dropna() discards every row with a missing value in ANY column, and
# num_views alone is missing for thousands of rows, so it would throw away most talks.
# reset_index keeps row labels equal to row positions after the filtering.
df = df.dropna(subset=['title', 'details']).reset_index(drop=True)


idx                0
main_speaker       1
title              0
details            0
posted             0
url                0
num_views       4258
dtype: int64


In [10]:
#Extract Year and Month
##Extract the year and month from the posted column
import pandas as pd

# Reload the dataset, then derive a parsed posting date plus its year and month.
# NOTE(review): re-reading the CSV here replaces `df`, so any row filtering done
# on the previous `df` in earlier cells does not carry over.
df = (
    pd.read_csv("C:/Users/anush/Downloads/tedx_dataset.csv")
    .assign(
        # Remove the literal "Posted " text and parse what is left as "Mon YYYY";
        # values that do not match the format become NaT (errors='coerce').
        posted_clean=lambda frame: pd.to_datetime(
            frame['posted'].str.replace("Posted ", "", regex=False),
            format="%b %Y",
            errors='coerce',
        ),
        # Later keyword arguments can read the columns created by earlier ones.
        year=lambda frame: frame['posted_clean'].dt.year,
        month=lambda frame: frame['posted_clean'].dt.month,
    )
)



In [11]:
# Combine Title and Details
# Build one text field per talk from its title and its description.
# Missing values are treated as empty strings: concatenating a string with NaN
# yields NaN, and a NaN entry is not a str that the text-cleaning step can lowercase.
df['combined'] = df['title'].fillna('') + ' ' + df['details'].fillna('')


In [12]:
# Text Processing and Feature Extraction
# Text Cleaning
# Use NLTK to clean the text data: lowercase it, strip punctuation and remove English stop words.
import nltk
import string
from nltk.corpus import stopwords

# Download NLTK's stop-word corpus, then keep the English list as a set
# so membership checks during token filtering are cheap.
nltk.download('stopwords')
stop_words = {word for word in stopwords.words('english')}

# Translation table that deletes every ASCII punctuation character; built once
# here instead of on every call to preprocess_text.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)


def preprocess_text(text, stop_word_set=None):
    """Lowercase ``text``, strip ASCII punctuation and remove stop words.

    Args:
        text: Raw text. Anything that is not a ``str`` (for example ``None``
            or the float NaN pandas uses for a missing value) is treated as
            empty text instead of raising ``AttributeError``.
        stop_word_set: Optional collection of words to drop. When omitted,
            the notebook-level ``stop_words`` set is used.

    Returns:
        The remaining whitespace-separated tokens joined by single spaces.
    """
    if not isinstance(text, str):
        return ''
    if stop_word_set is None:
        # Fall back to the module-level set built from the NLTK corpus.
        stop_word_set = stop_words
    cleaned = text.lower().translate(_PUNCTUATION_TABLE)
    return ' '.join(word for word in cleaned.split() if word not in stop_word_set)

# Clean every combined title/description string, one element at a time.
df['processed'] = df['combined'].map(preprocess_text)


In [13]:
#TF-IDF Vectorization
##Convert the text data into numerical features using TF-IDF
from sklearn.feature_extraction.text import TfidfVectorizer

# Fit the TF-IDF vectorizer on the cleaned text and transform it in the same call.
vectorizer = TfidfVectorizer()
processed_docs = df['processed']
tfidf_matrix = vectorizer.fit_transform(processed_docs)


In [14]:
#Build the Recommendation System (1 hour)
##Compute Cosine Similarity
##Calculate the cosine similarity between the TF-IDF vectors:
from sklearn.metrics.pairwise import cosine_similarity

# Pairwise similarity of every talk with every other talk; with a single
# argument the matrix is compared against itself.
cosine_sim = cosine_similarity(tfidf_matrix)


In [15]:
# Define Recommendation Function
# Create a function that returns the talks most similar to a given TED Talk title.

def get_recommendations(title, cosine_sim=cosine_sim, top_n=5):
    """Return the titles of the talks most similar to the talk named ``title``.

    Args:
        title: Exact title of a talk present in ``df['title']``.
        cosine_sim: Square similarity matrix whose row/column ``i`` corresponds
            to the ``i``-th row of ``df``.
        top_n: How many recommendations to return (default 5, as before).

    Returns:
        A pandas Series of up to ``top_n`` titles, most similar first.

    Raises:
        KeyError: If ``title`` does not occur in ``df['title']``.
    """
    # Map each title to its row *position*: the similarity matrix is positional,
    # so index labels are only safe to use while they happen to equal positions.
    positions = pd.Series(range(len(df)), index=df['title'])
    # Keep the first occurrence of a repeated title. Series.drop_duplicates()
    # would de-duplicate the values (which are already unique), not the titles,
    # and a repeated title would then look up several rows at once.
    positions = positions[~positions.index.duplicated(keep='first')]
    if title not in positions.index:
        raise KeyError(f"No talk titled {title!r} in the dataset")
    idx = int(positions.loc[title])

    # Exclude the query talk explicitly rather than assuming it sorts first:
    # another talk with identical text would tie with it at the top.
    sim_scores = [
        (position, score)
        for position, score in enumerate(cosine_sim[idx])
        if position != idx
    ]
    sim_scores.sort(key=lambda pair: pair[1], reverse=True)

    top_n = max(int(top_n), 0)  # a negative count would slice from the wrong end
    talk_indices = [position for position, _ in sim_scores[:top_n]]
    return df['title'].iloc[talk_indices]


In [17]:
# Show the first ten titles so a valid one can be picked for the test below.
print(df['title'].iloc[:10])


0                      The intangible effects of walls
1    It's OK to feel overwhelmed. Here's what to do...
2                                    "Accept Who I Am"
3                 What it's really like to have autism
4                Can you solve the sea monster riddle?
5                 Why sleep matters now more than ever
6       How to make pandemics optional, not inevitable
7    How you can help save the monarch butterfly — ...
8                            History vs. Sigmund Freud
9    How we can navigate the coronavirus pandemic w...
Name: title, dtype: object


In [18]:
#Test the Recommendation System
##Test the recommendation function with a sample title
# Look up the talks most similar to one known title and print them.
print(get_recommendations(title="What it's really like to have autism"))


2230                      The forgotten history of autism
3205                         A new way to diagnose autism
2590    Autism — what we know (and what we don't know ...
731                              The ruralities of autism
1608                To understand autism, don't look away
Name: title, dtype: object
