In [3]:
import pandas as pd 
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

In [8]:
# Read 'cleaned_text_pipeline.csv' into a DataFrame, then print a summary
# of its columns (names, non-null counts, dtypes).

df = pd.read_csv(filepath_or_buffer='cleaned_text_pipeline.csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5419 entries, 0 to 5418
Data columns (total 22 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   video_id               5419 non-null   object 
 1   title                  5419 non-null   object 
 2   description            4103 non-null   object 
 3   tags                   5419 non-null   object 
 4   published_at           5419 non-null   object 
 5   channel_id             5419 non-null   object 
 6   channel_title          5419 non-null   object 
 7   category_id            5419 non-null   int64  
 8   duration               5419 non-null   object 
 9   definition             5419 non-null   object 
 10  viewCount              5419 non-null   int64  
 11  likeCount              5419 non-null   int64  
 12  commentCount           5419 non-null   int64  
 13  title_length           5419 non-null   int64  
 14  published_year         5419 non-null   int64  
 15  publ

In [17]:
# TF-IDF vectorization

# Replace missing text with an empty string before vectorizing.
df['text_soup'] = df['text_soup'].fillna('')
tfidf = TfidfVectorizer(stop_words='english')
tfidf_matrix = tfidf.fit_transform(df['text_soup'])

# TfidfVectorizer L2-normalises every row by default, so the plain dot
# product computed by linear_kernel is already the cosine similarity.
cosine_sim = linear_kernel(tfidf_matrix, tfidf_matrix)

# Map each title to its row *position*, which is what cosine_sim[...] and
# .iloc[...] expect (index labels only coincide with positions for a default
# RangeIndex).
indices = pd.Series(range(len(df)), index=df['title'])
# Titles are not guaranteed to be unique. Keep only the first row per title so
# that indices[title] is always a single integer instead of a Series.
indices = indices[~indices.index.duplicated(keep='first')]

def get_recommendations(title):
    """
    Find the top 10 videos most similar to the video with the given title.

    The title is resolved to a row number through the module-level
    ``indices`` lookup, that row of the precomputed ``cosine_sim`` matrix is
    ranked from highest to lowest score, and the matching titles are read
    from ``df``.

    Parameters
    ----------
    title : str
        Title to look up in ``indices`` (exact match).

    Returns
    -------
    pandas.Series or str
        Up to 10 titles, best match first, indexed by their row in ``df``.
        The queried video itself is never included. If the title is not in
        ``indices`` or any other error occurs, an error message string is
        returned instead of raising.
    """

    try:
        idx = indices[title]
        # Several rows can share one title, in which case the lookup yields a
        # Series of row numbers rather than a scalar; use the first of them.
        if isinstance(idx, pd.Series):
            idx = idx.iloc[0]
        idx = int(idx)

        # Exclude the queried row by position instead of assuming it sorts
        # first: an earlier row with identical text (score tie) or an all-zero
        # similarity row would otherwise make the video recommend itself.
        sim_scores = [
            (row, score)
            for row, score in enumerate(cosine_sim[idx])
            if row != idx
        ]
        # sorted() is stable, so equal scores keep ascending row order.
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[:10]

        videos_indices = [i[0] for i in sim_scores]
        return df['title'].iloc[videos_indices]
    except KeyError:
        return f"Error: The title '{title}' was no found in the dataset"
    except Exception as e:
        return f"An error occurred: {e}"

# Signal that the similarity matrix and the lookup function are defined.
print('Rcommendation system is ready')

Rcommendation system is ready


In [18]:
# Smoke-test the recommender with the title stored at row position 5.

test_title = df['title'].iat[5]
print(f"\nRecommendations for: '{test_title}'")

# get_recommendations returns either a Series of titles or an error string;
# both are printed as-is.
recomm = get_recommendations(test_title)
print(recomm)


Recommendations for: 'Top 3 Smartphone gadgets under ₹50||🔥🤯 #shorts #gadgets #viralvideo #ytshorts'
3993    Top 3 best amazing cool gadgets 🔥🤯|| #shorts #...
4039    cool gadgets you can buy #gadgets #shorts #you...
243     amezing japan tech gadgets #shorts #gadgets #j...
3989            top 3 amazing gadgets #shorts #technology
398     Amazing Gadgets of 2025 You Won’t Believe Exis...
315                4 Satisfying Gadgets  #gadgets #shorts
378     top 3 gadgets 🤯 #gadgets #tech #unboxing #gami...
59                     4 School Students Gadgets🤩 #shorts
336                                       3 Cool Gadgets🔥
420                26 Coolest Tech Gadgets 2025 On Amazon
Name: title, dtype: object
