In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import warnings
warnings.filterwarnings('ignore')
!pip install gensim

In [None]:
df= pd.read_csv('videos-stats.csv')
df.head()

Unnamed: 0.1,Unnamed: 0,Title,Video ID,Published At,Keyword,Likes,Comments,Views
0,0,Apple Pay Is Killing the Physical Wallet After...,wAZZ-UWGVHI,2022-08-23,tech,3407.0,672.0,135612.0
1,1,The most EXPENSIVE thing I own.,b3x28s61q3c,2022-08-24,tech,76779.0,4306.0,1758063.0
2,2,My New House Gaming Setup is SICK!,4mgePWWCAmA,2022-08-23,tech,63825.0,3338.0,1564007.0
3,3,Petrol Vs Liquid Nitrogen | Freezing Experimen...,kXiYSI7H2b0,2022-08-23,tech,71566.0,1426.0,922918.0
4,4,Best Back to School Tech 2022!,ErMwWXQxHp0,2022-08-08,tech,96513.0,5155.0,1855644.0


## 3. Exploratory Data Analysis (EDA)

Let's perform some basic EDA to understand the dataset.

In [None]:
# Display the column names and their data types
print(df.info())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1881 entries, 0 to 1880
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Unnamed: 0    1881 non-null   int64  
 1   Title         1881 non-null   object 
 2   Video ID      1881 non-null   object 
 3   Published At  1881 non-null   object 
 4   Keyword       1881 non-null   object 
 5   Likes         1879 non-null   float64
 6   Comments      1879 non-null   float64
 7   Views         1879 non-null   float64
dtypes: float64(3), int64(1), object(4)
memory usage: 117.7+ KB
None


In [None]:
# Display descriptive statistics for numerical columns
display(df.describe())

Unnamed: 0.1,Unnamed: 0,Likes,Comments,Views
count,1881.0,1879.0,1879.0,1879.0
mean,940.0,170061.0,7863.331559,11612920.0
std,543.142247,796229.3,37879.964926,108445000.0
min,0.0,-1.0,-1.0,25.0
25%,470.0,2672.5,199.0,84515.0
50%,940.0,14787.0,814.0,591721.0
75%,1410.0,60906.0,3377.5,2804978.0
max,1880.0,16445560.0,732818.0,4034122000.0


In [None]:
# Check for missing values
print(df.isnull().sum())

Unnamed: 0      0
Title           0
Video ID        0
Published At    0
Keyword         0
Likes           2
Comments        2
Views           2
dtype: int64


In [None]:
# Explore the distribution of the 'Keyword' column
display(df['Keyword'].value_counts())

Unnamed: 0_level_0,count
Keyword,Unnamed: 1_level_1
crypto,50
tutorial,50
mrbeast,50
reaction,50
asmr,50
trolling,50
history,50
interview,50
cubes,50
marvel,50


In [None]:
# Continue with EDA and data cleaning

# Handle missing values: Fill missing numerical values with 0 (or another appropriate strategy)
df.fillna({'Likes': 0, 'Comments': 0, 'Views': 0}, inplace=True)

# Convert numerical columns to integer type after handling NaNs
df['Likes'] = df['Likes'].astype(int)
df['Comments'] = df['Comments'].astype(int)
df['Views'] = df['Views'].astype(int)


# Address potential negative values in Likes, Comments, and Views by replacing them with 0
df['Likes'] = df['Likes'].apply(lambda x: max(x, 0))
df['Comments'] = df['Comments'].apply(lambda x: max(x, 0))
df['Views'] = df['Views'].apply(lambda x: max(x, 0))


# Check for missing values again after handling
print("\nMissing values after handling:")
print(df.isnull().sum())

# Display descriptive statistics again to see the effect of cleaning
display(df.describe())


Missing values after handling:
Unnamed: 0      0
Title           0
Video ID        0
Published At    0
Keyword         0
Likes           0
Comments        0
Views           0
dtype: int64


Unnamed: 0.1,Unnamed: 0,Likes,Comments,Views
count,1881.0,1881.0,1881.0,1881.0
mean,940.0,169880.2,7854.971823,11600570.0
std,543.142247,795824.9,37860.678206,108388000.0
min,0.0,0.0,0.0,0.0
25%,470.0,2667.0,199.0,83902.0
50%,940.0,14741.0,814.0,589907.0
75%,1410.0,60821.0,3375.0,2797148.0
max,1880.0,16445560.0,732818.0,4034122000.0


In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
import warnings
import gensim.downloader as api
import gensim

warnings.filterwarnings('ignore')

class YouTubeRecommender:
    def __init__(self, df):
        """
        Initialize the YouTube Recommendation System

        Parameters:
        df: DataFrame with columns - Title, Video ID, Keyword, Likes, Comments, Views
        """
        self.df = df.copy()
        self.word2vec_model = None # Initialize Word2Vec model as None
        self.load_word2vec_model() # Load model upon initialization
        self.prepare_features()
        self.build_content_similarity() # Build similarity matrix upon initialization

    def load_word2vec_model(self):
        """Load a pretrained Word2Vec model"""
        try:
            print("Downloading and loading pretrained Word2Vec model...")
            # Using a smaller model for demonstration, you can choose a larger one like 'word2vec-google-news-300'
            self.word2vec_model = api.load("glove-wiki-gigaword-50")
            print("Word2Vec model loaded successfully!")
        except Exception as e:
            print(f"Error loading Word2Vec model: {e}")
            self.word2vec_model = None

    def get_word_embedding(self, text):
        """Get the average embedding for a piece of text"""
        if self.word2vec_model is None:
            return None

        words = text.lower().split()
        embeddings = [self.word2vec_model[word] for word in words if word in self.word2vec_model]

        if not embeddings:
            return np.zeros(self.word2vec_model.vector_size) # Return zero vector if no words found

        return np.mean(embeddings, axis=0)


    def prepare_features(self):
        """Prepare and engineer features for recommendation"""

        # 1. Handle missing values
        # Ensure data types are suitable before filling NaNs, though previous steps handled this
        # Keeping these lines for robustness within the class
        for col in ['Likes', 'Comments', 'Views']:
            if col in self.df.columns:
                self.df[col] = pd.to_numeric(self.df[col], errors='coerce').fillna(0).astype(int)


        # 2. Create engagement score
        self.df['engagement_rate'] = (
            (self.df['Likes'] + self.df['Comments']) /
            (self.df['Views'] + 1)  # Add 1 to avoid division by zero
        ) * 100

        # 3. Create popularity score (normalized)
        scaler = MinMaxScaler()
        # Ensure columns exist and handle potential NaNs from conversion if needed
        cols_for_scaling = [col for col in ['Views', 'Likes', 'Comments'] if col in self.df.columns]
        if cols_for_scaling:
            self.df['popularity_score'] = scaler.fit_transform(
                self.df[cols_for_scaling]
            ).mean(axis=1)
        else:
             self.df['popularity_score'] = 0 # Default if columns are missing


        # 4. Combine Title and Keyword for better content representation
        self.df['content'] = (
            self.df['Title'].fillna('') + ' ' +
            self.df['Keyword'].fillna('')
        )

        print("Feature engineering completed!")
        print(f"Dataset shape: {self.df.shape}")

    def build_content_similarity(self):
        """Build Word2Vec embeddings and compute similarity"""
        if self.word2vec_model is None:
            print("Word2Vec model not loaded. Cannot build similarity matrix.")
            self.content_similarity = None
            self.content_embeddings = None
            return

        if 'content' not in self.df.columns or self.df['content'].isnull().all():
            print("Content column is missing or empty. Cannot build similarity matrix.")
            self.content_similarity = None
            self.content_embeddings = None
            return

        print("Generating video content embeddings using Word2Vec...")
        self.content_embeddings = np.array([
            self.get_word_embedding(text) for text in self.df['content'].fillna('')
        ])

        # Handle cases where get_word_embedding returned None or zero vectors
        if self.content_embeddings is None or len(self.content_embeddings) == 0:
            print("Could not generate content embeddings.")
            self.content_similarity = None
            return


        # Compute cosine similarity between embeddings
        print("Computing cosine similarity between embeddings...")
        self.content_similarity = cosine_similarity(self.content_embeddings, self.content_embeddings)

        print(f"Similarity matrix shape: {self.content_similarity.shape}")


    def find_videos_by_keyword(self, keyword, n_videos=5):
        """Find videos matching a specific keyword"""
        keyword_videos = self.df[
            self.df['Keyword'].str.contains(keyword, case=False, na=False)
        ].copy()

        if keyword_videos.empty:
            return "No videos found for this keyword!"

        # Sort by combined score to get relevant popular videos
        keyword_videos['combined_score'] = (
            keyword_videos['popularity_score'] * 0.6 +
            keyword_videos['engagement_rate'] * 0.4
        )
        top_keyword_videos = keyword_videos.nlargest(n_videos, 'combined_score')

        return top_keyword_videos


    def get_content_recommendations(self, video_title, n_recommendations=10):
        """
        Get content-based recommendations

        Parameters:
        video_title: Title of the video to get recommendations for
        n_recommendations: Number of recommendations to return
        """
        if self.content_similarity is None:
             return "Similarity matrix not built. Cannot get content recommendations."

        # Find video index
        idx = self.df[self.df['Title'] == video_title].index

        if len(idx) == 0:
            return "Video not found!"

        idx = idx[0]

        # Get similarity scores
        sim_scores = list(enumerate(self.content_similarity[idx]))

        # Sort by similarity (excluding the video itself)
        sim_scores = sorted(sim_scores, key=lambda x: x[1], reverse=True)[1:n_recommendations+1]

        # Get video indices
        video_indices = [i[0] for i in sim_scores]

        # Return recommendations with scores
        recommendations = self.df.iloc[video_indices][
            ['Title', 'Keyword', 'Views', 'Likes', 'engagement_rate', 'Video ID']
        ].copy()

        recommendations['similarity_score'] = [i[1] for i in sim_scores]

        return recommendations

    def get_category_recommendations(self, category, n_recommendations=10):
        """Get top videos by category based on popularity"""

        category_videos = self.df[self.df['Keyword'] == category].copy()

        if category_videos.empty:
             return "No videos found for this category!"

        # Sort by popularity and engagement
        category_videos['combined_score'] = (
            category_videos['popularity_score'] * 0.6 +
            category_videos['engagement_rate'] * 0.4
        )

        top_videos = category_videos.nlargest(n_recommendations, 'combined_score')

        return top_videos[['Title', 'Keyword', 'Views', 'Likes', 'Comments', 'combined_score']]


    def get_hybrid_recommendations(self, video_title, n_recommendations=10):
        """
        Hybrid recommendations combining content similarity and popularity
        """
        if self.content_similarity is None:
             return "Similarity matrix not built. Cannot get hybrid recommendations."

        # Get content-based recommendations (get more to choose from)
        content_recs = self.get_content_recommendations(video_title, n_recommendations * 2)

        if isinstance(content_recs, str):
            return content_recs # Propagate "Video not found!" or similarity matrix error

        # Add popularity boost
        content_recs['hybrid_score'] = (
            content_recs['similarity_score'] * 0.7 +
            content_recs['engagement_rate'] * 0.3
        )

        # Sort by hybrid score
        hybrid_recs = content_recs.nlargest(n_recommendations, 'hybrid_score')

        # Select only Title and Views columns for the final output
        return hybrid_recs[['Title', 'Views']]

    def get_recommendations_by_keyword(self, keyword, n_recommendations=10):
        """
        Get hybrid recommendations based on a keyword search using Word2Vec similarity.
        """
        print(f"Searching for videos related to keyword: '{keyword}' using Word2Vec...")

        if self.word2vec_model is None or self.content_embeddings is None:
            return "Word2Vec model or content embeddings not available."

        try:
            # Get the embedding for the search keyword
            keyword_embedding = self.get_word_embedding(keyword)

            if keyword_embedding is None or np.all(keyword_embedding == 0):
                 return "Could not get embedding for the keyword."

            # Compute similarity between the keyword embedding and all video embeddings
            keyword_sim_scores = cosine_similarity([keyword_embedding], self.content_embeddings).flatten()

            # Get the index of the video with the highest similarity to the keyword
            most_similar_video_index = keyword_sim_scores.argsort()[-1]

            # Get the title of the most similar video
            source_video_title = self.df.iloc[most_similar_video_index]['Title']
            print(f"Found most semantically similar video title: '{source_video_title}'")

            # Get hybrid recommendations based on this source video title
            hybrid_recs = self.get_hybrid_recommendations(
                source_video_title,
                n_recommendations
            )

            return hybrid_recs

        except Exception as e:
            print(f"Error during Word2Vec similarity search: {e}")
            return "Could not find videos related to this keyword using Word2Vec similarity."


    def get_trending_videos(self, n_recommendations=10):
        """Get trending videos based on engagement and views"""

        trending = self.df.copy()

        # Calculate trending score
        trending['trending_score'] = (
            trending['Views'] * 0.4 +
            trending['Likes'] * 0.3 +
            trending['Comments'] * 0.3
        )

        top_trending = trending.nlargest(n_recommendations, 'trending_score')

        return top_trending[['Title', 'Keyword', 'Views', 'Likes', 'Comments', 'trending_score']]

In [None]:
recommender = YouTubeRecommender(df)

Downloading and loading pretrained Word2Vec model...
Word2Vec model loaded successfully!
Feature engineering completed!
Dataset shape: (1881, 11)
Generating video content embeddings using Word2Vec...
Computing cosine similarity between embeddings...
Similarity matrix shape: (1881, 1881)


In [None]:
recommender.build_content_similarity()

Generating video content embeddings using Word2Vec...
Computing cosine similarity between embeddings...
Similarity matrix shape: (1881, 1881)


In [None]:
recommendations = recommender.get_hybrid_recommendations(
     "iPhone 14 Lineup is NOT what you Expected.. (plot twist)",
     n_recommendations=5
 )
print(recommendations)

                                                  Title   Views
1580  No One Has The Balls To Make This Nintendo Vid...   87422
1790  The stuff no one tells you about game developm...    1352
1572  Nintendo Switch Online Is About To Get VERY In...   56101
1594  If I Can Fix This Xbox Series X I&#39;ll GIVE ...  114921
531   iPhone 14 - what can we expect?! Rumors and more!  184475


In [None]:
keyword = "music"
recommendations_by_keyword = recommender.get_recommendations_by_keyword(
    keyword,
    n_recommendations=5
)
print(recommendations_by_keyword)

Searching for videos related to keyword: 'music' using Word2Vec...
Found most semantically similar video title: 'Ë®ÇÈñ±Á†¥10Ëê¨ÂõûÈ•ãÔºÅ2Â∞èÊôÇ 32È¶ñ „ÄäMusic Panda„Äã Á¥îÊ≠åÊõ≤Â§ßÁâπÈõÜÔºÅ'
                                                 Title     Views
923  Music Mix 2022 üéß Remixes of Popular Songs üéß ED...      9958
891  Lofi For Reading üìö Lofi Hip Hop | Study Music ...     14336
893  Best Music Mix ‚ô´ No Copyright Gaming Music ‚ô´ M...  13966109
926  Relaxing Whiskey Blues Music | Beautiful Relax...     10105
735                           ‰∏äÁî∞È∫óÂ•à„Äå„É™„ÉÜ„É©„ÉÅ„É•„Ç¢„Äç MUSIC VIDEO   2792013


Let's break down the implementation in this notebook according to the case study categories:

### 1. Company Selection and Business Context

*   **Implementation**: While not explicitly defined in a dedicated cell, the context is implied by the use of a YouTube video dataset. The business problem is framed around building a YouTube recommendation system to help users discover videos they might be interested in, thereby increasing engagement on the platform.
*   **Notebook Location**: This context is established by the overall goal of the notebook and the type of data used.

### 2. Data Collection and Description

*   **Implementation**: The data collection step is represented by the loading of the `videos-stats.csv` dataset. The initial EDA steps also contribute to the data description by providing information about the columns, data types, and basic statistics.
*   **Notebook Location**:
    *   Loading the dataset: Cell `JMgm1f-pomJD`
    *   Initial data description (`df.info()`, `df.describe()`, `df.isnull().sum()`, `df['Keyword'].value_counts()`): Cells `975d6be7`, `6efbc629`, `a836160a`, `ef278413`

### 3. Exploratory Data Analysis (EDA)

*   **Implementation**: Several code cells were used to perform basic EDA. This included checking data types, looking at descriptive statistics, identifying missing values, and exploring the distribution of keywords. Further data cleaning to handle missing and negative values was also performed as part of the EDA process.
*   **Notebook Location**: Cells `975d6be7`, `6efbc629`, `a836160a`, `ef278413`, and the cleaning steps in `z34dIiu1qoTW`.

### 4. Problem Formulation

*   **Implementation**: This step is explained in a markdown cell that formulates the recommendation problem based on the available data, suggesting an approach to predict user interest based on interactions or content. The chosen approach is a hybrid recommendation system.
*   **Notebook Location**: Markdown cell `0324d94f`

### 5. Model Design and Justification

*   **Implementation**: The core of this step is the design and implementation of the `YouTubeRecommender` class. This class encapsulates the logic for preparing features, building content similarity (initially with TF-IDF, then updated to Word2Vec), and providing different types of recommendations (content-based, category-based, hybrid, trending, and keyword-based). The shift from TF-IDF to Word2Vec was a specific design choice to improve semantic understanding for keyword searches.
*   **Notebook Location**: Code cell `-j17N5BewssT` (containing the `YouTubeRecommender` class definition).

### 6. Experiments and Evaluation

*   **Implementation**: An `RecommenderEvaluator` class was defined to calculate various metrics for the recommendation system. Code was generated to instantiate this evaluator and run evaluations for a single video and for the system overall on a sample of videos.
*   **Notebook Location**:
    *   `RecommenderEvaluator` class definition: Cell `6YVsze_KvKWQ`
    *   Evaluation execution: Cell `rYIUT6iPveiB`

### 7. Limitations and Future Work

*   **Implementation**: While not a dedicated section yet, limitations have been discussed in the context of the keyword search, specifically the issue of unexpected similarity results with TF-IDF and then Word2Vec. This implicitly points to areas for future work, such as using more advanced NLP models or different content representation techniques.
*   **Notebook Location**: Discussed in responses to user queries about the keyword search results and the similarity between 'green energy' and 'mukbang'.

### 8. Final Recommendations

*   **Implementation**: This step has not been explicitly addressed yet. It would involve summarizing the findings from the evaluation and providing recommendations based on the system's performance and the case study objectives.
*   **Notebook Location**: Not yet implemented.

In summary, the notebook has covered data loading, cleaning, EDA, problem formulation, model design (including feature engineering and similarity calculation), and the setup for evaluation. The next steps would involve analyzing the evaluation results and formulating final recommendations based on those findings.