In [1]:
pip install google-api-python-client pandas scikit-learn

Collecting google-api-python-clientNote: you may need to restart the kernel to use updated packages.

  Obtaining dependency information for google-api-python-client from https://files.pythonhosted.org/packages/1c/8a/12a389757b025dda3d6df29c4143dd896a3cd837f00587e162ad4baf3d31/google_api_python_client-2.139.0-py2.py3-none-any.whl.metadata
  Downloading google_api_python_client-2.139.0-py2.py3-none-any.whl.metadata (6.7 kB)
Collecting httplib2<1.dev0,>=0.19.0 (from google-api-python-client)
  Obtaining dependency information for httplib2<1.dev0,>=0.19.0 from https://files.pythonhosted.org/packages/a8/6c/d2fbdaaa5959339d53ba38e94c123e4e84b8fbc4b84beb0e70d7c1608486/httplib2-0.22.0-py3-none-any.whl.metadata
  Downloading httplib2-0.22.0-py3-none-any.whl.metadata (2.6 kB)
Collecting google-auth!=2.24.0,!=2.25.0,<3.0.0.dev0,>=1.32.0 (from google-api-python-client)
  Obtaining dependency information for google-auth!=2.24.0,!=2.25.0,<3.0.0.dev0,>=1.32.0 from https://files.pythonhosted.org/pack

# Data Collection

In [3]:
# notebooks/test_youtube_api.py
from app.youtube_api import YouTubeAPI

# Initialize the API object
api = YouTubeAPI()

# Example video IDs
video_ids = ['Ks-_Mh1QhMc', 'dQw4w9WgXcQ']

# Fetch video details
video_df = api.get_video_details(video_ids)

# Print the results
print(video_df.head())


      video_id                                              title  \
0  Ks-_Mh1QhMc  Your body language may shape who you are | Amy...   
1  dQw4w9WgXcQ  Rick Astley - Never Gonna Give You Up (Officia...   

                                         description  \
0  Body language affects how others see us, but i...   
1  The official video for “Never Gonna Give You U...   

                                                tags category_id  view_count  \
0  [Amy Cuddy, TED, TEDTalk, TEDTalks, TED Talk, ...          22    25232908   
1  [rick astley, Never Gonna Give You Up, nggyu, ...          10  1560490912   

   like_count  comment_count  
0      445821           9861  
1    17768799        2351103  


# Model Training

In [15]:
# content-recommendation-system.ipynb

# Import necessary modules
import pandas as pd
import numpy as np
from app.youtube_api import YouTubeAPI
from app.recommendation import preprocess_data, train_collaborative_filtering, get_content_recommendations
from googleapiclient.errors import HttpError
import os
from dotenv import load_dotenv
from sklearn.metrics.pairwise import cosine_similarity

# Load environment variables from .env file
load_dotenv()

# Initialize YouTube API
youtube_api = YouTubeAPI()  # No need to pass API key as it's read from the environment variable

# Function to fetch video IDs based on a search query or other criteria
def fetch_video_ids(query, max_results=10):
    try:
        search_response = youtube_api.youtube.search().list(
            q=query,
            part='id',
            maxResults=max_results,
            type='video'
        ).execute()

        video_ids = [item['id']['videoId'] for item in search_response.get('items', [])]
        return video_ids
    except HttpError as e:
        print(f"An error occurred: {e}")
        return []

# Get search query from user input
search_query = input("Enter search query: ")

if not search_query:
    raise ValueError("Search query cannot be empty.")

# Fetch video IDs based on user input
video_ids = fetch_video_ids(search_query, max_results=10)

if not video_ids:
    raise ValueError("No video IDs found for the search query.")

# Get video details
video_df = youtube_api.get_video_details(video_ids)

# Preprocess data
video_df, tfidf_matrix = preprocess_data(video_df)

# Function to dynamically generate user-item interaction data
def generate_user_item_data(user_ids, video_ids):
    user_item_data = {
        'user_id': [],
        'video_id': [],
        'interaction': []
    }

    for user_id in user_ids:
        for video_id in video_ids:
            # Simulate interaction data (e.g., based on some logic or actual data)
            interaction = np.random.choice([0, 1], p=[0.5, 0.5])  # Randomly assign interaction
            user_item_data['user_id'].append(user_id)
            user_item_data['video_id'].append(video_id)
            user_item_data['interaction'].append(interaction)
    
    return pd.DataFrame(user_item_data)

# Sample user IDs (should be fetched dynamically from your actual user data source)
user_ids = ['user1', 'user2', 'user3']

# Generate user-item interaction data dynamically
user_item_df = generate_user_item_data(user_ids, video_ids)

# Train collaborative filtering model
user_factors, item_factors = train_collaborative_filtering(user_item_df)

# Calculate content similarity
content_similarity = cosine_similarity(tfidf_matrix, tfidf_matrix)

# Save preprocessed data and models
video_df.to_csv('../data/video_metadata.csv', index=False)
user_item_df.to_csv('../data/user_watch_history.csv', index=False)
np.save('../data/user_factors.npy', user_factors)
np.save('../data/item_factors.npy', item_factors)
np.save('../data/content_similarity.npy', content_similarity)


Enter search query: machine learning tutorial


ValueError: n_components(50) must be <= n_features(10).

In [10]:
import pandas as pd

# Sample user-item interaction data
data = {
    'user_id': ['user1', 'user1', 'user2', 'user2', 'user3'],
    'video_id': ['video1', 'video2', 'video1', 'video3', 'video4'],
    'interaction': [1, 1, 1, 1, 1]  # Binary interactions
}

user_item_df = pd.DataFrame(data)


In [11]:
import pandas as pd
from sklearn.decomposition import TruncatedSVD
from scipy.sparse import csr_matrix


In [12]:
def train_collaborative_filtering(user_item_df):
    interaction_matrix = user_item_df.pivot(index='user_id', columns='video_id', values='interaction').fillna(0)
    print("Interaction matrix shape:", interaction_matrix.shape)
    interaction_sparse_matrix = csr_matrix(interaction_matrix)
    
    n_components = min(50, interaction_sparse_matrix.shape[1])
    print(f"Using n_components: {n_components}")  # Debugging output
    svd = TruncatedSVD(n_components=n_components, random_state=42)
    user_factors = svd.fit_transform(interaction_sparse_matrix)
    item_factors = svd.components_.T
    
    return user_factors, item_factors

# Run the function with sample data
user_factors, item_factors = train_collaborative_filtering(user_item_df)


print("User factors shape:", user_factors.shape)
print("Item factors shape:", item_factors.shape)


Interaction matrix shape: (3, 4)
Using n_components: 4
User factors shape: (3, 3)
Item factors shape: (4, 3)
