# Video Recommendation Algorithm 

### Importing Required Libraries

In [None]:
# Documentation added to cell 2
# Enhanced Documentation
import numpy as np
import pandas as pd
import requests
from IPython.display import JSON
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity

## 1.) Data Preprocessing

### 1.1) Extracting data from the API

In [None]:
# API configuration shared by every request in this notebook.
import os

base_url = "https://api.socialverseapp.com"
# NOTE(security): an access token is committed in this file. Prefer supplying
# it through the FLIC_TOKEN environment variable; the literal below is only a
# fallback so the notebook keeps running, and should be rotated and removed.
headers = {
    "Flic-Token": os.environ.get(
        "FLIC_TOKEN",
        "flic_1e01009f9c1a54706f385bcc1993a08fd9647ba8f499572d280654d1c03c47bf",
    )
}

#### All Viewed Posts of Users

In [None]:
# Request page 1 of the viewed-posts endpoint with page_size=1000.
viewed_url = f"{base_url}/posts/view?page=1&page_size=1000&resonance_algorithm=resonance_algorithm_cjsvervb7dbhss8bdrj89s44jfjdbsjd0xnjkbvuire8zcjwerui3njfbvsujc5if"
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of parsing an error body.
response = requests.get(viewed_url, headers=headers, timeout=30)
response.raise_for_status()
json_data = response.json()

In [None]:
# Flatten every record under the response's "posts" key into one table row.
def _viewed_row(entry):
    """Build the viewed_df row (a flat dict) for a single post record."""
    category = entry["category"]
    row = {
        "id": entry["id"],
        "category_id": category["id"],
        "category": category["name"],
        # First and last name are concatenated without a separator.
        "User": entry["first_name"]+entry["last_name"],
        "username": entry["username"],
    }
    # The remaining columns are copied from the record under the same key.
    for field in (
        "title",
        "identifier",
        "comment_count",
        "upvote_count",
        "view_count",
        "exit_count",
        "rating_count",
        "average_rating",
        "share_count",
        "upvoted",
        "bookmarked",
        "following",
    ):
        row[field] = entry[field]
    return row


data = []
for entry in json_data.get("posts", []):
    data.append(_viewed_row(entry))

viewed_df = pd.DataFrame(data)

In [None]:
# Documentation added to cell 9
# Enhanced Documentation
viewed_df.head()

In [None]:
# Documentation added to cell 10
# Enhanced Documentation
viewed_df.shape

#### Extract All Posts

In [None]:
# Request page 1 of the post-summary endpoint with page_size=1000.
post_url = f"{base_url}/posts/summary/get?page=1&page_size=1000"
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of parsing an error body.
response = requests.get(post_url, headers=headers, timeout=30)
response.raise_for_status()
json_data = response.json()

In [None]:
# Documentation added to cell 13
# Enhanced Documentation
json_data.keys()

In [None]:
# Documentation added to cell 14
# Enhanced Documentation
json_data["posts"]

In [None]:
# Flatten every record under the response's "posts" key into one post_df row.
def _post_row(post):
    """Build the post_df row (a flat dict) for a single post record."""
    row = {'id': post['id'], 'title': post['title']}
    row['category'] = post['category']['name']
    for field in ('username', 'view_count', 'upvote_count', 'comment_count'):
        row[field] = post[field]
    # The column name uses a hyphen, unlike the record's 'rating_count' key.
    row['rating-count'] = post['rating_count']
    row['average_rating'] = post['average_rating']
    row['post_summary'] = post['post_summary']
    return row


posts = json_data["posts"]
data = []

for post in posts:
    data.append(_post_row(post))

post_df = pd.DataFrame(data)
post_df.head(2)

In [None]:
# Documentation added to cell 16
# Enhanced Documentation
post_df.shape

In [None]:
# Helpers that flatten a post's nested "post_summary" into one text blob.
def mergeList(L):
    """Flatten a value into a comma-separated string.

    Lists and tuples are joined with ", "; a dict contributes its values;
    nested containers are flattened recursively. ``None`` yields an empty
    string and any other value is returned as ``str(value)``.
    """
    if L is None:
        return ""
    if isinstance(L, dict):
        L = list(L.values())
    if isinstance(L, (list, tuple)):
        return ", ".join(
            mergeList(item) if isinstance(item, (list, tuple, dict)) else str(item)
            for item in L
        )
    return str(L)


def _summary_field_to_text(value):
    """Turn one summary field (list or dict) into text; other types give ""."""
    if isinstance(value, list):
        # String items are used as-is; anything else is flattened.
        return " ".join(
            item if isinstance(item, str) else mergeList(item) for item in value
        )
    if isinstance(value, dict):
        if not value:
            return ""
        # Only the entry under the first key is used for dict-shaped fields.
        return mergeList(value[next(iter(value))])
    return ""


def convert(text):
    """Collapse a post-summary dict into a single-element list of text.

    The text is built, in order, from the "actions", "description",
    "emotions", "genre", "targeted_audiance" and
    "psycological_view_of_video" entries. Non-empty parts are joined with a
    single space so that words from neighbouring fields are not fused.
    Missing keys are treated as empty.

    Parameters:
    - text: the summary dictionary of one post.

    Returns:
    - A list holding one string, or an empty list (after printing an error)
      when ``text`` is not a dictionary.
    """
    if not isinstance(text, dict):  # Check if text is a dictionary
        print("Error: Expected dictionary, got:", type(text))
        return []

    description = text.get("description")
    if not isinstance(description, str):
        description = mergeList(description)

    genre = text.get("genre", "")
    if not isinstance(genre, str):
        genre = ""

    parts = [
        _summary_field_to_text(text.get("actions")),
        description,
        # Each field is read from its own key ("emotions" here).
        _summary_field_to_text(text.get("emotions")),
        genre,
        _summary_field_to_text(text.get("targeted_audiance")),
        _summary_field_to_text(text.get("psycological_view_of_video")),
    ]
    return [" ".join(part for part in parts if part)]

In [None]:
# Documentation added to cell 18
# Enhanced Documentation
post_df["post_summary"] = post_df["post_summary"].apply(convert)
post_df.head(2)

In [None]:
# Documentation added to cell 19
# Enhanced Documentation
post_df.shape

In [None]:
# Helper that unwraps the leading element of an indexable value.
def collapse(L):
    """Return the element at index 0 of ``L``."""
    first_item = L[0]
    return first_item

In [None]:
# Documentation added to cell 21
# Enhanced Documentation
post_df['post_summary'] = post_df['post_summary'].apply(lambda x: x[0] if isinstance(x, list) and x else x)
post_df.head(2)

#### Extract All Users

In [None]:
# Request page 1 of the all-users endpoint with page_size=1000.
user_url = f"{base_url}/users/get_all?page=1&page_size=1000"
# A timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() surfaces HTTP errors instead of parsing an error body.
response = requests.get(user_url, headers=headers, timeout=30)
response.raise_for_status()
json_data = response.json()
json_data

In [None]:
# Reduce the raw users payload to the columns used in this notebook.
def extract_user_data(data):
    """Extract the needed fields for every user in an API response.

    Parameters:
    - data: dict expected to hold a list of user records under 'users'.

    Returns:
    - A list with one dict per user (keys: user_id, user, username, bio,
      post_count, follower_count, following_count). An empty list is
      returned when 'users' is missing, null or empty.
    """
    users = data.get('users') if isinstance(data, dict) else None
    if not users:
        return []

    return [
        {
            'user_id': user.get('id', ''),
            # `or ''` also covers name fields that are present but null,
            # which would otherwise make the concatenation raise TypeError.
            'user': (user.get('first_name') or '') + (user.get('last_name') or ''),
            'username': user.get('username', ''),
            'bio': user.get('bio', ''),
            'post_count': user.get('post_count', 0),
            'follower_count': user.get('follower_count', 0),
            'following_count': user.get('following_count', 0),
        }
        for user in users
    ]
clean_data = extract_user_data(json_data)

In [None]:
# Documentation added to cell 25
# Enhanced Documentation
user_df = pd.DataFrame(clean_data)
user_df.head(2)

### 1.2) Data Exploration

In [None]:
# Documentation added to cell 27
# Enhanced Documentation
"""
We have:
post_df   - all posts
viewed_df - all viewed posts
user_df   - all users

"""
# Looking at user_df
user_df.info()

In [None]:
# Count the null values in each column of user_df
user_df.isnull().sum()

In [None]:
# Documentation added to cell 29
# Enhanced Documentation
# Mathematical Analysis of user_df
user_df.describe()

In [None]:
# Documentation added to cell 30
# Enhanced Documentation
"""
From this we can infer that 75% of users have not posted more than 40 videos
"""
user_df[user_df["post_count"] > 50]

In [None]:
# Documentation added to cell 31
# Enhanced Documentation
user_df.duplicated().sum()

In [None]:
# Documentation added to cell 32
# Enhanced Documentation
# Looking at post_df
post_df.info()

From this we can infer that there are no null values

In [None]:
# Documentation added to cell 34
# Enhanced Documentation
post_df.describe()

In [None]:
# Documentation added to cell 35
# Enhanced Documentation
post_df[post_df['rating-count'] >= 5].count()

In [None]:
# Documentation added to cell 36
# Enhanced Documentation
# Looking at viewed_df
viewed_df.info()

From this we can infer that there are no null values

In [None]:
# Documentation added to cell 38
# Enhanced Documentation
viewed_df.describe()

## 2.) Algorithm Development

### 2.1) Cold Start Problem Handling - Popularity Based Recommendations
Using post_df, we combine comment_count, view_count, upvote_count and average_rating into a new feature, "trending_score", and then pick the top trending posts.

In [None]:
# Weighted popularity score used for the cold-start recommendations.
def calculate_trending_score(recommend_df, view_weight=0.5, upvote_weight=0.1, comment_weight=0.1, avg_rating_weight=0.3):
    """Return a copy of ``recommend_df`` with a 'trending_score' column.

    The score is the weighted sum of 'view_count', 'upvote_count',
    'comment_count' and 'average_rating'; missing values in those columns
    count as 0. The input frame itself is not modified.
    """
    weighted_columns = (
        ('view_count', view_weight),
        ('upvote_count', upvote_weight),
        ('comment_count', comment_weight),
        ('average_rating', avg_rating_weight),
    )

    # fillna returns a new frame, so the caller's data stays untouched.
    scored = recommend_df.fillna({column: 0 for column, _ in weighted_columns})

    # Accumulate the terms in the listed order: views, upvotes, comments, rating.
    first_column, first_weight = weighted_columns[0]
    total = scored[first_column] * first_weight
    for column, weight in weighted_columns[1:]:
        total = total + (scored[column] * weight)

    scored['trending_score'] = total
    return scored

# Apply the trending score to post_df
post_df = calculate_trending_score(post_df)
post_df.head(2)

In [None]:
# Rank the videos by trending score.
def get_trending_recommendations(recommend_df, top_n=10):
    """Return the ``top_n`` rows with the highest 'trending_score'.

    Rows are sorted by score (descending) and only the first row per 'id'
    is kept. The result has the columns id, title, category and
    trending_score. An empty input prints a notice and yields an empty
    DataFrame.
    """
    if recommend_df.empty:
        print("No data available to recommend trending videos.")
        return pd.DataFrame()

    output_columns = ['id', 'title', 'category', 'trending_score']
    ranked = (
        recommend_df
        .sort_values(by='trending_score', ascending=False)
        .drop_duplicates(subset='id')  # keep the best-scored row of each id
    )
    return ranked.head(top_n)[output_columns]

# Example usage
top_recommendations = get_trending_recommendations(post_df)
top_recommendations

### 2.2) Content Based Recommendation

In [None]:
# Table for content-based filtering: post id, title and a text 'tags' column.
# .copy() makes content_df an independent frame, so adding 'tags' to it does
# not trigger pandas' SettingWithCopyWarning.
content_df = post_df[['id', 'title']].copy()
content_df['tags'] = post_df['post_summary'].astype(str)
content_df.head(2)

In [None]:
# Documentation added to cell 45
# Enhanced Documentation
# Applying stemming
stemmer = PorterStemmer()

In [None]:
# Reduce every whitespace-separated token of a text to its stem.
def stem(text):
    """Stem each token of ``text`` with the module-level ``stemmer``.

    The text is split on whitespace and the stemmed tokens are joined back
    together with single spaces.
    """
    return " ".join(stemmer.stem(token) for token in text.split())

In [None]:
# Documentation added to cell 47
# Enhanced Documentation
content_df['tags'] = content_df['tags'].apply(stem)

In [None]:
# Documentation added to cell 48
# Enhanced Documentation
# creating a vector of tags
vectorizer = CountVectorizer(max_features=5000, stop_words='english')

In [None]:
# Documentation added to cell 49
# Enhanced Documentation
vectors = vectorizer.fit_transform(content_df['tags']).toarray()

In [None]:
# Documentation added to cell 50
# Enhanced Documentation
vectorizer.get_feature_names_out()

In [None]:
# Documentation added to cell 51
# Enhanced Documentation
# Finding Cosine similarity of vectors
similarity = cosine_similarity(vectors)

In [None]:
# Content-based recommendation function
def content_based_recommend(video, top_n=5):
    """Recommend the titles most similar to ``video``.

    Uses the module-level ``content_df`` (for titles) and ``similarity``
    (indexed by row position).

    Parameters:
    - video: str, title of the reference video.
    - top_n: int, number of titles to return (default 5).

    Returns:
    - A list of up to ``top_n`` titles ordered from most to least similar,
      or a message string when ``video`` is not in the dataset.
    """
    titles = content_df['title'].tolist()
    if video not in titles:
        return f"Video '{video}' not found in the dataset."

    # Row position of the first post carrying this title.
    video_pos = titles.index(video)
    scores = similarity[video_pos]

    # Exclude the query row explicitly: with tied scores (e.g. duplicate
    # posts) the query is not guaranteed to sort first, so dropping
    # "the first result" could return the query itself.
    candidates = [pos for pos in range(len(scores)) if pos != video_pos]
    candidates.sort(key=lambda pos: scores[pos], reverse=True)

    return [titles[pos] for pos in candidates[:top_n]]

In [None]:
# Documentation added to cell 53
# Enhanced Documentation
print(content_based_recommend('Why fit in..?'))

### 2.3) Collaborative Recommendations Algorithm

In [None]:
# Documentation added to cell 55
# Enhanced Documentation
!pip install surprise

In [None]:
# Documentation added to cell 56
# Enhanced Documentation
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

In [None]:
# Step 1: Prepare data for collaborative filtering.
# Each row pairs a 'username' with a post 'id' taken from viewed_df.
# .copy() yields an independent frame, so columns can be added to it without
# pandas' SettingWithCopyWarning; the selected columns already carry the
# required names, so no renaming is needed.
interaction_data = viewed_df[['username', 'id']].copy()

In [None]:
# Documentation added to cell 58
# Enhanced Documentation
# Aggregate multiple metrics into a single score
interaction_data['interaction_score'] = (
    viewed_df['view_count'] * 0.4 +  # Weight for view count
    viewed_df['upvote_count'] * 0.3 +  # Weight for upvotes
    viewed_df['average_rating'] * 0.2 +  # Weight for average rating
    viewed_df['comment_count'] * 0.1  # Weight for comments
)


In [None]:
# Documentation added to cell 59
# Enhanced Documentation


In [None]:
# Documentation added to cell 60
# Enhanced Documentation
# Drop duplicates for the same id and username
interaction_data = interaction_data.drop_duplicates(subset=['id', 'username'], keep='first')

# Clip the interaction_score between 1 and 10
interaction_data['interaction_score'] = interaction_data['interaction_score'].clip(1, 10)
len(interaction_data)

In [None]:
# Documentation added to cell 61
# Enhanced Documentation
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(
    interaction_data[['username', 'id', 'interaction_score']],
    reader
)

In [None]:
# Documentation added to cell 62
# Enhanced Documentation
# Step 3: Train-Test Split
trainset, testset = train_test_split(data, test_size=0.2)

In [None]:
# Documentation added to cell 63
# Enhanced Documentation
# Step 4: Use SVD (Singular Value Decomposition) for Matrix Factorization
algo = SVD()
algo.fit(trainset)

In [None]:
# Step 5: Evaluate the model on the held-out test set.
predictions = algo.test(testset)
# accuracy.rmse prints its own "RMSE: ..." line by default; verbose=False
# avoids reporting the value twice.
print(f'RMSE: {accuracy.rmse(predictions, verbose=False)}')  # Root Mean Squared Error

In [None]:
# Step 6: Function to recommend videos for a specific user
def collaborative_recommendations(username, post_df, algo, top_n=10):
    """Recommend posts for ``username`` ranked by predicted score.

    Parameters:
    - username: user identifier passed to ``algo.predict``.
    - post_df: DataFrame with at least 'id', 'title' and 'category'.
    - algo: fitted model exposing ``predict(user, item)`` whose result has
      ``iid`` and ``est`` attributes.
    - top_n: int, maximum number of recommendations.

    Returns:
    - DataFrame with columns id, title, category: at most ``top_n`` rows,
      one per post id, ordered from highest to lowest predicted score.

    Posts already paired with the user in the module-level
    ``interaction_data`` are never recommended.
    """
    # A set makes the "already seen" check O(1) per post.
    seen_ids = set(
        interaction_data.loc[interaction_data['username'] == username, 'id'].tolist()
    )

    # One candidate row per post id, excluding already-seen posts.
    candidates = post_df.drop_duplicates(subset='id')
    candidates = candidates[~candidates['id'].isin(seen_ids)]

    # Predict a score for every unseen post and keep the best top_n.
    predictions = [algo.predict(username, post_id) for post_id in candidates['id'].tolist()]
    predictions.sort(key=lambda pred: pred.est, reverse=True)
    rank_by_id = {pred.iid: rank for rank, pred in enumerate(predictions[:top_n])}

    recommended_posts = candidates[candidates['id'].isin(list(rank_by_id))][['id', 'title', 'category']]

    # Filtering with isin() keeps post_df's row order; reorder the rows so the
    # result follows the predicted ranking.
    row_order = sorted(
        range(len(recommended_posts)),
        key=lambda pos: rank_by_id[recommended_posts['id'].iloc[pos]],
    )
    return recommended_posts.iloc[row_order]


In [None]:
# Documentation added to cell 66
# Enhanced Documentation
user_df.iloc[159]

In [None]:
# Step 7: Test recommendations for a user
username = user_df['username'].iloc[4]  # Example: the user at positional index 4 (fifth row) of user_df
recommended_videos = collaborative_recommendations(username, post_df, algo)
print(recommended_videos)


In [None]:
# Documentation added to cell 68
# Enhanced Documentation


### 2.4) Hybrid Recommendation Algorithm

In [None]:
# Hybrid recommender: content-based and collaborative results combined.
def hybrid_recommender(username, video_title, post_df, algo, content_weight=0.5, collab_weight=0.5, top_n=10):
    """
    Combine content-based and collaborative filtering recommendations.

    Each post recommended by the content-based step receives
    ``content_weight``, each post recommended by the collaborative step
    receives ``collab_weight``; a post recommended by both receives the sum.

    Parameters:
    - username: str, the username for collaborative recommendations.
    - video_title: str, the video title for content-based recommendations.
    - post_df: DataFrame, containing post data with 'id', 'title', 'category'.
    - algo: Collaborative filtering algorithm object.
    - content_weight: float, weight for content-based recommendations.
    - collab_weight: float, weight for collaborative recommendations.
    - top_n: int, number of recommendations to return.

    Returns:
    - DataFrame of top hybrid recommendations with columns
      id, title, category, score (highest score first).
    """
    columns = ['id', 'title', 'category']

    # Get content-based recommendations. An unknown title is reported as a
    # message string rather than a list; treat it as "no content matches".
    content_titles = content_based_recommend(video_title)
    if isinstance(content_titles, str):
        content_titles = []

    # Map the recommended titles to their posts. 'category' must be included:
    # rows without it would carry NaN in a grouping key and be dropped by
    # groupby, and could never merge with the collaborative rows.
    content_part = (
        post_df[post_df['title'].isin(content_titles)][columns]
        .drop_duplicates(subset='id')
        .assign(score=content_weight)
    )

    # Get collaborative recommendations; assign() works on a new frame, so no
    # column is written into a slice of post_df.
    collab_part = (
        collaborative_recommendations(username, post_df, algo, top_n=top_n)[columns]
        .drop_duplicates(subset='id')
        .assign(score=collab_weight)
    )

    # Combine both lists, sum the scores per post and sort by highest score.
    combined_recommendations = pd.concat([content_part, collab_part], ignore_index=True)
    combined_recommendations = (
        combined_recommendations.groupby(columns, as_index=False)['score']
        .sum()
        .sort_values(by='score', ascending=False)
    )

    # Return top N recommendations
    return combined_recommendations.head(top_n)


In [None]:
# Documentation added to cell 71
# Enhanced Documentation
print(hybrid_recommender('kinha', 'do it now', post_df, algo))

## 3.) Evaluation Metrics

### 3.1) CTR

In [None]:
# Documentation added to cell 74
# Enhanced Documentation
merged_df = viewed_df.merge(post_df, on=['id', 'username', 'category', 'title'], suffixes=('_viewed', '_post'))
merged_df.info()

In [None]:
# Engagement-based "CTR" metric per post, per user and per merged row.
def calculate_ctr(user_df, viewed_df, post_df):
    """Compute a CTR percentage for every viewed post.

    ``viewed_df`` and ``post_df`` are joined on id, username, category and
    title. For each joined row:

        CTR = (shares + comments + upvotes) / (views + exits) * 100

    where comments, upvotes and views come from ``post_df`` and both views
    and exits are raised to at least 1 beforehand. ``user_df`` is accepted
    but not used.

    Returns:
    - (post_ctr, user_ctr, detail): mean CTR per title, mean CTR per
      username, and the per-row figures used in the calculation.
    """
    join_keys = ['id', 'username', 'category', 'title']
    merged_df = viewed_df.merge(post_df, on=join_keys, suffixes=('_viewed', '_post'))

    # Floor both denominator terms at 1 so the division can never hit zero.
    for column in ('exit_count', 'view_count_post'):
        merged_df[column] = np.maximum(merged_df[column], 1)

    interactions = (
        merged_df['share_count']
        + merged_df['comment_count_post']
        + merged_df['upvote_count_post']
    )
    exposure = merged_df['view_count_post'] + merged_df['exit_count']
    merged_df['CTR'] = (interactions / exposure) * 100

    # Average the per-row CTR by post title and by username.
    post_ctr = merged_df.groupby('title')['CTR'].mean().reset_index()
    user_ctr = merged_df.groupby('username')['CTR'].mean().reset_index()

    detail_columns = [
        'id', 'username', 'share_count', 'upvote_count_post',
        'comment_count_post', 'view_count_post', 'exit_count', 'CTR',
    ]
    return post_ctr, user_ctr, merged_df[detail_columns]


In [None]:
# Documentation added to cell 76
# Enhanced Documentation
post_ctr, user_ctr, detailed_ctr = calculate_ctr(user_df, viewed_df, post_df)
print("CTR by Post:\n", post_ctr)
print("\nCTR by User:\n", user_ctr)
print("\nDetailed CTR:\n", detailed_ctr.head())


In [None]:
# Documentation added to cell 77
# Enhanced Documentation


### 3.2) MAP

In [None]:
# Collect per-user recommendation lists for the MAP evaluation.
def map_util(user_df, post_df, algo, video_title, top_n=10):
    """Build a {username: [recommended post ids]} mapping.

    Parameters:
    - user_df: DataFrame with a 'username' column.
    - post_df: DataFrame of posts handed to the recommender.
    - algo: fitted collaborative filtering model.
    - video_title: str, currently unused (only a hybrid recommender would
      need it); kept so existing calls keep working.
    - top_n: int, number of recommendations requested per user.

    Returns:
    - dict mapping each username with at least one recommendation to the
      list of recommended post ids.
    """
    recommendations = {}
    for user in user_df['username']:
        # Use the collaborative recommender and honour the requested top_n.
        recs = collaborative_recommendations(user, post_df, algo, top_n=top_n)
        if not recs.empty:
            recommendations[user] = recs['id'].tolist()
    return recommendations


In [None]:
# Documentation added to cell 80
# Enhanced Documentation
viewed_df.iloc[51]['title']

In [None]:
# Documentation added to cell 81
# Enhanced Documentation
recommendation = map_util(user_df, post_df, algo, 'Trading memecoins #memecoin #solana #bitcoin #funny #usa #fyp #animation #animationmeme #memecoins #2d #foryou #foryoupage #repost #viraltiktok', top_n=20)

In [None]:
# Documentation added to cell 82
# Enhanced Documentation
recommendation

In [None]:
# Documentation added to cell 83
# Enhanced Documentation
viewed_df.iloc[0]

In [None]:
# Mean Average Precision of the recommendation lists.
def calculate_map(viewed_df, post_df, recommendations):
    """
    Calculate the Mean Average Precision (MAP) for the recommendations.

    For each user the precision at every rank holding a viewed post is
    summed and divided by the number of distinct posts the user viewed;
    MAP is the mean of these values. Users without any viewed post are
    skipped.

    Args:
        viewed_df: DataFrame containing the viewed posts for users
            ('username' and 'id' columns are used).
        post_df: DataFrame containing post information (not used).
        recommendations: Dictionary of {user: [recommended_post_ids]}.

    Returns:
        MAP score as a float (0 when no user could be evaluated).
    """
    # NOTE(review): recommendation lists that already exclude the posts a
    # user has viewed can never contain a hit here - confirm that the lists
    # passed in are meant to be compared against viewed_df.

    # Group the viewed post ids once instead of filtering viewed_df per user.
    viewed_by_user = {
        user: set(ids.tolist())
        for user, ids in viewed_df.groupby('username')['id']
    }

    map_scores = []
    for user, recommended_posts in recommendations.items():
        viewed_posts = viewed_by_user.get(user)
        if not viewed_posts:
            continue

        # Average Precision: accumulate precision@rank at every hit.
        hits = 0
        precision_sum = 0.0
        for rank, post_id in enumerate(recommended_posts, start=1):
            if post_id in viewed_posts:
                hits += 1
                precision_sum += hits / rank

        # Normalize by the number of relevant items (viewed posts).
        map_scores.append(precision_sum / len(viewed_posts))

    # Compute and return Mean Average Precision (MAP)
    return sum(map_scores) / len(map_scores) if map_scores else 0


In [None]:
# Documentation added to cell 85
# Enhanced Documentation
# Calculate MAP for the given data and recommendations
map_score = calculate_map(viewed_df, post_df, recommendation)

print(f"Mean Average Precision (MAP): {map_score}")

In [None]:
# Documentation added to cell 86
# Enhanced Documentation


In [None]:
# Documentation added to cell 87
# Enhanced Documentation


In [None]:
# Documentation added to cell 88
# Enhanced Documentation


In [None]:
# Documentation added to cell 89
# Enhanced Documentation


In [None]:
# Documentation added to cell 90
# Enhanced Documentation
