# Video Recommendation Algorithm 

### Importing Required Libraries

In [98]:
import numpy as np
import pandas as pd
import requests
from IPython.display import JSON
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem.porter import PorterStemmer
from sklearn.metrics.pairwise import cosine_similarity

## 1.) Data Preprocessing

### 1.1) Extracting data from the API

In [99]:
base_url = "https://api.socialverseapp.com"
headers = {
    "Flic-Token" : "flic_b9c73e760ec8eae0b7468e7916e8a50a8a60ea7e862c32be44927f5a5ca69867"
}

#### All Viewed Posts of Users

In [100]:
viewed_url = f"{base_url}/posts/view?page=1&page_size=1000&resonance_algorithm=resonance_algorithm_cjsvervb7dbhss8bdrj89s44jfjdbsjd0xnjkbvuire8zcjwerui3njfbvsujc5if"
response = requests.get(viewed_url ,headers=headers)
json_data = response.json()

In [101]:
# Extracting data from JSON
data = []
for entry in json_data.get("posts", []):
    data.append({
        "id": entry["id"],
        "category_id": entry["category"]["id"],
        "category": entry["category"]["name"],
        "User": entry["first_name"]+entry["last_name"],
        "username": entry["username"],
        # "post_slug": entry["slug"],
        "title": entry["title"],
        "identifier": entry["identifier"],
        "comment_count": entry["comment_count"],
        "upvote_count": entry["upvote_count"],
        "view_count": entry["view_count"],
        "exit_count": entry["exit_count"],
        "rating_count": entry["rating_count"],
        "average_rating": entry["average_rating"],
        "share_count": entry["share_count"],
        "upvoted": entry["upvoted"],
        "bookmarked": entry["bookmarked"],
        "following": entry["following"]
    })
    
viewed_df = pd.DataFrame(data)

In [102]:
viewed_df.head()

Unnamed: 0,id,category_id,category,User,username,title,identifier,comment_count,upvote_count,view_count,exit_count,rating_count,average_rating,share_count,upvoted,bookmarked,following
0,1292,22,SolTok,SachinKinha,kinha,Day 22 turning $100 into 100000 by trading mem...,7LUUxsH,3,3,76,497,14,25,3,False,False,False
1,1265,22,SolTok,SachinKinha,kinha,#crypto #cryptotrading #memecoin #solmemecoins...,L846dJS,0,0,35,451,4,25,0,False,False,False
2,1292,22,SolTok,SachinKinha,kinha,Day 22 turning $100 into 100000 by trading mem...,7LUUxsH,3,3,76,497,14,25,3,False,False,False
3,1291,22,SolTok,SachinKinha,kinha,Can Moo Deng’s MEME COIN GO HIGHER_ #shorts,ajMOQkc,0,0,24,361,5,23,0,False,False,False
4,1306,22,SolTok,SachinKinha,kinha,Culture of Solana Token $COST. # 2024 Trum...,3WnTZUH,0,0,29,462,6,24,0,False,False,False


In [103]:
viewed_df = viewed_df.drop_duplicates(subset=['id', 'username'], keep='first')

In [104]:
viewed_df.shape

(280, 17)

#### Extract All Posts

In [105]:
post_url = f"{base_url}/posts/summary/get?page=1&page_size=1000"
response = requests.get(post_url, headers=headers)
json_data = response.json()

In [106]:
json_data.keys()

dict_keys(['status', 'message', 'page', 'max_page_size', 'page_size', 'posts'])

In [107]:
json_data["posts"]

[{'id': 11,
  'category': {'id': 2,
   'name': 'Vible',
   'count': 534,
   'description': 'All the best vibes!',
   'image_url': 'https://assets.socialverseapp.com/categories/a6861bd94e900fe7b6a88b72fd1975ea'},
  'slug': 'recipe-for-a-flow-state',
  'title': 'Recipe for a flow state',
  'identifier': 'OSsJAMz',
  'comment_count': 0,
  'upvote_count': 2,
  'view_count': 27,
  'exit_count': 0,
  'rating_count': 6,
  'average_rating': 42,
  'share_count': 8,
  'video_link': 'https://video-cdn.socialverseapp.com/michael_9f5b241d-d40e-4946-be5d-711448b7e0f4.mp4',
  'contract_address': '',
  'chain_id': '',
  'chart_url': '',
  'baseToken': {'address': '', 'name': '', 'symbol': '', 'image_url': ''},
  'is_locked': False,
  'created_at': 1698088807000,
  'first_name': 'Michael',
  'last_name': 'Dadzie',
  'username': 'afrobeezy',
  'upvoted': True,
  'bookmarked': False,
  'thumbnail_url': 'https://video-cdn.socialverseapp.com/michael_9f5b241d-d40e-4946-be5d-711448b7e0f4.0000002.jpg',
  'gif

In [108]:
posts = json_data["posts"]
data = []

for post in posts:
    data.append({
        'id': post['id'],
        'title': post['title'],
        'category': post['category']['name'],
        'username': post['username'],
        'view_count': post['view_count'],
        'upvote_count': post['upvote_count'],
        'comment_count': post['comment_count'],
        'rating-count': post['rating_count'],
        'average_rating': post['average_rating'],
        'post_summary': post['post_summary']
    })

post_df = pd.DataFrame(data)
post_df.head(2)

Unnamed: 0,id,title,category,username,view_count,upvote_count,comment_count,rating-count,average_rating,post_summary
0,11,Recipe for a flow state,Vible,afrobeezy,27,2,0,6,42,{'actions': {'key_events': ['discussion about ...
1,12,Why fit in..?,Vible,afrobeezy,3,2,0,0,0,{'actions': {'key_actions': ['Character expres...


In [109]:
post_df = post_df.drop_duplicates(subset=['id', 'username'], keep='first')
post_df.shape

(1000, 10)

In [110]:
# utility functions to convert json data of post_summary column to list of strings
def mergeList(L):
    """ This function takes a nested list of strings and merges them to one list of string
    """
    if isinstance(L, list):
        return ", ".join(str(item) for item in L)  
    return str(L) if L is not None else ""

def convert(text):
    """ This function cleans the json formatting of the post_summary column
    """
    if not isinstance(text, dict):  # Check if text is a dictionary
        print("Error: Expected dictionary, got:", type(text))
        return []
    L = []
    descr = text["description"]
    if isinstance(descr, list): descr = mergeList(descr)
    genre = text.get("genre", "")
    if not isinstance(genre, str):
        genre = " "

    # Handle actions based on data type
    if isinstance(text["actions"], list):
        # action = " ".join(text["actions"])
        action = " ".join([str(item) if isinstance(item, str) else mergeList(item.values()) for item in text["actions"]])
    elif isinstance(text["actions"], dict):
        first_key = list(text["actions"].keys())[0]
        action = mergeList(text["actions"][first_key])
        if isinstance(action, dict):
            action = ""
    else:
        action = ""
    
    # Handle emotions based on data type
    if isinstance(text["emotions"], list):
        # emotion = " ".join(text["emotions"])
        emotion = " ".join([str(item) if isinstance(item, str) else mergeList(item.values()) for item in text["targeted_audiance"]])
    elif isinstance(text["emotions"], dict):
        first_key = list(text["emotions"].keys())[0]
        emotion = mergeList(text["emotions"][first_key]) 
    else:
        emotion = ""

    # Handle audience based on data type
    if isinstance(text["targeted_audiance"], list):
        # audience = " ".join(text["targeted_audiance"])
        audience = " ".join([str(item) if isinstance(item, str) else mergeList(item.values()) for item in text["targeted_audiance"]])
    elif isinstance(text["targeted_audiance"], dict):
        first_key = list(text["targeted_audiance"].keys())[0]
        audience = mergeList(text["targeted_audiance"][first_key])
    else:
        audience = ""
        
    # Handle psychological view based on its data type
    if isinstance(text["psycological_view_of_video"], list):
        # psych = " ".join(text["psycological_view_of_video"])
        psych = " ".join([str(item) if isinstance(item, str) else mergeList(item.values()) for item in text["psycological_view_of_video"]])
    elif isinstance(text["psycological_view_of_video"], dict):
        first_key = list(text["psycological_view_of_video"].keys())[0]
        psych = mergeList(text["psycological_view_of_video"][first_key])
    else:
        psych = ""

    L.append(action+descr+emotion+genre+audience+psych)
    return L

In [111]:
post_df["post_summary"] = post_df["post_summary"].apply(convert)

Error: Expected dictionary, got: <class 'list'>


In [112]:
post_df.shape

(1000, 10)

In [113]:
def collapse(L):
    """ This function converts thee list of string to one string object
    """
    return L[0]

In [114]:
post_df['post_summary'] = post_df['post_summary'].apply(lambda x: x[0] if isinstance(x, list) and x else x)
post_df.head(2)

Unnamed: 0,id,title,category,username,view_count,upvote_count,comment_count,rating-count,average_rating,post_summary
0,11,Recipe for a flow state,Vible,afrobeezy,27,2,0,6,42,"discussion about flow state, demonstration of ..."
1,12,Why fit in..?,Vible,afrobeezy,3,2,0,0,0,"Character expressing inner turmoil, Transition..."


#### Extract All Users

In [115]:
user_url = f"{base_url}/users/get_all?page=1&page_size=1000"
response = requests.get(user_url, headers=headers)
json_data = response.json()
json_data

{'status': 'success',
 'message': 'Users fetched successfully',
 'page': 1,
 'max_page_size': 1000,
 'page_size': 1000,
 'users': [{'id': 1,
   'first_name': 'Michael',
   'last_name': 'Dadzie',
   'username': 'afrobeezy',
   'email': 'dadziemikke@outlook.com',
   'role': 'A',
   'profile_url': 'https://assets.socialverseapp.com/profile/afrobeezy1704186478image_cropper_B57BCBB5-40B6-4B0A-8710-940B7292DC11-2518-00000201B79852C3.jpg.png',
   'bio': 'Digitial Nomad | iOS Developer',
   'website_url': 'https://www.github.com/michaeldadzie',
   'instagram-url': 'https://www.instagram.com/michaeldadziie',
   'youtube_url': '',
   'tictok_url': '',
   'isVerified': False,
   'referral_code': 'afrob_dd42',
   'has_wallet': True,
   'last_login': '2024-11-15 07:53:32',
   'share_count': 0,
   'post_count': 57,
   'following_count': 7,
   'follower_count': 15,
   'is_verified': False,
   'is_online': False,
   'latitude': '',
   'longitude': ''},
  {'id': 2,
   'first_name': 'John',
   'last_nam

In [116]:
def extract_user_data(data):
    # Check if 'users' key exists in the input data
    if 'users' not in data:
        return []

    # Extract only the needed fields for each user
    extracted_data = [
        {
            'user_id': user.get('id', ''),
            'user': user.get('first_name', '')+user.get('last_name', ''),
            'username': user.get('username', ''),
            'bio': user.get('bio', ''),
            'post_count': user.get('post_count', 0),
            'follower_count': user.get('follower_count', 0),
            'following_count': user.get('following_count', 0)
        }
        for user in data['users']
    ]
    
    return extracted_data
clean_data = extract_user_data(json_data)

In [117]:
user_df = pd.DataFrame(clean_data)
user_df.head(2)

Unnamed: 0,user_id,user,username,bio,post_count,follower_count,following_count
0,1,MichaelDadzie,afrobeezy,Digitial Nomad | iOS Developer,57,15,7
1,2,JohnDoe,doey,,0,0,0


### 1.2) Data Exploration

In [118]:
"""
we have 
post_df - all post
viewed_df - all vwed post
user_df - all users

"""
# looking at users_df
user_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   user_id          1000 non-null   int64 
 1   user             1000 non-null   object
 2   username         1000 non-null   object
 3   bio              1000 non-null   object
 4   post_count       1000 non-null   int64 
 5   follower_count   1000 non-null   int64 
 6   following_count  1000 non-null   int64 
dtypes: int64(4), object(3)
memory usage: 54.8+ KB


In [119]:
# chceking null values in user_df
user_df.isnull().sum()

user_id            0
user               0
username           0
bio                0
post_count         0
follower_count     0
following_count    0
dtype: int64

In [120]:
user_df.drop(columns='bio', inplace=True) # removing bio columns as most of it is mostly empty string

In [121]:
# Mathematical Analysis of user_df
user_df.describe()

Unnamed: 0,user_id,post_count,follower_count,following_count
count,1000.0,1000.0,1000.0,1000.0
mean,510.122,1.01,0.23,0.217
std,293.581486,28.650318,1.609251,1.193036
min,1.0,0.0,0.0,0.0
25%,256.75,0.0,0.0,0.0
50%,511.5,0.0,0.0,0.0
75%,762.25,0.0,0.0,0.0
max,1019.0,904.0,40.0,20.0


Form this we can infer that 75% of people have not posted more than 40 videos

In [122]:
user_df[user_df["post_count"] > 50]

Unnamed: 0,user_id,user,username,post_count,follower_count,following_count
0,1,MichaelDadzie,afrobeezy,57,15,7
4,5,SachinKinha,kinha,904,40,15


In [123]:
user_df.duplicated().sum()

0

In [124]:
# Looking at post_df
post_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 10 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              1000 non-null   int64 
 1   title           1000 non-null   object
 2   category        1000 non-null   object
 3   username        1000 non-null   object
 4   view_count      1000 non-null   int64 
 5   upvote_count    1000 non-null   int64 
 6   comment_count   1000 non-null   int64 
 7   rating-count    1000 non-null   int64 
 8   average_rating  1000 non-null   int64 
 9   post_summary    1000 non-null   object
dtypes: int64(6), object(4)
memory usage: 78.2+ KB


From this we can infer that there are no null values

In [125]:
post_df.describe()

Unnamed: 0,id,view_count,upvote_count,comment_count,rating-count,average_rating
count,1000.0,1000.0,1000.0,1000.0,1000.0,1000.0
mean,728.663,14.244,1.171,0.265,2.437,11.13
std,370.868792,36.350596,3.647885,1.263476,5.946545,15.990397
min,11.0,0.0,0.0,0.0,0.0,0.0
25%,439.75,1.0,0.0,0.0,0.0,0.0
50%,760.5,4.0,0.0,0.0,0.0,0.0
75%,1040.25,13.0,1.0,0.0,3.0,25.0
max,1334.0,473.0,60.0,18.0,66.0,87.0


In [126]:
post_df[post_df['rating-count'] >= 5].count()

id                193
title             193
category          193
username          193
view_count        193
upvote_count      193
comment_count     193
rating-count      193
average_rating    193
post_summary      193
dtype: int64

In [127]:
# Looking at viewed_df
viewed_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 280 entries, 0 to 999
Data columns (total 17 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              280 non-null    int64 
 1   category_id     280 non-null    int64 
 2   category        280 non-null    object
 3   User            280 non-null    object
 4   username        280 non-null    object
 5   title           280 non-null    object
 6   identifier      280 non-null    object
 7   comment_count   280 non-null    int64 
 8   upvote_count    280 non-null    int64 
 9   view_count      280 non-null    int64 
 10  exit_count      280 non-null    int64 
 11  rating_count    280 non-null    int64 
 12  average_rating  280 non-null    int64 
 13  share_count     280 non-null    int64 
 14  upvoted         280 non-null    bool  
 15  bookmarked      280 non-null    bool  
 16  following       280 non-null    bool  
dtypes: bool(3), int64(9), object(5)
memory usage: 33.6+ KB


From this we can infer that there are no null values

In [128]:
viewed_df.describe()

Unnamed: 0,id,category_id,comment_count,upvote_count,view_count,exit_count,rating_count,average_rating,share_count
count,280.0,280.0,280.0,280.0,280.0,280.0,280.0,280.0,280.0
mean,895.428571,7.489286,0.646429,2.217857,30.892857,178.296429,4.821429,25.514286,0.207143
std,428.748201,8.337841,2.168524,6.225658,58.038141,212.739752,7.247066,16.241672,0.761967
min,11.0,2.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
25%,640.25,2.0,0.0,0.0,7.0,0.0,1.0,17.0,0.0
50%,1075.5,2.0,0.0,1.0,15.0,3.0,3.0,28.0,0.0
75%,1250.25,13.0,0.0,2.0,31.0,412.0,6.0,38.0,0.0
max,1336.0,22.0,18.0,60.0,473.0,501.0,53.0,87.0,8.0


## 2.) Algorithm Development

### 2.1) Cold Start Problem Handling - Popularity Based Recommendations
Using post_df we can use the comment_count, view_count, upvote_count and average_rating as metrics to derive a new feature "trending_score" and then find the top trending post

In [129]:
def calculate_trending_score(recommend_df, view_weight=0.5, upvote_weight=0.1, comment_weight=0.1, avg_rating_weight=0.3):
    # Ensure no NaN values during calculation
    recommend_df = recommend_df.fillna({
        'view_count': 0,
        'upvote_count': 0,
        'comment_count': 0,
        'average_rating': 0
    })
    
    recommend_df['trending_score'] = (
        (recommend_df['view_count'] * view_weight) + 
        (recommend_df['upvote_count'] * upvote_weight) + 
        (recommend_df['comment_count'] * comment_weight) +
        (recommend_df['average_rating'] * avg_rating_weight)
    )
    return recommend_df

# Apply the trending score to post_df
post_df = calculate_trending_score(post_df)
post_df.head(2)

Unnamed: 0,id,title,category,username,view_count,upvote_count,comment_count,rating-count,average_rating,post_summary,trending_score
0,11,Recipe for a flow state,Vible,afrobeezy,27,2,0,6,42,"discussion about flow state, demonstration of ...",26.3
1,12,Why fit in..?,Vible,afrobeezy,3,2,0,0,0,"Character expressing inner turmoil, Transition...",1.7


In [130]:
# Ranking the videos by trending score
def get_trending_recommendations(recommend_df, top_n=10):
    if recommend_df.empty:
        print("No data available to recommend trending videos.")
        return pd.DataFrame()
    
    trending_videos = recommend_df.sort_values(by='trending_score', ascending=False)
    trending_videos = trending_videos.drop_duplicates(subset='id')  # Remove duplicates by ID
    
    # Select the top N videos
    return trending_videos.head(top_n)[['id', 'title', 'category', 'trending_score']]

# Example usage
top_recommendations = get_trending_recommendations(post_df)
top_recommendations

Unnamed: 0,id,title,category,trending_score
770,1064,What is DAI Stablecoin,Pumptok,244.3
768,1062,Did you miss out on $PEPE People turned $100...,Pumptok,215.9
772,1066,Silicon Valley Bank has sent shockwaves throug...,Pumptok,211.0
771,1065,Why Should I Buy $BNB === People always ask WH...,Pumptok,204.0
89,159,,Vible,159.4
346,560,,Vible,147.6
342,549,,Vible,88.1
363,590,,Vible,86.1
100,173,,Vible,79.8
24,44,escape the matrix,Vible,76.2


### 2.2) Content Based Recommendation

In [131]:
content_df = post_df[['id', 'title']]
content_df['tags'] = post_df['post_summary'].astype(str)
content_df.head(2)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  content_df['tags'] = post_df['post_summary'].astype(str)


Unnamed: 0,id,title,tags
0,11,Recipe for a flow state,"discussion about flow state, demonstration of ..."
1,12,Why fit in..?,"Character expressing inner turmoil, Transition..."


In [132]:
# Applying stemming
stemmer = PorterStemmer()

In [133]:
def stem(text):
    y = []
    for i in text.split():
        y.append(stemmer.stem(i))
    return " ".join(y)

In [134]:
content_df['tags'] = content_df['tags'].apply(stem)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  content_df['tags'] = content_df['tags'].apply(stem)


In [135]:
# creating a vector of tags
vectorizer = CountVectorizer(max_features=5000, stop_words='english')

In [136]:
vectors = vectorizer.fit_transform(content_df['tags']).toarray()

In [137]:
vectorizer.get_feature_names_out()

array(['00', '000', '05', ..., 'पत', 'बज', 'கடவ'], dtype=object)

In [138]:
# Finding Cosine similarity of vectors
similarity = cosine_similarity(vectors)

In [139]:
# Content-based recommendation function
def content_based_recommend(video):
    if video not in content_df['title'].values:
        return f"Video '{video}' not found in the dataset."
    
    video_index = content_df[content_df['title'] == video].index[0]
    distances = similarity[video_index]
    video_list = sorted(list(enumerate(distances)), reverse=True, key=lambda x: x[1])[1:11]
    
    recommendations = []
    for i in video_list:
        recommendations.append(content_df.iloc[i[0]][['id','title']])
    
    # Convert the list of recommended videos to a DataFrame
    recommendations_df = pd.DataFrame(recommendations)
    return recommendations_df

In [140]:
print(content_based_recommend('Why fit in..?'))

      id                                              title
146  259                                                   
409  650                                             NEVER.
599  872                                    Ramayan part 22
186  368                                                   
408  648  Don’t feel nothing for these people. Follow fo...
657  932                                    Ramayan part 57
302  502                                                   
659  934                                    Ramayan part 60
606  879                                    Ramayan part 29
411  652           Understand one thing about these people.


### 2.3) Collaborative Recommendations Algorithm

#### 2.3.1) Model Based Approach

In [141]:
from surprise import Dataset, Reader, SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

In [142]:
# Prepare Data for Collaborative Filtering
# Use 'user_id', 'post_id' and a feedback score as interaction (view count or upvote count)
interaction_data = viewed_df[['username', 'id']]
interaction_data.columns = ['username', 'id']

In [143]:
# Aggregate multiple metrics into a single score
interaction_data['interaction_score'] = (
    viewed_df['view_count'] * 0.4 +  # Weight for view count
    viewed_df['upvote_count'] * 0.3 +  # Weight for upvotes
    viewed_df['average_rating'] * 0.2 +  # Weight for average rating
    viewed_df['comment_count'] * 0.1  # Weight for comments
)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  interaction_data['interaction_score'] = (


In [144]:
# Clip the interaction_score between 1 and 10
interaction_data['interaction_score'] = interaction_data['interaction_score'].clip(1, 10)
len(interaction_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  interaction_data['interaction_score'] = interaction_data['interaction_score'].clip(1, 10)


280

In [145]:
reader = Reader(rating_scale=(1, 10))
data = Dataset.load_from_df(
    interaction_data[['username', 'id', 'interaction_score']],
    reader
)

In [146]:
# Train-Test Split
trainset, testset = train_test_split(data, test_size=0.2)

In [147]:
# Use SVD (Singular Value Decomposition) for Matrix Factorization
algo = SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x25088633ad0>

In [148]:
# Evaluate the Model
predictions = algo.test(testset)
print(f'RMSE: {accuracy.rmse(predictions)}')  # Root Mean Squared Error

RMSE: 2.6554
RMSE: 2.6553858790224156


In [149]:
# Function to Recommend Videos for a Specific User
def model_collab_recommend(username, post_df, algo, top_n=10):
    # Get a list of post IDs the user has not interacted with
    viewed_posts = interaction_data[interaction_data['username'] == username]['id'].tolist()
    all_posts = post_df['id'].tolist()
    posts_to_recommend = [post for post in all_posts if post not in viewed_posts]
    
    # Predict ratings for unseen posts
    predictions = [algo.predict(username, id) for id in posts_to_recommend]
    predictions.sort(key=lambda x: x.est, reverse=True)
    
    # Get top N recommendations
    top_predictions = predictions[:top_n]
    top_ids = [pred.iid for pred in top_predictions]
    
    # Return recommended posts
    recommended_posts = post_df[post_df['id'].isin(top_ids)][['id', 'title', 'category']]
    return recommended_posts


In [150]:
# Test Recommendations for a User
username = user_df['username'].iloc[4]  # Example: using the first user in user_df
recommended_videos = model_collab_recommend(username, post_df, algo)
print(recommended_videos)


       id                                              title     category
5      16                                           act now!        Vible
6      17                                   Jump. Just Jump!        Vible
8      23             Direct your energy on the right things        Vible
40     66  A new dawn of personalized computing is is fro...        E/ACC
707   997                                Words are like keys        Vible
776  1073  OvaDrive! Day 5 of sharing UPDATES till we rea...     OvaDrive
988  1323                                               🚗 ⏩⏩  FuseTrendz 
991  1326                            something different!!!!  FuseTrendz 
992  1327                                                💯💯💯  FuseTrendz 
998  1333  thereum crypto currency 3d modeling and animat...  FuseTrendz 


#### 2.3.2) Memory Based Approach

In [151]:
# Create item-user interaction matrix
def create_item_user_matrix(interaction_data, post_df):
    # Create a user-item matrix where rows are users and columns are items (videos)
    user_item_matrix = interaction_data.pivot(index='username', columns='id', values='interaction_score').fillna(0)
    return user_item_matrix

# Calculate item-item similarity
def calculate_item_similarity(user_item_matrix):
    item_similarity = cosine_similarity(user_item_matrix.T)  # Transpose the matrix to get item-item similarity
    item_similarity_df = pd.DataFrame(item_similarity, index=user_item_matrix.columns, columns=user_item_matrix.columns)
    return item_similarity_df


In [152]:
# Item-based recommendation function
def item_based_recommendations(username, user_item_matrix, item_similarity_df, top_n=10):
    # Check if user has interacted with any posts
    if username not in user_item_matrix.index or user_item_matrix.loc[username].sum() == 0:
        return f"User '{username}' has no interactions in the dataset. Cannot provide recommendations."

    # Get the list of videos the user has interacted with
    viewed_posts = user_item_matrix.loc[username]
    viewed_items = viewed_posts[viewed_posts > 0].index.tolist()
    
    # Calculate the recommendation scores for each video not interacted with
    scores = {}
    for item in item_similarity_df.columns:
        # Sum the similarity scores between the viewed items and the target item
        score = sum(item_similarity_df.loc[item, viewed_items] * viewed_posts[viewed_items])
        scores[item] = score

    # Sort the items by score and return the top N recommendations
    recommended_items = sorted(scores.items(), key=lambda x: x[1], reverse=True)[:top_n]
    
    # Get the recommended video titles from the post_df
    recommended_titles = post_df[post_df['id'].isin([item[0] for item in recommended_items])]['id'].tolist()
    return recommended_titles


In [153]:
def memory_collab_recommend(username ,user_df, user_item_matrix, item_similarity_df, post_df, top_n=10):
    """
    Generate item-based recommendations for all users in the dataset.
    """
    if username not in user_df['username'].values:
        return f"User '{username}' not found in the dataset."

    user_recommendations = item_based_recommendations(
        username, user_item_matrix, item_similarity_df, top_n
    )
    
    # Fetch video titles and IDs from post_df
    recommended_df = post_df[post_df['id'].isin(user_recommendations)][['id', 'title']].reset_index(drop=True)
    
    return recommended_df


In [154]:
user_item_matrix = create_item_user_matrix(interaction_data, post_df)
item_similarity_df = calculate_item_similarity(user_item_matrix)
username = "kinha"
recommended_videos = memory_collab_recommend(username, user_df, user_item_matrix, item_similarity_df, post_df)
print(recommended_videos)

    id                                  title
0   81  how are the 2023 fitness goals going?
1   82       Decide to be extraodinary and do
2  100                                       
3  130                                       
4  131                                       
5  133                                       
6  147                                       
7  148                                       
8  150                                       
9  152                                       


### 2.4) Hybrid Recommendation Algorithm

In [155]:
def hybrid_recommender(username, video_title, post_df, algo, content_weight=0.5, collab_weight=0.5, top_n=10):
    """
    Combine content-based and collaborative filtering recommendations.
    """
    # Get content-based recommendations
    content_recommendations = content_based_recommend(video_title)

    # Map content-based recommendations to their post IDs
    content_post_ids = post_df[post_df['title'].isin(content_recommendations)][['id', 'title']]

    # Get collaborative recommendations
    collab_recommendations = model_collab_recommend(username, post_df, algo, top_n=top_n)

    # Merge recommendations
    # Assign normalized scores to content-based and collaborative recommendations
    content_post_ids['score'] = content_weight
    collab_recommendations['score'] = collab_weight

    # Combine both recommendation lists
    combined_recommendations = pd.concat([content_post_ids, collab_recommendations], ignore_index=True)

    # Group by video ID, sum scores, and sort by the highest score
    combined_recommendations = (
        combined_recommendations.groupby(['id', 'title', 'category'], as_index=False)['score']
        .sum()
        .sort_values(by='score', ascending=False)
    )

    # Return top N recommendations
    return combined_recommendations.head(top_n)


In [156]:
print(hybrid_recommender('kinha', 'do it now', post_df, algo))

     id                                              title     category  score
0    16                                           act now!        Vible    0.5
1    17                                   Jump. Just Jump!        Vible    0.5
2    23             Direct your energy on the right things        Vible    0.5
3    66  A new dawn of personalized computing is is fro...        E/ACC    0.5
4   997                                Words are like keys        Vible    0.5
5  1073  OvaDrive! Day 5 of sharing UPDATES till we rea...     OvaDrive    0.5
6  1323                                               🚗 ⏩⏩  FuseTrendz     0.5
7  1326                            something different!!!!  FuseTrendz     0.5
8  1327                                                💯💯💯  FuseTrendz     0.5
9  1333  thereum crypto currency 3d modeling and animat...  FuseTrendz     0.5


## 3.) Evaluation Metrics

### 3.1) CTR

In [157]:
merged_df = viewed_df.merge(post_df, on=['id', 'username', 'category', 'title'], suffixes=('_viewed', '_post'))
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 278 entries, 0 to 277
Data columns (total 24 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     278 non-null    int64  
 1   category_id            278 non-null    int64  
 2   category               278 non-null    object 
 3   User                   278 non-null    object 
 4   username               278 non-null    object 
 5   title                  278 non-null    object 
 6   identifier             278 non-null    object 
 7   comment_count_viewed   278 non-null    int64  
 8   upvote_count_viewed    278 non-null    int64  
 9   view_count_viewed      278 non-null    int64  
 10  exit_count             278 non-null    int64  
 11  rating_count           278 non-null    int64  
 12  average_rating_viewed  278 non-null    int64  
 13  share_count            278 non-null    int64  
 14  upvoted                278 non-null    bool   
 15  bookma

In [158]:
def calculate_ctr(user_df, viewed_df, post_df):
    """
    This function calculates the Click Through Rate of posts
    """
    # Merge viewed_df with post_df to include post information
    merged_df = viewed_df.merge(post_df, on=['id', 'username', 'category', 'title'], suffixes=('_viewed', '_post'))

    # Replace zero values to avoid division errors
    merged_df['exit_count'] = np.maximum(merged_df['exit_count'], 1)
    merged_df['view_count_post'] = np.maximum(merged_df['view_count_post'], 1)

    # Calculate CTR for each post
    merged_df['CTR'] = (
        (merged_df['share_count'] + merged_df['comment_count_post'] + merged_df['upvote_count_post']) /
        (merged_df['view_count_post'] + merged_df['exit_count'])
    ) * 100

    # Aggregate CTR by post and user
    post_ctr = merged_df.groupby('title')['CTR'].mean().reset_index()
    user_ctr = merged_df.groupby('username')['CTR'].mean().reset_index()

    # Return aggregated results
    return post_ctr, user_ctr, merged_df[['id', 'username', 'share_count', 'upvote_count_post', 
                                          'comment_count_post', 'view_count_post', 'exit_count', 'CTR']]


In [159]:
post_ctr, user_ctr, detailed_ctr = calculate_ctr(user_df, viewed_df, post_df)
print("CTR by Post:\n", post_ctr)
print("\nCTR by User:\n", user_ctr)
print("\nDetailed CTR:\n", detailed_ctr.head())


CTR by Post:
                                                  title        CTR
0                                                        6.980493
1    #crypto #cryptotrading #memecoin #solmemecoins...   0.000000
2    #crypto #cryptotrading #memecoin #solmemecoins...   0.000000
3    #crypto #memecoins to buy today! SCF on my rad...   0.000000
4             #cryptomeme #solana #shitcoins #memecoin   0.000000
..                                                 ...        ...
229                          😱 JESUS 😱 GOD JESUS jesus  18.181818
230                                                 🙏🏽   0.537634
231                           🚀 Best memecoins on Tron   0.000000
232  🚀 MOO DENG Solana Meme Coin Hits $70M Market C...   0.202429
233                                               🚗 ⏩⏩   0.000000

[234 rows x 2 columns]

CTR by User:
            username        CTR
0         afrobeezy   8.630870
1              jack  80.666667
2             kinha   5.241231
3    michaeldadziie   0.704225


### 3.2) MAP

In [160]:
def map_util(user_df, post_df, algo, video_title, top_n=10):
    recommendations = {}
    for user in user_df['username']:
        # Get recommendations using hybrid recommender
        # recs = hybrid_recommender(user, video_title, post_df, algo, top_n=top_n)
        recs = collaborative_recommendations(user, post_df, algo)
        if not recs.empty:
            recommendations[user] = recs['id'].tolist()
    return recommendations


In [161]:
# Calculate MAP (Mean Average Precision)
def calculate_map(viewed_df, post_df, recommendations, k=10):
    """
    Calculate Mean Average Precision (MAP) for the recommendations.
    """
    map_scores = []

    for user, recommended_titles in recommendations.items():
        # Check if the user has any viewed posts in viewed_df
        viewed_posts = viewed_df[viewed_df['username'] == user]['id'].tolist()
        
        if not viewed_posts:  # Skip users without any interactions
            print(f"User '{user}' has no viewed posts. Skipping MAP calculation for this user.")
            continue
        
        # Only take the top-k recommendations
        top_k_recommendations = recommended_titles[:k]
        
        # Calculate Average Precision for this user
        relevant = 0
        total_precision = 0
        for rank, title in enumerate(top_k_recommendations, start=1):
            if title in viewed_posts:
                relevant += 1
                total_precision += relevant / rank
        
        # If no relevant posts, the precision is zero
        if relevant > 0:
            avg_precision = total_precision / min(k, len(viewed_posts))
        else:
            avg_precision = 0
        map_scores.append(avg_precision)

    # Calculate Mean Average Precision (MAP)
    if len(map_scores) > 0:
        return sum(map_scores) / len(map_scores)
    else:
        return 0.0


In [164]:
# Create item-user interaction matrix
user_item_matrix = create_item_user_matrix(interaction_data, post_df)

# Calculate item-item similarity matrix
item_similarity_df = calculate_item_similarity(user_item_matrix)

# Get recommendations for all users using item-based collaborative filtering
recommendations = {}
for username in user_df['username']:
    recommended_titles = item_based_recommendations(username, user_item_matrix, item_similarity_df, top_n=10)
    # Check if the user does not have interactions and handle accordingly
    if isinstance(recommended_titles, list):  # Only add if recommendations are valid
        recommendations[username] = recommended_titles


In [165]:
# Calculate MAP for the recommendations
map_score = calculate_map(viewed_df, post_df, recommendations, k=10)
print(f"Mean Average Precision (MAP): {map_score}")

Mean Average Precision (MAP): 0.49388594944150505
