# Imports

In [32]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sentence_transformers import SentenceTransformer, util
from sklearn.preprocessing import StandardScaler
import torch

# Load the data

In [34]:
# Read the video catalogue; the path is relative, so the CSV must sit in the
# notebook's working directory.
df = pd.read_csv("realistic_video_recommendations.csv")
# Preview the first rows to confirm the columns loaded as expected.
df.head()

Unnamed: 0,Video ID,Title,Chapter Number,Duration,Description,Keywords,Level,Recommended Age Group,Language,URL,Rating,Views,Upload Date,Related Videos
0,VID0000,Admit may price yard stand old imagine,1,55.98,Practice store performance still. Necessary fr...,"Mrs, sing, heavy, perform, appear",Intermediate,16-20,English,https://www.foster.net/,3.1,2144,2022-12-23,VID2552
1,VID0001,Animal run human,2,8.81,Institution politics almost bed. Rich learn mo...,"matter, across, he, notice",Intermediate,41-50,Mandarin,http://larsen.biz/,2.5,8732,2022-03-05,VID0291
2,VID0002,Discussion reveal return especially,10,20.99,Effort political charge high answer perform ho...,"huge, reality, story, above, develop, treat",Intermediate,21-30,French,https://martin.net/,2.5,3345,2023-11-26,VID1302
3,VID0003,List evening,2,47.73,Card defense why consider. Wish effort resourc...,"decade, its, now, reason, measure, upon",Beginner,16-20,French,http://www.wright-wright.net/,1.3,9569,2022-01-28,"VID3476, VID0917"
4,VID0004,Want involve he,6,50.23,Save live oil race suggest. Break middle light...,"between, support, concern, star, return, citizen",Intermediate,31-40,Spanish,https://duke-taylor.com/,4.2,1285,2021-04-13,"VID4039, VID3470"


# Preprocessing and Cleaning

In [20]:
# Check for missing values (count of nulls per column)
# NOTE(review): the saved output of this cell lists 'combined_text' and
# 'embedding', which are only created in later cells -- it was last run out of
# order. Re-run the notebook top to bottom before trusting the printed output.
print("Missing Values:\n", df.isnull().sum())

# Check basic statistics for numerical features
print("Dataset Statistics:\n", df.describe())

Missing Values:
 Video ID                 0
Title                    0
Chapter Number           0
Duration                 0
Description              0
Keywords                 0
Level                    0
Recommended Age Group    0
Language                 0
URL                      0
Rating                   0
Views                    0
Upload Date              0
Related Videos           0
combined_text            0
embedding                0
dtype: int64
Dataset Statistics:
        Chapter Number     Duration       Rating        Views
count     5000.000000  5000.000000  5000.000000  5000.000000
mean         5.510000    32.014210     2.999200  5215.845200
std          2.831449    15.898282     1.153377  2730.368891
min          1.000000     5.010000     1.000000   501.000000
25%          3.000000    18.135000     2.000000  2889.500000
50%          5.000000    31.905000     3.000000  5200.000000
75%          8.000000    45.955000     4.000000  7562.000000
max         10.000000    59.9

# Cleaning

In [23]:
# Import Libraries
import pandas as pd
import re
from nltk.corpus import stopwords
import nltk

# Download stopwords if not already downloaded
# (needed before stopwords.words() below can read the corpus)
nltk.download('stopwords')


# Load stopwords
# Kept as a set because clean_text() tests every token for membership in it.
stop_words = set(stopwords.words('english'))

# Define a cleaning function
def clean_text(text):
    """Lowercase ``text``, split it into word tokens and drop stopwords.

    Parameters
    ----------
    text : str or missing value
        Raw text. ``None``/NaN (how pandas represents an empty cell) is
        treated as empty text; any other non-string value is converted with
        ``str()`` before cleaning.

    Returns
    -------
    str
        The remaining tokens joined by single spaces; ``''`` when nothing is
        left or the input was missing.

    Notes
    -----
    Tokens are matched with ``\\w+``, so digits and underscores are kept;
    only punctuation and whitespace are removed. Stopwords come from the
    module-level ``stop_words`` set.
    """
    if not isinstance(text, str):
        # A missing cell would otherwise crash on .lower() below.
        if pd.isna(text):
            return ''
        text = str(text)
    # Lowercase before comparing tokens against stop_words
    text = text.lower()
    # Extract word tokens (letters, digits, underscore); punctuation is dropped
    words = re.findall(r'\b\w+\b', text)
    # Remove stopwords
    filtered_words = [word for word in words if word not in stop_words]
    # Join words back into a single string
    return ' '.join(filtered_words)

# Apply cleaning to Title and Description columns
# The cleaned text goes into new columns; the raw 'Title' and 'Description'
# columns are left untouched.
df['clean_title'] = df['Title'].apply(clean_text)
df['clean_desc'] = df['Description'].apply(clean_text)

[nltk_data] Downloading package stopwords to C:\Users\Noor
[nltk_data]     Saeed\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Combine relevant text features for similarity analysis

In [25]:
# Combine relevant text features for similarity analysis
# Now create 'combined_text' by including the additional relevant columns.
# Every text column is filled with '' first: concatenating a string with NaN
# yields NaN, so a single missing value would otherwise blank out the whole
# combined string (and later break the encoder). Filling before astype(str)
# also avoids injecting the literal token "nan" for a missing age group.
df['combined_text'] = (
    df['clean_title'].fillna('') + " " +
    df['clean_desc'].fillna('') + " " +
    df['Keywords'].fillna('') + " " +
    df['Level'].fillna('') + " " +
    df['Language'].fillna('') + " " +
    df['Recommended Age Group'].fillna('').astype(str) + " " +
    df['Chapter Number'].astype(str)
)

# Sentence Transformer and creating embeddings

In [26]:
# Initialize the sentence transformer model
model = SentenceTransformer('all-MiniLM-L6-v2')

# Create embeddings for the combined text column.
# encode() accepts a list of sentences and batches them internally, which is
# far faster than calling it once per row through .apply().
embedding_matrix = model.encode(df['combined_text'].tolist(), convert_to_tensor=True)

# Store one 1-D tensor per row so later cells can keep reading df['embedding'].
df['embedding'] = list(embedding_matrix)

# Recommendation System

In [27]:
# Define the sorted recommendation function
def get_sorted_recommendations(video_id, top_n=5):
    """Return the ``top_n`` videos most similar to ``video_id``, best first.

    Similarity is the cosine similarity between the tensors stored in the
    module-level ``df['embedding']`` column.

    Parameters
    ----------
    video_id : str
        Value of the 'Video ID' column identifying the query video. If the
        ID occurs more than once, the first occurrence is used.
    top_n : int, default 5
        Maximum number of recommendations to return; values <= 0 give an
        empty result.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the recommended rows (display columns only)
        with an added 1-based 'Similarity Rank' column.

    Raises
    ------
    IndexError
        If ``video_id`` does not occur in ``df['Video ID']``.
    """
    # Positional location of the query row
    matches = (df['Video ID'] == video_id).to_numpy().nonzero()[0]
    if len(matches) == 0:
        raise IndexError(f"Video ID {video_id!r} not found in df['Video ID']")
    query_pos = int(matches[0])

    # Compute all cosine similarities in one batched call instead of one call per row
    all_embeddings = torch.stack(list(df['embedding']))
    scores = util.cos_sim(all_embeddings[query_pos], all_embeddings)[0].tolist()

    # Sort positions from highest to lowest score; sorted() is stable, so ties
    # keep their original row order
    ranked_positions = sorted(range(len(scores)), key=scores.__getitem__, reverse=True)

    # Exclude the query by position rather than assuming it is ranked first:
    # with duplicate or tied embeddings another row can precede it, and slicing
    # off rank 0 would then drop a real match and recommend the query itself.
    recommended_positions = [pos for pos in ranked_positions if pos != query_pos][:max(top_n, 0)]

    # Create the final sorted recommendation dataframe. The .copy() makes the
    # result independent of df, so adding a column neither triggers
    # SettingWithCopyWarning nor risks touching df.
    display_columns = ['Video ID', 'Title', 'Chapter Number', 'Duration', 'Description', 'URL', 'Rating', 'Level', 'Language']
    recommendations = df.iloc[recommended_positions][display_columns].copy()
    recommendations['Similarity Rank'] = range(1, len(recommendations) + 1)

    return recommendations

# Example: Get sorted recommendations for a given video ID
# (top_n=4 limits the result to the four most similar videos)
sorted_recommendations = get_sorted_recommendations(video_id="VID0000", top_n=4)
sorted_recommendations

Unnamed: 0,Video ID,Title,Chapter Number,Duration,Description,URL,Rating,Level,Language,Similarity Rank
2560,VID2560,Seem simple that another old,4,50.51,Party time near manager by chance. Cover manag...,https://www.rodriguez.com/,1.0,Advanced,Spanish,1
4572,VID4572,Conference beat fund,5,30.57,Say stay expect foot fast member many. Yeah TV...,https://smith-mcneil.com/,4.8,Beginner,Spanish,2
3388,VID3388,Senior over decide what stop research,6,43.57,Picture its staff view. Pass expert grow. Year...,http://gill.org/,4.6,Intermediate,Mandarin,3
4012,VID4012,Develop discussion large ball chair,10,40.22,Guess picture you play perform. Next thank hel...,http://www.price-myers.info/,3.7,Beginner,English,4


In [28]:
# Example: Get sorted recommendations for a given video ID
# (same call as the previous example, with a different query video)
sorted_recommendations = get_sorted_recommendations(video_id="VID3176", top_n=4)
sorted_recommendations

Unnamed: 0,Video ID,Title,Chapter Number,Duration,Description,URL,Rating,Level,Language,Similarity Rank
1348,VID1348,About design sit newspaper concern,7,39.59,Play likely degree government miss take. Quali...,https://hall.com/,4.6,Advanced,English,1
872,VID0872,Career somebody three,7,17.57,Hit have same happen north business key. Conti...,https://jordan.com/,4.2,Beginner,French,2
1522,VID1522,Newspaper many hold take lawyer nothing rise,6,32.55,International music manage control. And ok gun...,https://www.mann.com/,3.1,Beginner,German,3
3576,VID3576,Perform federal natural throw sell design Cong...,7,28.31,Vote before bed. Anything plan yet brother any...,http://www.lin.com/,4.6,Beginner,German,4


In [29]:
# Testing with multiple video IDs
# Uses get_sorted_recommendations, the function this notebook defines; the
# previously referenced get_recommendations is not defined in the notebook and
# raised NameError on a fresh kernel.
for video_id in ["VID0000", "VID0001", "VID0002"]:
    print(f"Recommendations for Video ID {video_id}:")
    print(get_sorted_recommendations(video_id, top_n=5))
    print("\n")


Recommendations for Video ID VID0000:
     Video ID                                   Title  Chapter Number  \
2560  VID2560            Seem simple that another old               4   
4572  VID4572                    Conference beat fund               5   
3388  VID3388   Senior over decide what stop research               6   
4012  VID4012     Develop discussion large ball chair              10   
3171  VID3171  Person simple skill management century              10   

      Duration                                        Description  \
2560     50.51  Party time near manager by chance. Cover manag...   
4572     30.57  Say stay expect foot fast member many. Yeah TV...   
3388     43.57  Picture its staff view. Pass expert grow. Year...   
4012     40.22  Guess picture you play perform. Next thank hel...   
3171     12.24  Step term scientist our value. Prevent certain...   

                                   URL  Rating  
2560        https://www.rodriguez.com/     1.0  
4572      

# Recommendation Evaluations


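To evaluate this recommendation system, we can use ranking metrics that measure how well the videos recommended for a selected video (ranked by embedding similarity) match the videos that are actually relevant to it. Mean Reciprocal Rank (MRR), Precision at k (P@k), and Normalized Discounted Cumulative Gain (nDCG) are widely used for such tasks, as they quantify how well the top recommendations align with user interest. All three require a ground-truth notion of which videos are relevant to each query; the embedding similarities only determine the ranking that is being evaluated.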

In [31]:
import torch
import numpy as np
from sentence_transformers import util

# Stack all embeddings into a tensor for efficient similarity computation
# (one row per DataFrame row, in df's row order)
embeddings = torch.stack(df['embedding'].values.tolist())
# Video IDs in the same row order, so position i in video_ids matches row i of embeddings
video_ids = df['Video ID'].values

# Define evaluation metrics optimized for batch processing
def mean_reciprocal_rank_batch(similarity_matrix, video_ids, relevant_ids=None):
    """Mean reciprocal rank (MRR) over every row of a similarity matrix.

    Parameters
    ----------
    similarity_matrix : torch.Tensor
        Square tensor; row ``i`` holds the similarity of item ``i`` to every
        item, in the same order as ``video_ids``.
    video_ids : sequence
        Identifier of each item, aligned with the matrix rows/columns.
    relevant_ids : sequence of collections, optional
        ``relevant_ids[i]`` is the collection of video IDs (not a delimited
        string) that count as relevant for query ``i``. When given, the query
        itself is removed from its ranking and the reciprocal rank of the
        first relevant item is used (0 if none is found). When omitted, the
        only relevant item is the query itself, which makes the metric a
        self-retrieval sanity check: it is 1.0 whenever every item is most
        similar to itself.

    Returns
    -------
    numpy.float64
        Mean of the per-query reciprocal ranks.
    """
    reciprocal_ranks = []
    for i in range(len(video_ids)):
        # Item positions sorted from most to least similar to item i
        order = torch.argsort(similarity_matrix[i], descending=True).tolist()

        if relevant_ids is None:
            # Rank of the item within its own ranking (always >= 1)
            reciprocal_ranks.append(1 / (order.index(i) + 1))
            continue

        wanted = set(relevant_ids[i])
        candidates = [j for j in order if j != i]
        first_hit = next(
            (rank for rank, j in enumerate(candidates, start=1) if video_ids[j] in wanted),
            None,
        )
        reciprocal_ranks.append(1 / first_hit if first_hit is not None else 0)
    return np.mean(reciprocal_ranks)

def precision_at_k_batch(similarity_matrix, video_ids, k=5, relevant_ids=None):
    """Mean precision@k over every row of a similarity matrix.

    Parameters
    ----------
    similarity_matrix : torch.Tensor
        Square tensor; row ``i`` holds the similarity of item ``i`` to every
        item, in the same order as ``video_ids``.
    video_ids : sequence
        Identifier of each item, aligned with the matrix rows/columns.
    k : int, default 5
        Number of top-ranked items inspected per query; also the divisor.
    relevant_ids : sequence of collections, optional
        ``relevant_ids[i]`` is the collection of video IDs (not a delimited
        string) that count as relevant for query ``i``. When given, the query
        is removed from its own ranking and the fraction of relevant IDs
        among the top ``k`` remaining items is measured. When omitted, the
        original behaviour is kept: the first-ranked item is skipped and the
        query's own ID is searched for among the next ``k``, which is 0
        unless IDs are duplicated or the query is not ranked first.

    Returns
    -------
    numpy.float64
        Mean of the per-query precision values.
    """
    precisions = []
    for i in range(len(video_ids)):
        # Plain Python ints index video_ids regardless of the tensor's device
        order = torch.argsort(similarity_matrix[i], descending=True).tolist()

        if relevant_ids is None:
            top_k = order[1:k + 1]
            relevant_count = sum(1 for j in top_k if video_ids[j] == video_ids[i])
        else:
            wanted = set(relevant_ids[i])
            top_k = [j for j in order if j != i][:k]
            relevant_count = sum(1 for j in top_k if video_ids[j] in wanted)
        precisions.append(relevant_count / k)
    return np.mean(precisions)

def ndcg_at_k_batch(similarity_matrix, video_ids, k=5, relevant_ids=None):
    """Mean nDCG@k (binary relevance) over every row of a similarity matrix.

    Parameters
    ----------
    similarity_matrix : torch.Tensor
        Square tensor; row ``i`` holds the similarity of item ``i`` to every
        item, in the same order as ``video_ids``.
    video_ids : sequence
        Identifier of each item, aligned with the matrix rows/columns.
    k : int, default 5
        Number of top-ranked items inspected per query.
    relevant_ids : sequence of collections, optional
        ``relevant_ids[i]`` is the collection of video IDs (not a delimited
        string) that count as relevant for query ``i``. When given, the query
        is removed from its own ranking and the ideal DCG places every
        retrievable relevant item (at most ``k``) at the top; a query with no
        retrievable relevant item scores 0. When omitted, the original
        behaviour is kept: the first-ranked item is skipped, the query's own
        ID is the only relevant one and the ideal DCG is 1.

    Returns
    -------
    numpy.float64
        Mean of the per-query nDCG values.
    """
    ndcgs = []
    for i in range(len(video_ids)):
        order = torch.argsort(similarity_matrix[i], descending=True).tolist()

        if relevant_ids is None:
            top_k = order[1:k + 1]
            gains = [float(video_ids[j] == video_ids[i]) for j in top_k]
            idcg = 1  # Ideal DCG where the relevant item is in the top rank
        else:
            wanted = set(relevant_ids[i])
            candidates = [j for j in order if j != i]
            top_k = candidates[:k]
            gains = [float(video_ids[j] in wanted) for j in top_k]
            # Ideal ranking: every retrievable relevant item first, capped at k
            n_ideal = min(k, sum(1 for j in candidates if video_ids[j] in wanted))
            idcg = sum(1 / np.log2(rank + 2) for rank in range(n_ideal))

        # Iterate over the items actually retrieved: there can be fewer than k
        # when the catalogue is small, and indexing range(k) would overflow.
        dcg = sum(gain / np.log2(rank + 2) for rank, gain in enumerate(gains))
        ndcgs.append(dcg / idcg if idcg > 0 else 0.0)
    return np.mean(ndcgs)

# Calculate cosine similarity matrix in a batch
# (row i holds the similarity of video i to every video)
similarity_matrix = util.cos_sim(embeddings, embeddings)

# Evaluate recommendation system with batch calculations
# NOTE(review): no ground-truth relevance is supplied to these calls, so the
# values are fixed by construction rather than measuring recommendation
# quality: MRR looks up each video's rank in its own similarity row, while
# precision and nDCG skip the first-ranked item and then search for the query's
# own ID among the rest. Evaluate against real relevance labels (e.g. the
# 'Related Videos' column) before drawing conclusions.
evaluation_results = {
    'Mean Reciprocal Rank (MRR)': mean_reciprocal_rank_batch(similarity_matrix, video_ids),
    'Precision at K': precision_at_k_batch(similarity_matrix, video_ids, k=5),
    'nDCG at K': ndcg_at_k_batch(similarity_matrix, video_ids, k=5)
}

print("Optimized Evaluation Results:", evaluation_results)


Optimized Evaluation Results: {'Mean Reciprocal Rank (MRR)': 1.0, 'Precision at K': 0.0, 'nDCG at K': 0.0}
