In [24]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
from scipy.sparse import csr_matrix
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics.pairwise import cosine_distances
from sklearn.preprocessing import LabelEncoder
from collections import OrderedDict
import os
import time

from tqdm import tqdm

In [67]:
movies_df= pd.read_table('movies.csv', sep=',')
movies_df['movieId'] = movies_df['movieId'].astype('uint32')
users_df=pd.read_csv('users_df.csv', index_col=0)
user_movie_matrix = pd.read_csv('user_movie_matrix.csv', index_col='userId')
correlation_df=pd.read_csv('correlation_matrix.csv', index_col='userId')
correlation_df.columns = correlation_df.columns.astype(int)

#Kolaborativno filtriranje

In [71]:
def get_most_similar_users(user_id, n, correlation_df):
    user_correlations = correlation_df[user_id].drop(user_id)
    sorted_users = user_correlations.sort_values(ascending=False)
    top_n_similar_users = sorted_users.head(n).index.tolist()
    return top_n_similar_users

def get_movie_recommendations(user_id, m, similar_users, user_movie_matrix, min_correlation=0.05):
    user_index = user_movie_matrix.index.get_loc(user_id)

    # Sum the correlations (weights) and weighted scores of similar users
    weights_sum = 0
    weighted_scores_sum = np.zeros_like(user_movie_matrix.loc[user_id])

    for similar_user in similar_users:
        similar_user_index = user_movie_matrix.index.get_loc(similar_user)
        correlation = correlation_df.at[user_id, similar_user]

        if np.isnan(correlation) or correlation < min_correlation:
            continue
            
        weights_sum += correlation
        weighted_scores_sum += user_movie_matrix.loc[similar_user] * correlation
        
    # Normalize the weighted scores by dividing by the sum of correlations
    if weights_sum != 0:
        normalized_scores = weighted_scores_sum / weights_sum
    else:
        normalized_scores = np.zeros_like(user_movie_matrix.loc[user_id])

    # Remove the movies that the user has already rated
    user_rated_movies = user_movie_matrix.loc[user_id]
    normalized_scores[user_rated_movies != 0] = 0
    
    # Get the top m movie recommendations
    # Get a pandas Series with the movie indices as the index
    normalized_scores_series = pd.Series(normalized_scores, index=user_movie_matrix.columns)
    # Select the top m movie recommendations by using the nlargest function on the pandas Series
    recommended_movie_ids = normalized_scores_series.nlargest(m).index
    return recommended_movie_ids.astype(int)

#Cold start

In [77]:
country_to_region = {
    'Belarus': 'Eastern Europe',
    'Bulgaria': 'Eastern Europe',
    'Czech Republic': 'Eastern Europe',
    'Hungary': 'Eastern Europe',
    'Poland': 'Eastern Europe',
    'Moldova': 'Eastern Europe',
    'Romania': 'Eastern Europe',
    'Russia': 'Eastern Europe',
    'Slovakia': 'Eastern Europe',
    'Ukraine': 'Eastern Europe',
    'Åland Islands': 'Northern Europe',
    'Denmark': 'Northern Europe',
    'Estonia': 'Northern Europe',
    'Faroe Islands': 'Northern Europe',
    'Finland': 'Northern Europe',
    'Iceland': 'Northern Europe',
    'Ireland': 'Northern Europe',
    'Isle of Man': 'Northern Europe',
    'Latvia': 'Northern Europe',
    'Lithuania': 'Northern Europe',
    'Norway': 'Northern Europe',
    'Svalbard and Jan Mayen Islands': 'Northern Europe',
    'Sweden': 'Northern Europe',
    'United Kingdom': 'Northern Europe',
    'Albania': 'Southern Europe',
    'Andorra': 'Southern Europe',
    'Bosnia and Herzegovina': 'Southern Europe',
    'Croatia': 'Southern Europe',
    'Gibraltar': 'Southern Europe',
    'Greece': 'Southern Europe',
    'Holy See': 'Southern Europe',
    'Italy': 'Southern Europe',
    'Malta': 'Southern Europe',
    'Montenegro': 'Southern Europe',
    'North Macedonia': 'Southern Europe',
    'Portugal': 'Southern Europe',
    'San Marino': 'Southern Europe',
    'Serbia': 'Southern Europe',
    'Slovenia': 'Southern Europe',
    'Spain': 'Southern Europe',
    'Austria': 'Western Europe',
    'Belgium': 'Western Europe',
    'France': 'Western Europe',
    'Germany': 'Western Europe',
    'Liechtenstein': 'Western Europe',
    'Luxembourg': 'Western Europe',
    'Monaco': 'Western Europe',
    'Netherlands': 'Western Europe',
    'Switzerland': 'Western Europe',
    'Channel Islands': 'Northern Europe'
}

age_ranges = {
    'Teenagers (13-19)': (13, 19),
    'Young Adults (20-25)': (20, 25),
    'Adults (26-35)': (26, 35),
    'Middle-aged Adults (36-45)': (36, 45),
    'Senior Adults (46-65)': (46, 65)
}

min_age = 13
max_age = 65

def normalized_difference(a, b, min_age, max_age):
    return np.abs((a - min_age) - (b - min_age)) / (max_age - min_age)

def find_similar_users(users_df, new_user, n):
    # Get new user details
    new_sex = new_user['sex'].item()
    new_age_group = new_user['age_group'].item()
    new_age = new_user['age'].item()
    new_country = new_user['country'].item()
    new_region = new_user['region'].item()

    # Compute age distance range
    min_age = users_df['age'].min()
    max_age = users_df['age'].max()

    # Calculate age group similarity
    age_distance = normalized_difference(users_df['age'], new_age, min_age, max_age)
    age_similarity = 0.45 * (1 - age_distance)

    # Calculate overall similarity
    similarity_scores = (
        (users_df['sex'] == new_sex).astype(float) * 0.35 +
        (users_df['age_group'] == new_age_group).astype(float) * age_similarity +
        (users_df['country'] == new_country).astype(float) * 0.15 +
        (users_df['region'] == new_region).astype(float) * 0.05
    )

    # Sort users by similarity scores and get top n
    top_indices = np.argsort(similarity_scores)[::-1][:n]
    top_similarities = similarity_scores[top_indices]

    # Get top n similar users
    similar_users = users_df.loc[top_indices]

    return similar_users, top_similarities

def get_new_user(new_index):
    age_distribution = users_df['age'].value_counts(normalize=True)
    sex_distribution = users_df['sex'].value_counts(normalize=True)
    region_distribution = users_df['region'].value_counts(normalize=True)
    country_distribution = users_df['country'].value_counts(normalize=True)
    
    new_age = np.random.choice(age_distribution.index, p=age_distribution.values) 
    new_sex = np.random.choice(sex_distribution.index, p=sex_distribution.values)
    new_country = np.random.choice(country_distribution.index, p=country_distribution.values)

    new_region = country_to_region[new_country]
    new_age_group = pd.cut([new_age], bins=[12, 19, 25, 35, 45, 65], labels=list(age_ranges))

    
    new_user = pd.DataFrame({
        'userId': [new_index],
        'sex': [new_sex],
        'age': [new_age],
        'country': [new_country],
        'region': [new_region]
    })
    
    new_user['age_group_pom'] = pd.cut(new_user['age'], bins=[13, 19, 25, 35, 45, 65], labels=list(age_ranges))
    new_user.insert(new_user.columns.get_loc('age') + 1, 'age_group', new_user['age_group_pom'])
  
    return new_user.drop('age_group_pom', axis=1)    

def get_movie_recommendations_cold_start(m, similar_users, similarities, user_movie_matrix, min_correlation=0.35):

    weights_sum = 0
    weighted_scores_sum = np.zeros_like(user_movie_matrix.loc[similar_users[0]])

    for similar_user, similarity in zip(similar_users, similarities):
        similar_user_index = user_movie_matrix.index.get_loc(similar_user)

        if np.isnan(similarity) or similarity < min_correlation:
            continue
            
        weights_sum += similarity
        weighted_scores_sum += user_movie_matrix.loc[similar_user] * similarity
        
    # Normalize the weighted scores by dividing by the sum of correlations
    if weights_sum != 0:
        normalized_scores = weighted_scores_sum / weights_sum
    else:
        normalized_scores = np.zeros_like(user_movie_matrix.loc[similar_user[0]])

    
    # Get the top m movie recommendations
    # Get a pandas Series with the movie indices as the index
    normalized_scores_series = pd.Series(normalized_scores, index=user_movie_matrix.columns)
    # Select the top m movie recommendations by using the nlargest function on the pandas Series
    recommended_movie_ids = normalized_scores_series.nlargest(m).index
    return recommended_movie_ids.astype(int)

#Recomendation model

In [99]:
def get_recommendations(user_id, num_similar_users, num_recommednations):
  if user_id in correlation_df.columns:
      similar_users = get_most_similar_users(user_id, num_similar_users, correlation_df)
      recommended_movie_ids = get_movie_recommendations(user_id, num_recommednations, similar_users, user_movie_matrix, min_correlation=0.05)
      recommended_movie_titles = movies_df[movies_df['movieId'].isin(recommended_movie_ids)]['title']

      return similar_users, list(recommended_movie_ids), list(recommended_movie_titles)	

  new_user=get_new_user(user_id)
  similar_users, top_similarities=find_similar_users(users_df, new_user, num_similar_users)
  recommended_movie_ids = get_movie_recommendations_cold_start(num_recommednations, similar_users['userId'].tolist(), top_similarities, user_movie_matrix, min_correlation=0.5)
  recommended_movie_titles = movies_df[movies_df['movieId'].isin(recommended_movie_ids)]['title']

  return list(similar_users['userId']), list(recommended_movie_ids), list(recommended_movie_titles)	

In [104]:
user_id=700
num_similar_users=5
num_recommednations=10

similar_users, recommended_movie_ids, recommended_movie_titles=get_recommendations(user_id, num_similar_users, num_recommednations)

print("Similar users to user", user_id, "are:", similar_users)
print("Recommended movies are:\n", '\n'.join(str(item) for item in recommended_movie_titles))

Similar users to user 700 are: [75, 335, 54, 342, 280]
Recommended movies are:
 Clerks (1994)
Star Wars: Episode IV - A New Hope (1977)
Shawshank Redemption, The (1994)
Terminator 2: Judgment Day (1991)
Star Wars: Episode V - The Empire Strikes Back (1980)
Raiders of the Lost Ark (Indiana Jones and the Raiders of the Lost Ark) (1981)
Jungle Book, The (1967)
For a Few Dollars More (Per qualche dollaro in più) (1965)
Lord of the Rings: The Fellowship of the Ring, The (2001)
I Am Sam (2001)
