In [3]:
import sys
import os
import django
from pathlib import Path

# Manually set the root directory and the settings module
root_dir = "C:/Users/USER/Documents/recommender/src"
settings_module = "cfehome.settings"

os.environ.setdefault("DJANGO_SETTINGS_MODULE", settings_module)
os.environ["DJANGO_ALLOW_ASYNC_UNSAFE"] = "true"

# Add the root directory to sys.path
if root_dir not in sys.path:
    sys.path.insert(0, root_dir)
    print(f"'{root_dir}' successfully added to sys.path")

# Setup Django
try:
    django.setup()
    print("Django setup successfully")
except Exception as e:
    print(f"Error setting up Django: {e}")

# Now, access the settings and perform your operations
from django.conf import settings
import pandas as pd

# Ensure that the DATA_DIR setting is available
try:
    ratings_path = settings.DATA_DIR / "ratings_small.csv"
    if ratings_path.exists():
        print(f"Found ratings file at: {ratings_path}")
    else:
        print(f"Ratings file not found at: {ratings_path}")
except AttributeError as e:
    print(f"Error accessing DATA_DIR: {e}")
except Exception as e:
    print(f"An unexpected error occurred: {e}")



'C:/Users/USER/Documents/recommender/src' successfully added to sys.path
Django setup successfully
Found ratings file at: C:\Users\USER\Documents\recommender\src\data\ratings_small.csv


In [25]:
import pandas as pd
from django.conf import settings
from movies.models import Movie
from ratings.models import Rating


LINKS_SMALL_CSV = settings.DATA_DIR / 'links_small.csv'
LINKS_SMALL_CSV.exists()

True

In [6]:
qs = Rating.objects.all()
missing_movie_ids = []
for instance in qs:
    if instance.content_object is None:
        missing_movie_ids.append(instance.object_id)

_total = len(missing_movie_ids)
total_missing = list(set(missing_movie_ids))
print(len(total_missing), total)

NameError: name 'total' is not defined

In [8]:
print(len(total_missing), _total, qs.count())

6392 57175 100004


In [10]:
links_df = pd.read_csv(LINKS_SMALL_CSV)
links_df.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [12]:
ms_df = links_df.copy()[links_df.movieId.isin(total_missing)]
ms_df.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
3,4,114885,31357.0
6,7,114319,11860.0
7,8,112302,45325.0
8,9,114576,9091.0


In [14]:
ms_df.shape[0] == len(total_missing)

True

In [17]:
def enrich_imdb_col(val):
    val = str(val)
    if len(val) == 7:
       val = f"tt{val}"
       return val
    if len(val) == 6:
        val = f"tt0{val}"
        return val
    if len(val) == 5:
        val = f"tt00{val}"
        return val
    return val

In [18]:
ms_df['tt'] = ms_df['imdbId'].apply(enrich_imdb_col)
#ms_df.to_csv('')
ms_df.head()


Unnamed: 0,movieId,imdbId,tmdbId,tt
0,1,114709,862.0,tt0114709
3,4,114885,31357.0,tt0114885
6,7,114319,11860.0,tt0114319
7,8,112302,45325.0,tt0112302
8,9,114576,9091.0,tt0114576


In [19]:
MOVIES_CSV = settings.DATA_DIR / 'movies_metadata.csv'
MOVIES_CSV.exists()

True

In [21]:
movies_cols = ['title', 'overview', 'release_date', 'imdb_id']
movies_df = pd.read_csv(MOVIES_CSV, usecols=movies_cols)
movies_df.head()

Unnamed: 0,imdb_id,overview,release_date,title
0,tt0114709,"Led by Woody, Andy's toys live happily in his ...",1995-10-30,Toy Story
1,tt0113497,When siblings Judy and Peter discover an encha...,1995-12-15,Jumanji
2,tt0113228,A family wedding reignites the ancient feud be...,1995-12-22,Grumpier Old Men
3,tt0114885,"Cheated on, mistreated and stepped on, the wom...",1995-12-22,Waiting to Exhale
4,tt0113041,Just when George Banks has recovered from his ...,1995-02-10,Father of the Bride Part II


In [22]:
missing_movies_df = ms_df.merge(movies_df, left_on='tt', right_on='imdb_id')
missing_movies_df.head()

Unnamed: 0,movieId,imdbId,tmdbId,tt,imdb_id,overview,release_date,title
0,1,114709,862.0,tt0114709,tt0114709,"Led by Woody, Andy's toys live happily in his ...",1995-10-30,Toy Story
1,4,114885,31357.0,tt0114885,tt0114885,"Cheated on, mistreated and stepped on, the wom...",1995-12-22,Waiting to Exhale
2,7,114319,11860.0,tt0114319,tt0114319,An ugly duckling having undergone a remarkable...,1995-12-15,Sabrina
3,8,112302,45325.0,tt0112302,tt0112302,"A mischievous young boy, Tom Sawyer, witnesses...",1995-12-22,Tom and Huck
4,9,114576,9091.0,tt0114576,tt0114576,International action superstar Jean Claude Van...,1995-12-22,Sudden Death


In [23]:
missing_movies_df['id'] = missing_movies_df['movieId']
missing_movies_df['id_alt'] = missing_movies_df['tmdbId'].apply(lambda x: str(int(x)))
missing_movies_df.head()

Unnamed: 0,movieId,imdbId,tmdbId,tt,imdb_id,overview,release_date,title,id,id_alt
0,1,114709,862.0,tt0114709,tt0114709,"Led by Woody, Andy's toys live happily in his ...",1995-10-30,Toy Story,1,862
1,4,114885,31357.0,tt0114885,tt0114885,"Cheated on, mistreated and stepped on, the wom...",1995-12-22,Waiting to Exhale,4,31357
2,7,114319,11860.0,tt0114319,tt0114319,An ugly duckling having undergone a remarkable...,1995-12-15,Sabrina,7,11860
3,8,112302,45325.0,tt0112302,tt0112302,"A mischievous young boy, Tom Sawyer, witnesses...",1995-12-22,Tom and Huck,8,45325
4,9,114576,9091.0,tt0114576,tt0114576,International action superstar Jean Claude Van...,1995-12-22,Sudden Death,9,9091


In [56]:
final_df = missing_movies_df.copy()[['id', 'id_alt', 'title']]
final_df['id_alt'] = final_df['id_alt'].astype(str)
final_df.head()

Unnamed: 0,id,id_alt,title
0,1,862,Toy Story
1,4,31357,Waiting to Exhale
2,7,11860,Sabrina
3,8,45325,Tom and Huck
4,9,9091,Sudden Death


In [57]:
alt_id_list = final_df['id_alt'].to_list()

In [58]:
#movies_qs = Movie.objects.filter(id_in=alt_id_list)
movies_qs.count()

6034

In [59]:
from django.forms.models import model_to_dict

In [78]:
from django.db import transaction

# Define batch size
batch_size = 100

# Prepare lists for bulk operations
objects_to_create = []
objects_to_delete = []
existing_titles = set(Movie.objects.values_list('title', flat=True))

movies_qs = Movie.objects.filter(id__in=alt_id_list)
for obj in movies_qs:
    data = final_df.copy()[final_df['id_alt'] == str(obj.id)]
    if data.shape[0] == 1:
        og_model_data = model_to_dict(obj)
        update_data = data.to_dict('records')[0]
        if obj.title == update_data.get('title'):
            og_model_data['id'] = update_data['id']
            new_model_data = {**og_model_data}

            # Check if title is already in the database or in the create list
            if new_model_data['title'] in existing_titles:
                print(f"Skipping duplicate title: {new_model_data['title']}")
                continue

            # Add title to existing_titles set
            existing_titles.add(new_model_data['title'])

            # Add to lists for bulk operations
            objects_to_create.append(Movie(**new_model_data))
            objects_to_delete.append(obj)

            # Perform batch operations
            if len(objects_to_create) >= batch_size:
                with transaction.atomic():
                    Movie.objects.bulk_create(objects_to_create)
                    for obj in objects_to_delete:
                        obj.delete()
                objects_to_create = []
                objects_to_delete = []

# Perform remaining operations outside the loop
if objects_to_create:
    with transaction.atomic():
        Movie.objects.bulk_create(objects_to_create)
        for obj in objects_to_delete:
            obj.delete()


Skipping duplicate title: American Beauty
Skipping duplicate title: Citizen Kane
Skipping duplicate title: Dancer in the Dark
Skipping duplicate title: The Fifth Element
Skipping duplicate title: My Life Without Me
Skipping duplicate title: The Endless Summer
Skipping duplicate title: Pirates of the Caribbean: The Curse of the Black Pearl
Skipping duplicate title: Kill Bill: Vol. 1
Skipping duplicate title: Jarhead
Skipping duplicate title: Walk on Water
Skipping duplicate title: Apocalypse Now
Skipping duplicate title: Eternal Sunshine of the Spotless Mind
Skipping duplicate title: A History of Violence
Skipping duplicate title: Twelve Monkeys
Skipping duplicate title: Talk to Her
Skipping duplicate title: 8 Mile
Skipping duplicate title: Absolute Power
Skipping duplicate title: Paradise Now
Skipping duplicate title: Brazil
Skipping duplicate title: Billy Elliot
Skipping duplicate title: American History X
Skipping duplicate title: War of the Worlds
Skipping duplicate title: Before Su

In [81]:
from celery import shared_task
from django.db import transaction
from django.utils import timezone
from movies.models import Movie

@shared_task
def task_update_movie_ratings_batch(movie_ids):
    for movie_id in movie_ids:
        qs = Movie.objects.filter(id=movie_id)
        if qs.exists():
            movie = qs.first()
            rating_avg = movie.rating_set.aggregate(Avg('rating'))['rating__avg']
            rating_count = movie.rating_set.count()
            score = decimal.Decimal(rating_avg * rating_count * 1.0)
            qs.update(
                rating_avg=rating_avg,
                rating_count=rating_count,
                score=score,
                rating_last_updated=timezone.now()
            )


In [90]:
import os

# Change the directory to the parent directory
os.chdir(r'C:\Users\USER\Documents\recommender\src\ratings\management\commands\commands')

# Define the command to be executed
command = "python manage.py update_movie_ratings"


In [91]:
import os
import time

# Record the start time
start_time = time.time()

# Change the directory to the parent directory
os.chdir(r'C:\Users\USER\Documents\recommender\src\ratings\management\commands\commands')

# Define the command to be executed
command = "python manage.py update_movie_ratings"

# Execute the command
os.system(command)

# Calculate the time taken
total_time = time.time() - start_time

# Print the time taken
print(f"Total time taken for ratings update: {total_time} seconds")


Total time taken for ratings update: 2.713663101196289 seconds
