In [None]:
import re
import pandas as pd
import numpy as np


### Date

In [None]:
def normalise_date(date):
    """Rewrite numeric dates found in ``date`` to a dash-separated, year-first form.

    Two regex substitutions are applied one after the other:

    1. ``NN-NN-NNNN`` / ``NN/NN/NNNN`` is reordered so the four-digit group
       comes first and the leading two-digit group comes last.
    2. ``NNNN/NN/NN`` (or the dash form) has its separators set to ``-``.

    Args:
        date: A string containing a date, or a missing value.

    Returns:
        The rewritten string. Values that ``pd.isna`` reports as missing are
        returned unchanged.
    """
    if not pd.isna(date):
        rewrites = (
            (r'(\d{2})[-/](\d{2})[-/](\d{4})', r'\3-\2-\1'),
            (r'(\d{4})[-/](\d{2})[-/](\d{2})', r'\1-\2-\3'),
        )
        # Order matters: the year-last form is flipped before separators are unified.
        for pattern, replacement in rewrites:
            date = re.sub(pattern, replacement, date)
    return date

### Actor And Genre Transformation

In [None]:
import pickle

# Load the (actor, genre) -> score lookup from the pickle file.
# The path is relative, so it is resolved against the current working directory.
# NOTE(review): pickle.load can execute arbitrary code while unpickling — only
# load this file if it comes from a trusted source.
with open('mapGenreAndActorToScore.pkl', 'rb') as f:
    mapGenreAndActorToScore = pickle.load(f)



In [None]:
# Example: get the score for actor "Tom Cruise" in genre "Comedy".
# The lookup is keyed by (actor, genre) tuples.
actor = "Tom Cruise"
genre = "Comedy"
if (actor, genre) in mapGenreAndActorToScore:
    score = mapGenreAndActorToScore[(actor, genre)]
    print(f"The score for {actor} in {genre} is {score}")
else:
    print(f"No score found for {actor} in {genre}")


The score for Tom Cruise in Comedy is 6.5


In [None]:
# Fallback score: used for any name missing from a score lookup, and returned
# as-is when there is nothing to average.
RATING_VALUE_MEAN=6.5

In [None]:
def compute_actor_genre_score(row):
    """Average the lookup score of every (actor, genre) pairing in ``row``.

    Args:
        row: Object indexable by ``'actors'`` and ``'genre'``, each yielding an
            iterable of names.

    Returns:
        The mean of ``mapGenreAndActorToScore`` values over all pairings, with
        ``RATING_VALUE_MEAN`` standing in for pairs that have no entry.
        ``RATING_VALUE_MEAN`` itself when there are no pairings at all.
    """
    pair_scores = []
    for performer in row['actors']:
        for category in row['genre']:
            pair_scores.append(
                mapGenreAndActorToScore.get((performer, category), RATING_VALUE_MEAN)
            )
    if not pair_scores:
        return RATING_VALUE_MEAN
    return sum(pair_scores) / len(pair_scores)

### Director Transformation

In [None]:
import pickle

# Load the director -> score lookup from the pickle file.
# The path is relative, so it is resolved against the current working directory.
# NOTE(review): pickle.load can execute arbitrary code while unpickling — only
# load this file if it comes from a trusted source.
with open('mapDirectorToScore.pkl', 'rb') as f:
    mapDirectorToScore = pickle.load(f)



In [None]:
def compute_director_score(row):
    """Return the mean director score for ``row["directors"]``.

    Each director is looked up in ``mapDirectorToScore``; directors without an
    entry count as ``RATING_VALUE_MEAN``. With no directors at all the result
    is ``RATING_VALUE_MEAN``.
    """
    per_director = [
        mapDirectorToScore.get(name, RATING_VALUE_MEAN) for name in row["directors"]
    ]
    return sum(per_director) / len(per_director) if per_director else RATING_VALUE_MEAN

### Creator Transformation

In [None]:
import pickle

# Load the creator -> score lookup from the pickle file.
# The path is relative, so it is resolved against the current working directory.
# NOTE(review): pickle.load can execute arbitrary code while unpickling — only
# load this file if it comes from a trusted source.
with open('mapCreatorToScore.pkl', 'rb') as f:
    mapCreatorToScore = pickle.load(f)



In [None]:
def compute_creator_score(row):
    """Return the mean creator score for ``row["creators"]``.

    Each creator is looked up in ``mapCreatorToScore``; creators without an
    entry count as ``RATING_VALUE_MEAN``. With no creators at all the result
    is ``RATING_VALUE_MEAN``.
    """
    collected = []
    for person in row["creators"]:
        collected.append(mapCreatorToScore.get(person, RATING_VALUE_MEAN))
    count = len(collected)
    if count == 0:
        return RATING_VALUE_MEAN
    return sum(collected) / count

### Keywords Transformation

In [None]:
import pickle

# Load the keyword -> score lookup from the pickle file.
# The path is relative, so it is resolved against the current working directory.
# NOTE(review): pickle.load can execute arbitrary code while unpickling — only
# load this file if it comes from a trusted source.
with open('mapkeywordToScore.pkl', 'rb') as f:
    mapkeywordToScore = pickle.load(f)



In [None]:
def compute_keyword_score(row):
    """Return the mean keyword score for ``row["keywords"]``.

    Each keyword is looked up in ``mapkeywordToScore``; keywords without an
    entry count as ``RATING_VALUE_MEAN``. With no keywords at all the result
    is ``RATING_VALUE_MEAN``.
    """
    keyword_scores = list(
        map(lambda term: mapkeywordToScore.get(term, RATING_VALUE_MEAN), row["keywords"])
    )
    if not keyword_scores:
        return RATING_VALUE_MEAN
    return sum(keyword_scores) / len(keyword_scores)

### todayAgoPublished

In [None]:
def todayAgoPublished(date, reference_date="2025-01-01"):
    """Return the number of whole days from ``date`` to ``reference_date``.

    Args:
        date: A ``pd.Timestamp`` or anything ``pd.to_datetime`` can parse
            (for example an ISO ``"YYYY-MM-DD"`` string).
        reference_date: The fixed "today" the age is measured against.
            Defaults to 1 January 2025, the date the original code hard-coded.

    Returns:
        ``(reference_date - date).days``. The value is negative when ``date``
        falls after the reference date.
    """
    # Coerce first so string dates work too; a Timestamp passes through unchanged.
    published = pd.to_datetime(date)
    # ISO format avoids any day-first / month-first ambiguity in the literal.
    return (pd.to_datetime(reference_date) - published).days

In [None]:
# prompt: call the todayAgoPublished with an example

import pandas as pd

# Example usage of todayAgoPublished.
# '01-10-2025' is parsed month-first here (10 January 2025), which is after the
# 1 January 2025 reference date, so the result shown below is negative (-9).
example_date = pd.to_datetime('01-10-2025')
days_ago = todayAgoPublished(example_date)
days_ago


-9

### Minutes

In [None]:
def convert_to_minutes(duration):
    """Convert a duration string such as ``"PT1H38M"`` to a minute count.

    The hour count is read from ``PT<digits>H`` and the minute count from
    ``<digits>M``; whichever part is absent counts as zero.

    Args:
        duration: Duration string, or a missing value.

    Returns:
        ``hours * 60 + minutes`` as an int. ``None`` when ``duration`` is
        missing (per ``pd.isna``) or the literal string ``'nan'``, and also
        when anything raises during conversion (a message is printed then).
    """
    try:
        if pd.isna(duration) or duration == 'nan':
            return None

        parts = {}
        for unit, pattern in (('hours', r'PT(\d+)H'), ('minutes', r'(\d+)M')):
            found = re.search(pattern, duration)
            parts[unit] = int(found.group(1)) if found else 0

        return parts['hours'] * 60 + parts['minutes']
    except Exception:
        # Best-effort: report the offending value and treat it as missing.
        print(f"invalid conversion of {duration}")
        return None



### Content Rating

In [None]:
# Lookup from a content-rating label to the age value returned by
# convert_rating_to_age. Labels that carry no age ('nan', 'Not Rated',
# 'Unrated') map to None.
# NOTE(review): the ages look hand-assigned (e.g. 'T' -> 10, 'M' -> 18) —
# confirm they match the mapping used when the model was trained.
rating_to_age = {
    'nan': None,
    'Not Rated': None,
    'TV-14': 14,
    'TV-PG': 10,
    'PG-13': 13,
    'R': 17,
    'X': 18,
    'PG': 10,
    'TV-Y': 0,
    'TV-MA': 17,
    'TV-G': 0,
    'TV-Y7': 7,
    'Unrated': None,
    'Approved': 0,
    'E': 0,
    'K-A': 0,
    'M': 18,
    'TV-Y7-FV': 7,
    'E10+': 10,
    'T': 10,
    'G': 0,
    '18+': 18,
    '16+': 16,
    '6+': 6,
    '12+': 12,
    '13+': 13,
    'NC-17': 17,
    'EC': 3,
    'GP': 0
}

In [None]:
def convert_rating_to_age(rating):
    """Look up ``rating`` in ``rating_to_age``.

    Returns:
        The mapped value (which may itself be ``None``), or ``None`` when the
        label has no entry.
    """
    if rating in rating_to_age:
        return rating_to_age[rating]
    return None

### Budget

In [None]:
# Budget value substituted when a record carries no "budget" field.
# NOTE(review): the unit is not stated anywhere in this notebook — confirm.
MEDIAN_BUDGET = 15.0

### Minutes

In [None]:
# Runtime (in minutes) substituted when a duration cannot be turned into minutes.
MEAN_MINUTES = 48.09870499600736

### Today ago published

In [None]:
# Days-since-publication value substituted when a publication date cannot be used.
MEDIAN_TODAY_AGO_PUBLISHED =6354.0

### Min age by default

In [None]:
# Age substituted when a content rating has no usable age.
min_age_by_default = 14

# Convert function

In [None]:
def convert(movie_data):
  """Normalise ``movie_data["datePublished"]`` in place via ``normalise_date``.

  Nothing is returned, so callers receive ``None``.
  NOTE(review): a later cell defines another ``convert``; once that cell runs
  it replaces this one.
  """
  movie_data["datePublished"] = normalise_date(movie_data["datePublished"])

## Example

In [2]:
import re
import pandas as pd
import ast
import pickle
import numpy as np
from sklearn.preprocessing import StandardScaler


# Load transformation mappings (name -> score lookups used by the compute_* helpers).
# All paths are relative, so they are resolved against the current working directory.
# NOTE(review): pickle.load can execute arbitrary code while unpickling — only
# load these files if they come from a trusted source.
with open('mapGenreAndActorToScore.pkl', 'rb') as f:
    mapGenreAndActorToScore = pickle.load(f)

with open('mapDirectorToScore.pkl', 'rb') as f:
    mapDirectorToScore = pickle.load(f)

with open('mapCreatorToScore.pkl', 'rb') as f:
    mapCreatorToScore = pickle.load(f)

with open('mapkeywordToScore.pkl', 'rb') as f:
    mapkeywordToScore = pickle.load(f)

# Constants: fallback values used when a feature is missing or cannot be derived.
RATING_VALUE_MEAN = 6.5                # score for names absent from a lookup
MEDIAN_BUDGET = 15.0                   # budget when the record has none
# Kept at full precision so it agrees with the value in the "Minutes" cell
# earlier in this notebook (it was rounded to 48.1 here).
MEAN_MINUTES = 48.09870499600736       # runtime when the duration is unusable
MEDIAN_TODAY_AGO_PUBLISHED = 6354.0    # days-since-publication when the date is unusable
min_age_by_default = 14                # age when the rating has no usable age
# Lookup from a content-rating label to the age value used for the
# "minAgeToWatch" feature. Labels that carry no age ('nan', 'Not Rated',
# 'Unrated') map to None.
# NOTE(review): the ages look hand-assigned (e.g. 'T' -> 10, 'M' -> 18) —
# confirm they match the mapping used when the model was trained.
rating_to_age = {
    'nan': None, 'Not Rated': None, 'TV-14': 14, 'TV-PG': 10, 'PG-13': 13,
    'R': 17, 'X': 18, 'PG': 10, 'TV-Y': 0, 'TV-MA': 17, 'TV-G': 0, 'TV-Y7': 7,
    'Unrated': None, 'Approved': 0, 'E': 0, 'K-A': 0, 'M': 18, 'TV-Y7-FV': 7,
    'E10+': 10, 'T': 10, 'G': 0, '18+': 18, '16+': 16, '6+': 6, '12+': 12,
    '13+': 13, 'NC-17': 17, 'EC': 3, 'GP': 0
}

def normalise_date(date):
    """Rewrite numeric dates found in ``date`` to a dash-separated, year-first form.

    ``NN-NN-NNNN`` / ``NN/NN/NNNN`` is flipped so the four-digit group leads,
    then ``NNNN/NN/NN`` has its separators set to ``-``.

    Returns:
        The rewritten string, or ``None`` when ``pd.isna`` reports ``date`` as
        missing.
    """
    if pd.isna(date):
        return None
    year_last = re.compile(r'(\d{2})[-/](\d{2})[-/](\d{4})')
    year_first = re.compile(r'(\d{4})[-/](\d{2})[-/](\d{2})')
    # Inner call flips the year-last form; outer call unifies the separators.
    return year_first.sub(r'\1-\2-\3', year_last.sub(r'\3-\2-\1', date))

def _mean_score(score_map, names):
    """Average ``score_map`` lookups over ``names``, defaulting to RATING_VALUE_MEAN.

    Names missing from ``score_map`` count as ``RATING_VALUE_MEAN``; ``None``
    or an empty ``names`` yields ``RATING_VALUE_MEAN``.
    """
    if names is None:
        return RATING_VALUE_MEAN
    scores = [score_map.get(name, RATING_VALUE_MEAN) for name in names]
    return sum(scores) / len(scores) if scores else RATING_VALUE_MEAN


def compute_actor_genre_score(actors, genres):
    """Return the mean ``mapGenreAndActorToScore`` value over every (actor, genre) pair.

    Args:
        actors: Iterable of actor names, or ``None``.
        genres: Iterable of genre names, or ``None``.

    Returns:
        The mean pair score, or ``RATING_VALUE_MEAN`` when there are no pairs.
    """
    if actors is None or genres is None:
        return RATING_VALUE_MEAN
    pairs = [(actor, genre) for actor in actors for genre in genres]
    return _mean_score(mapGenreAndActorToScore, pairs)


def compute_director_score(directors):
    """Return the mean ``mapDirectorToScore`` value for ``directors`` (iterable or None)."""
    return _mean_score(mapDirectorToScore, directors)


def compute_creator_score(creators):
    """Return the mean ``mapCreatorToScore`` value for ``creators`` (iterable or None)."""
    return _mean_score(mapCreatorToScore, creators)


def compute_keyword_score(keywords):
    """Return the mean ``mapkeywordToScore`` value for ``keywords`` (iterable or None)."""
    return _mean_score(mapkeywordToScore, keywords)

def todayAgoPublished(date):
    """Return the number of whole days from ``date`` to 1 January 2025.

    Args:
        date: Anything ``pd.to_datetime`` can parse, or a missing value.

    Returns:
        The day count, or ``MEDIAN_TODAY_AGO_PUBLISHED`` when the date is
        missing (``None`` / NaN / NaT) or cannot be parsed or subtracted.
    """
    try:
        published = pd.to_datetime(date)
        # to_datetime passes None through and maps NaN to NaT; neither is a
        # usable date, so fall back explicitly instead of leaking NaN.
        if published is None or published is pd.NaT:
            return MEDIAN_TODAY_AGO_PUBLISHED
        return (pd.to_datetime("2025-01-01") - published).days
    except Exception:
        # Best-effort fallback, but no longer swallows KeyboardInterrupt/SystemExit.
        return MEDIAN_TODAY_AGO_PUBLISHED

def convert_to_minutes(duration):
    """Convert a duration string such as ``"PT1H38M"`` to a minute count.

    The hour count is read from ``PT<digits>H`` and the minute count from
    ``<digits>M``; whichever part is absent counts as zero.

    Args:
        duration: Duration string, or a missing / non-string value.

    Returns:
        ``hours * 60 + minutes`` when that total is non-zero. Otherwise
        ``MEAN_MINUTES`` — this now covers missing and non-string input as
        well as strings with no usable hour/minute part, so the feature is
        never ``None``.
    """
    # Missing values (None, NaN, the literal 'nan') and non-strings get the
    # same fallback as unparseable strings, instead of None or a TypeError.
    if not isinstance(duration, str) or duration == 'nan':
        return MEAN_MINUTES
    hour_match = re.search(r'PT(\d+)H', duration)
    minute_match = re.search(r'(\d+)M', duration)
    hours = int(hour_match.group(1)) if hour_match else 0
    minutes = int(minute_match.group(1)) if minute_match else 0
    total = hours * 60 + minutes
    return total if total else MEAN_MINUTES

def convert_rating_to_age(rating):
    """Map a content-rating label to an age, falling back to ``min_age_by_default``.

    Returns:
        The age from ``rating_to_age``. ``min_age_by_default`` when the label
        has no entry or its entry is ``None`` (e.g. 'Not Rated', 'Unrated').
    """
    age = rating_to_age.get(rating)
    # Compare against None explicitly: an age of 0 is a real value and must be kept.
    return min_age_by_default if age is None else age

def parse_list(value):
    """Extract the ``'name'`` of every entry in a list of dicts.

    Args:
        value: Either a string holding a Python literal list of dicts
            (e.g. ``"[{'name': 'A', 'id': '1'}]"``) or an already-parsed
            list/tuple of dicts. Anything else is treated as empty.

    Returns:
        The list of names, or ``[]`` when the value is empty, is not a list,
        cannot be parsed, or contains an entry without a ``'name'``.
    """
    if isinstance(value, str):
        if not value:
            return []
        try:
            value = ast.literal_eval(value)
        except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
            # Not a valid Python literal.
            return []
    # Already-parsed lists are accepted too instead of being silently dropped.
    if not isinstance(value, (list, tuple)):
        return []
    try:
        return [item['name'] for item in value]
    except (KeyError, TypeError, IndexError):
        # An entry is not a mapping or lacks 'name'.
        return []

def _is_missing(value):
    """Return True for None, pd.NA and float NaN."""
    return value is None or value is pd.NA or (isinstance(value, float) and value != value)


def _parse_genres(raw):
    """Return the genres in ``raw`` as a list; ``[]`` when absent or unparseable."""
    if isinstance(raw, (list, tuple)):
        return list(raw)
    if not isinstance(raw, str) or not raw:
        return []
    try:
        parsed = ast.literal_eval(raw)
    except (ValueError, SyntaxError):
        return []
    return list(parsed) if isinstance(parsed, (list, tuple)) else []


def convert(movie_data):
    """Turn one raw movie record into the model's feature dict.

    Args:
        movie_data: Mapping with a ``.get`` method. Keys read: ratingCount,
            budget, actors, genre, directors, creators, keywords,
            datePublished, Minutes, contentRating. List-like fields are
            expected as stringified Python literals; keywords as a
            comma-separated string.

    Returns:
        Dict with keys ratingCount, budget, actorsGenresScore, directorsScore,
        creatorsScore, keywordsScore, todayAgoPublished, minutes and
        minAgeToWatch.
    """
    rating_count = movie_data.get("ratingCount", 0)
    budget = movie_data.get("budget", MEDIAN_BUDGET)
    keywords_raw = movie_data.get("keywords", "")
    # A key that is present but None/NaN is treated like an absent key, so it
    # no longer crashes (.split on None) or leaks a missing value downstream.
    if not isinstance(keywords_raw, str):
        keywords_raw = ""

    return {
        "ratingCount": 0 if _is_missing(rating_count) else rating_count,
        "budget": MEDIAN_BUDGET if _is_missing(budget) else budget,
        "actorsGenresScore": compute_actor_genre_score(
            parse_list(movie_data.get("actors")),
            _parse_genres(movie_data.get("genre", "[]"))
        ),
        "directorsScore": compute_director_score(parse_list(movie_data.get("directors"))),
        "creatorsScore": compute_creator_score(parse_list(movie_data.get("creators"))),
        "keywordsScore": compute_keyword_score(keywords_raw.split(",")),
        "todayAgoPublished": todayAgoPublished(movie_data.get("datePublished")),
        "minutes": convert_to_minutes(movie_data.get("Minutes")),
        "minAgeToWatch": convert_rating_to_age(movie_data.get("contentRating"))
    }


In [3]:
# Sample raw record fed to convert(). The list-like fields (genre, actors,
# directors, creators) are strings holding Python literals, keywords is a
# comma-separated string, and Minutes is a "PT…H…M" duration string.
movie_data = {
    "imdbId": "tt0120812",
    "name": "Rush Hour",
    "genre": "['Action', 'Comedy', 'Crime']",
    "datePublished": "1998-09-18",
    "contentRating": "PG-13",
    "keywords": "child kidnapping,chinese mafia,gun,mobster,organized crime",
    "ratingCount": 306205,
    "budget": 33,
    "Minutes": "PT1H38M",
    "actors": "[{'name': 'Jackie Chan', 'id': 'nm0000329'}, {'name': 'Chris Tucker', 'id': 'nm0000676'}, {'name': 'Ken Leung', 'id': 'nm0504962'}]",
    "directors": "[{'name': 'Brett Ratner', 'id': 'nm0711840'}]",
    "creators": "[{'name': '', 'id': ''}, {'name': '', 'id': ''}, {'name': 'Ross LaManna', 'id': 'nm0482780'}, {'name': 'Jim Kouf', 'id': 'nm0467942'}]"
}

In [4]:
# Build the feature dict for the sample record.
new_data = convert(movie_data)

In [5]:
# Show the derived features.
print(new_data)

{'ratingCount': 306205, 'budget': 33, 'actorsGenresScore': 6.444444444444445, 'directorsScore': 5.2, 'creatorsScore': 6.5901756674206275, 'keywordsScore': 7.431363636363637, 'todayAgoPublished': 9602, 'minutes': 98, 'minAgeToWatch': 13}


# Scaling

In [6]:
# Column order for the single-row feature frame passed to the scaler.
# NOTE(review): presumably this must match the column order the scaler and
# model were fitted with — confirm against the training notebook.
feature_order = ['ratingCount', 'budget',  'actorsGenresScore',
                 'directorsScore', 'creatorsScore', 'keywordsScore',
                 'todayAgoPublished','minutes', 'minAgeToWatch']

In [7]:
# Load the scaler from the pickle file (relative path, resolved against the
# current working directory).
# NOTE(review): pickle.load can execute arbitrary code while unpickling — only
# load this file if it comes from a trusted source.
with open('scaler.pkl', 'rb') as file:
    scaler = pickle.load(file)

In [10]:
# One-row frame holding the converted features, with columns in feature_order.
data_df = pd.DataFrame([new_data], columns=feature_order)


data_df.head()

Unnamed: 0,ratingCount,budget,actorsGenresScore,directorsScore,creatorsScore,keywordsScore,todayAgoPublished,minutes,minAgeToWatch
0,306205,33,6.444444,5.2,6.590176,7.431364,9602,98,13


In [11]:
# Apply the loaded scaler to the one-row feature frame.
scaled_new_data = scaler.transform(data_df)

In [12]:
# Show the scaled feature row.
print(scaled_new_data)

[[14.28026245  0.15366728 -0.45389195 -1.69539352 -0.64259373  1.13013421
   0.2186146   1.53052766  0.20824958]]


# Run model

In [18]:
import joblib

# Load the saved model (relative path, resolved against the current working directory).
# NOTE(review): joblib.load is pickle-based, so the same caveat applies — only
# load this file if it comes from a trusted source.
xgb_reg_loaded = joblib.load('xgboost_reg_model.pkl')

In [20]:
# Predict for the scaled sample row. Despite the "test" in the variable name,
# the input here is the single converted record, not a test split.
y_test_pred_xgb = xgb_reg_loaded.predict(scaled_new_data)

print(y_test_pred_xgb)

[6.3115377]
