In [58]:
import pandas as pd, numpy as np, joblib
from sentence_transformers import SentenceTransformer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.feature_extraction import FeatureHasher
from scipy import sparse
from lightgbm import Booster
from catboost import CatBoostRegressor
from scrape_metadata import scrape_metadata

In [59]:
# Training table; the genre vocabulary and the Year mean/std below are derived from it.
df_train = pd.read_csv('data/ratings_enriched.csv')

# Sentence-embedding model used to encode plot text.
sbert = SentenceTransformer('all-MiniLM-L6-v2')

# Learn the genre vocabulary: one list per training row holding whichever of
# Genre1/Genre2 is non-empty (missing values become '' and are dropped).
mlb = MultiLabelBinarizer()
mlb.fit([
    [genre for genre in pair if genre]
    for pair in df_train[['Genre1', 'Genre2']]
        .fillna('')
        .itertuples(index=False, name=None)
])

# Statistics used to standardise Year at prediction time.
year_mean = df_train['Year'].mean()
year_std = df_train['Year'].std()

# Hashers for the categorical fields: 256 buckets for the director, 128 for
# the country, both with alternate_sign disabled.
dir_hasher = FeatureHasher(
    n_features=256,
    input_type='dict',
    alternate_sign=False,
)
cty_hasher = FeatureHasher(
    n_features=128,
    input_type='dict',
    alternate_sign=False,
)

In [60]:
# Toggle between the two trained model pairs. When truthy, the '*_ratings'
# artefacts are loaded and prepare_single_point adds the rating features.
USE_RATING_STATS = True

# The flag is tested by truthiness (not `== False`) so that model selection
# always agrees with the `if USE_RATING_STATS:` checks in prepare_single_point.
_model_suffix = '_ratings' if USE_RATING_STATS else ''

cat_model = CatBoostRegressor()
cat_model.load_model(f'models/catboost{_model_suffix}.cbm')
lgbm_model = Booster(model_file=f'models/lightgbm{_model_suffix}.txt')

In [61]:
def prepare_single_point(row: pd.Series):
    """Turn one film record into the inputs expected by the two regressors.

    Relies on the notebook-level objects ``sbert``, ``mlb``, ``year_mean``,
    ``year_std``, ``dir_hasher``, ``cty_hasher`` and ``USE_RATING_STATS``.

    Parameters
    ----------
    row : pd.Series
        Record indexed by key. ``Plot``, ``Genre1``, ``Genre2``, ``Year``,
        ``Director1`` and ``Country1`` are always read; ``RatingAvg`` and
        ``RatingCount`` are read only when ``USE_RATING_STATS`` is truthy.

    Returns
    -------
    tuple
        ``(X_lgb, X_cat)``. ``X_lgb`` is a one-row CSR matrix made of the plot
        embedding, the genre indicators, the numeric features and the hashed
        director/country. ``X_cat`` is a one-row DataFrame with the same
        embedding, genre and numeric columns plus ``Director1`` and
        ``Country1`` as strings.
    """
    emb = sparse.csr_matrix(
        sbert.encode([row['Plot']], convert_to_numpy=True)
    )

    # Keep only the genres that are present; mlb.transform takes a list of label lists.
    genres = [[g for g in (row['Genre1'], row['Genre2']) if pd.notna(g) and g != '']]
    g_hot = sparse.csr_matrix(mlb.transform(genres))

    # Year is standardised with the training-set mean and standard deviation.
    year_std_val = np.array([(row['Year'] - year_mean) / year_std]).reshape(-1, 1)
    num_parts = [year_std_val]
    if USE_RATING_STATS:
        # Missing rating statistics fall back to 0 for both features. An explicit
        # notna check is required: `value or 0` lets NaN through because NaN is truthy.
        rating_avg = row['RatingAvg'] if pd.notna(row['RatingAvg']) else 0
        rating_cnt = row['RatingCount'] if pd.notna(row['RatingCount']) else 0
        rating_avg_val = np.array([rating_avg], dtype=float).reshape(-1, 1)
        rating_cnt_val = np.array([np.log1p(rating_cnt)], dtype=float).reshape(-1, 1)
        num_parts += [rating_avg_val, rating_cnt_val]
    X_num = sparse.csr_matrix(np.hstack(num_parts))

    # ------- lightgbm: hash Director & Country
    dir_dict = [{'d=' + str(row['Director1']): 1}]
    cty_dict = [{'c=' + str(row['Country1']): 1}]
    X_hash = sparse.hstack([
        dir_hasher.transform(dir_dict),
        cty_hasher.transform(cty_dict)
    ])

    X_lgb = sparse.hstack([emb, g_hot, X_num, X_hash]).tocsr()

    # ------- catboost: dataframe with named columns, director/country kept as raw strings
    feat_names = [f'emb_{i}' for i in range(emb.shape[1])] + \
                 list(mlb.classes_) + ['year']
    if USE_RATING_STATS:
        feat_names += ['rating_avg', 'rating_count']
    X_cat = pd.DataFrame(
        sparse.hstack([emb, g_hot, X_num]).toarray(),
        columns=feat_names
    )
    X_cat['Director1'] = str(row['Director1'])
    X_cat['Country1'] = str(row['Country1'])
    return X_lgb, X_cat

def predict_from_url(url: str, model='cat'):
    """Scrape a film page and return the rating predicted for it.

    Parameters
    ----------
    url : str
        Page address handed to ``scrape_metadata``.
    model : str, default 'cat'
        ``'cat'`` returns the CatBoost prediction; any other value returns the
        LightGBM prediction.

    Returns
    -------
    float
        Prediction of the selected model for the scraped record.
    """
    meta = scrape_metadata(url)

    # Coerce the numeric fields. A field that is absent or unparsable becomes
    # NaN instead of raising, and all three fields are handled the same way.
    for key in ('Year', 'RatingAvg', 'RatingCount'):
        meta[key] = pd.to_numeric(meta.get(key, np.nan), errors='coerce')

    X_lgb, X_cat = prepare_single_point(meta)

    # Only run the model whose output is actually returned.
    if model == 'cat':
        return float(cat_model.predict(X_cat)[0])
    return float(lgbm_model.predict(X_lgb)[0])

## Search

In [66]:
import os

from tavily import TavilyClient

query = 'sexy vampires'

# The API key is read from the environment so that it is never stored in the
# notebook. Any key that was ever hardcoded in this file should be revoked.
tavily_api_key = os.environ.get('TAVILY_API_KEY')
if not tavily_api_key:
    raise RuntimeError('Set the TAVILY_API_KEY environment variable before running the search.')

client = TavilyClient(tavily_api_key)
response = client.search(
    query=query,
    include_domains=["letterboxd.com/film"],
    search_depth='advanced',
    max_results=10,
)

# URLs of the hits, in the order returned by the search.
uris = [res['url'] for res in response['results']]

In [67]:
# One row per search hit.
df_uris = pd.DataFrame({'url': uris})

# Scrape the metadata of every hit.
# NOTE(review): `progress_apply` only exists once tqdm's pandas integration has
# been registered; that registration is not visible in this notebook — confirm
# where it happens.
new_cols = df_uris['url'].progress_apply(scrape_metadata)

# Attach the scraped columns and keep only rows with Duration above 60 and
# RatingCount above 100.
df_uris = (
    pd.concat([df_uris, new_cols], axis=1)
    .loc[lambda d: (d['Duration'] > 60) & (d['RatingCount'] > 100)]
)

100%|██████████| 7/7 [00:03<00:00,  1.95it/s]


In [68]:
# Score every remaining film and rank best-first. `assign` + `sort_values`
# build a new frame, so nothing is written into the filtered slice (avoiding
# SettingWithCopyWarning) and no in-place mutation is needed.
# Note: predict_from_url scrapes each URL again before predicting.
df_uris = (
    df_uris
    .assign(Prediction=df_uris['url'].progress_apply(predict_from_url))
    .sort_values('Prediction', ascending=False)
)

for _, row in df_uris.iterrows():
    print(f"  {row['url']} - Predicted Rating: {row['Prediction']:.2f}, Avg Rating: {row['RatingAvg']:.2f}")
    print(f"  {row['Plot']}")
    print()

100%|██████████| 1/1 [00:00<00:00,  1.74it/s]

  https://letterboxd.com/film/the-horrible-sexy-vampire/ - Predicted Rating: 2.79, Avg Rating: 2.77
  Some kind of sadist, but not human, is murdering people. A doctor is convinced that the killings are the evil workings of a reclusive odd baron who died many years ago.




