# Linear Regression Model for RottenTomatoes

In [2]:
import sklearn
sklearn.__version__

'1.2.0'

In [3]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from pathlib import Path
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from numbers import Number
from matplotlib import pyplot as plt
import catboost
from catboost import CatBoostClassifier, CatBoostRegressor
from typing import Tuple, List


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [4]:
# Global VARIABLES
BATCH_SIZE = 128
NEPOCHS = 100

DATA_PATH = Path("src", "movies", "filter")

## Load Data

In [6]:
# Load Data
# The parquet file is read relative to the current working directory.
# NOTE(review): the commented-out variant resolves the relative DATA_PATH against the
# working directory as well; that looks like the origin of the doubled
# "src/movies/filter" in the error shown below — confirm before re-enabling it.
# df = pd.read_parquet(Path(DATA_PATH.absolute(), "film_info_simple.parquet"))
df = pd.read_parquet("film_info_simple.parquet")

FileNotFoundError: [Errno 2] No such file or directory: '/home/benjamin/Documents/DataLiteracy/DataLiteracyProject/src/movies/filter/src/movies/filter/film_info_simple.parquet'

### Normalize data

In [None]:
def streaming_suppliers(suppliers: str):
  """
  Extract the suppliers that offer a film as part of a subscription.

  @param suppliers: string holding a Python literal of (supplier, offer) pairs,
                    e.g. "[('Netflix', 'Subscription'), ('Amazon', 'Rent')]";
                    None or NaN is treated as "no supplier information"
  @return: list of the supplier names whose offer equals 'Subscription'
  """
  # local import: `ast` is not part of the notebook's import cell
  import ast

  # a missing value may show up as None or as float NaN (NaN != NaN)
  if suppliers is None or (isinstance(suppliers, float) and suppliers != suppliers):
    return []
  return [
    supplier
    for supplier, offer in ast.literal_eval(suppliers)
    if offer == 'Subscription'
  ]

In [None]:
def flatten_list(deeplist):
  """
  Concatenate the entries of all inner iterables into one flat list (one level deep).
  """
  flat = []
  for inner in deeplist:
    flat.extend(inner)
  return flat

In [None]:
# per film: list of the suppliers that offer it by subscription
df['streaming_suppliers_list'] = df['suppliers_list'].transform(streaming_suppliers)

In [None]:
# Number of films each supplier offers by subscription.
# The per-film lists already exist in df['streaming_suppliers_list'], so they are
# reused here instead of parsing every 'suppliers_list' entry a second time.
suppliers_count = pd.Series(flatten_list(df['streaming_suppliers_list'].tolist())).value_counts()

In [None]:
# keep only the suppliers that were counted at least 100 times
supplier_tokens = suppliers_count.loc[suppliers_count >= 100].index.tolist()

In [None]:
supplier_tokens

In [None]:
def year_to_int(date: str|None) -> int:
  """
  transform release_date of format <yyyy>-<mm>-<dd> into normalizable integer
  """
  if isinstance(date, str):
    return int(date.split("-")[0])
  else:
    return None

def month_to_int(date: str|None) -> int:
  """
  transform release_date of format <yyyy>-<mm>-<dd> into normalizable integer
  """
  if isinstance(date, str):
    return int(date.split("-")[1])
  else:
    return None

In [None]:
def score_to_percent(score: Number) -> float:
  """
  Divide a numeric score by 100; anything that is not a Number yields None.
  """
  return score / 100 if isinstance(score, Number) else None

In [None]:
def normalize_column(data: np.array) -> np.array:
  """
  Standardize a numeric (integer/float) array with sklearn's StandardScaler.
  The values are treated as one feature column; the result is returned flat.
  """
  as_single_feature = data.reshape(-1, 1)  # the scaler is fed a 2-D (samples, 1) array
  scaler = preprocessing.StandardScaler()  # removes mean and scales to variance
  return scaler.fit_transform(as_single_feature).reshape(-1)

In [None]:
# release month as integer; NOTE: the same assignment is repeated in the next cell
df["month"] = df["release_date"].transform(month_to_int)

In [None]:
# special normalization
df["year"] = df["release_date"].transform(year_to_int)
df["month"] = df["release_date"].transform(month_to_int)

# remove mean and variance from score
def normalize_score(entry, mean, variance):
  """
  Standardize a score: subtract the mean, then divide by the square root of the variance.
  """
  std_dev = variance ** 0.5
  return (entry - mean) / std_dev

# Statistics of the raw score columns, taken before the columns are overwritten below.
# NOTE(review): presumably kept as constants to map standardized predictions back
# to the original score scale — confirm.
CRITICS_MEAN = df["critics_score"].mean()
CRITICS_VARIANCE = np.var(df["critics_score"])

AUDIENCE_MEAN = df["audience_score"].mean()
AUDIENCE_VARIANCE = np.var(df["audience_score"])

# overwrite both score columns with their standardized values
df["critics_score"] = df["critics_score"].transform(normalize_score, mean=CRITICS_MEAN, variance=CRITICS_VARIANCE)
df["audience_score"] = df["audience_score"].transform(normalize_score, mean=AUDIENCE_MEAN, variance=AUDIENCE_VARIANCE)

# Convert scores to percentage
# df["critics_score"] = df["critics_score"].transform(score_to_percent)
# df["audience_score"] = df["audience_score"].transform(score_to_percent)

In [None]:
CRITICS_VARIANCE

In [None]:
# add proportional revenue component: box office divided by budget
df["revenue"] = np.divide(df["box_office"].to_numpy(), df["budget"].to_numpy())
# ratios that are NaN or infinite are replaced by 0
df["revenue"] = df["revenue"].transform(lambda ratio: ratio if np.isfinite(ratio) else 0)

In [None]:
# Numeric columns that are standardized with normalize_column in the loop below.
# The two score columns are commented out here; they are standardized separately
# with normalize_score.
to_normalize_cols = [
  "critics_count",
  # "critics_score",  # to_predict
  "audience_count",
  # "audience_score", # to_predict
  "rottentomatoes_length",
  "year",     # transform into integer first
  "month",    # transform into integer first
  "box_office", 
  "budget",
  "revenue",
]

In [None]:
# normalize columns: every listed column is replaced in place by its standardized values
for column in to_normalize_cols:
  df[column] = normalize_column(df[column].to_numpy())

### Tokenize and encode in One-Hot Vectors 

In [None]:
from collections.abc import Iterable


def to_tokens(item: str, separator: str) -> list[str]:
  """
  Split a separator-delimited string into lower-case tokens.

  @param item: string to split; surrounding whitespace is stripped before splitting
  @param separator: delimiter between the tokens
  @return: list of tokens; an empty list for a missing value (None or NaN);
           any other non-string value (e.g. an already tokenized list) is returned unchanged
  """
  if isinstance(item, str):
    return item.strip().lower().split(separator)
  if item is None or (isinstance(item, float) and item != item):
    # a missing value has no tokens; [] keeps the result iterable for callers
    return []
  return item

def tokenize(items: Iterable[str], separator: str) -> list[str]:
  """
  Collect the distinct tokens over all items and return them in sorted order.
  """
  unique_tokens = {token for item in items for token in to_tokens(item, separator)}
  return sorted(unique_tokens)


def encode_one_hot(tokens: list[str]) -> dict[str, np.ndarray]:
  """
  Assign a one-hot vector to every token.

  The i-th distinct token (in the order given) is mapped to the i-th unit vector,
  so the position of the 1 always matches the position of the token among the
  keys of the returned dict. Deriving the vectors from an encoder that sorts its
  categories breaks this for unsorted token lists.

  @param tokens: tokens to encode; duplicates are dropped, first occurrence wins
  @return: dict mapping each token to a float vector of length len(distinct tokens)
  """
  unique_tokens = list(dict.fromkeys(tokens))  # de-duplicate, keep first-seen order
  identity = np.eye(len(unique_tokens))
  return dict(zip(unique_tokens, identity))


def _contains_token(entry, token) -> bool:
  """
  Tell whether token is an element of entry; non-iterable entries (None, NaN) contain nothing.
  """
  try:
    return {token}.issubset(entry)
  except TypeError:
    return False


def replace_column_with_one_hot(frame: pd.DataFrame, column: str, one_hots: dict[str, np.ndarray]) -> pd.DataFrame:
  """
  Replace a column of token collections by one indicator column per token.

  Every key of one_hots becomes a new column holding 1.0 in the rows whose entry
  in `column` contains that key and 0.0 elsewhere. The columns are filled per key,
  independent of where the 1 sits in the supplied vectors, so the label of a
  column always agrees with its content.

  @param frame: source data; it is not modified
  @param column: name of the column holding an iterable of tokens per row;
                 rows with a non-iterable entry (None, NaN) get zeros only
  @param one_hots: mapping of token to one-hot vector; only the keys are used
  @return: frame without `column`, joined with the indicator columns
  """
  keys = list(one_hots.keys())
  vectors = np.zeros((len(frame), len(keys)))

  for idx, key in enumerate(keys):
    row_mask = frame[column].map(lambda entry: _contains_token(entry, key)).to_numpy(dtype=bool)
    vectors[row_mask, idx] = 1.0

  # carry over the index of frame: join aligns on index labels, and a fresh
  # 0..n-1 index would misalign the indicators for any other index
  vectors_df = pd.DataFrame(vectors, columns=keys, index=frame.index)
  return frame.drop(columns=[column]).join(vectors_df)


In [None]:
# vocabulary of all genres (entries are '/'-separated) and a one-hot vector per genre
genre_tokens = tokenize(df["rottentomatoes_genre"], '/')
genre_vector = encode_one_hot(genre_tokens)


In [None]:
# turn every genre string into its list of tokens (same splitting as used for the vocabulary)
df["rottentomatoes_genre"] = df["rottentomatoes_genre"].transform(to_tokens, separator='/')

In [None]:
df = replace_column_with_one_hot(df, "rottentomatoes_genre", genre_vector)
df

In [None]:
# one indicator column per frequent subscription supplier replaces the list column
# NOTE(review): supplier_tokens comes from value_counts(), i.e. presumably ordered by
# frequency rather than alphabetically — verify that the resulting columns carry the right labels.
supplier_vector = encode_one_hot(supplier_tokens)
df = replace_column_with_one_hot(df, "streaming_suppliers_list", supplier_vector)
df

In [None]:
# names of all new one-hot token columns (aliases of the token lists, no copies)
GENRE_VECTOR = genre_tokens
SUPPLIER_VECTOR = supplier_tokens

### Create targets and training-/test-sets

In [None]:
# column(s) to predict; currently a single entry
TARGET_PARAMS = ["critics_score"]
# feature columns: all genre and supplier indicator columns plus numeric film attributes
TRAIN_PARAMS = [
    # "critics_count",
    # "audience_count",
    *GENRE_VECTOR,
    *SUPPLIER_VECTOR,
    "rottentomatoes_length",
    "year",
    "month",
    "box_office",
    "budget",
]   # extend this list if required


In [None]:
df.keys()

In [None]:
# missing feature values are replaced by 0
df[TRAIN_PARAMS] = df[TRAIN_PARAMS].fillna(0)

In [None]:
df_filtered

In [None]:
# Drop every row that has a missing value in any column listed in *_PARAMS
df_filtered = df.dropna(subset=[*TARGET_PARAMS, *TRAIN_PARAMS])

In [None]:
# Create set for DataLoader
target = df_filtered[TARGET_PARAMS]
params = df_filtered[TRAIN_PARAMS]

In [None]:
df.keys()

In [None]:
# 75 % training / 25 % test rows, shuffled.
# NOTE: no random_state is passed, so every run of this cell draws a different split.
param_train, param_test, target_train, target_test = train_test_split(params, target, test_size=0.25, shuffle=True)

In [None]:
param_train

## Prepare Torch

### Regression Models
We use three different models to predict critics_score, audience_score, and the absolute difference between them.

In [None]:
target_train[TARGET_PARAMS[0]]

In [None]:
# NOTE: overrides the BATCH_SIZE = 128 from the global variables cell for the train loader below
BATCH_SIZE = 1
len(param_train)

In [None]:
# prepare Data Loaders
data_limiter = 50000  # upper bound on the number of training rows that are used

# TARGET_PARAMS holds a single column, so the target is TARGET_PARAMS[0]
# (index 1 is out of range).
# The targets are reshaped to (N, 1) to match the (batch, 1) output of the model;
# with 1-D targets the loss would broadcast prediction against target for every
# batch larger than one sample.
train = torch.utils.data.TensorDataset(
    torch.Tensor(param_train.to_numpy()[:data_limiter]).to(device=device),
    torch.Tensor(target_train[TARGET_PARAMS[0]].to_numpy()[:data_limiter]).reshape(-1, 1).to(device=device),
)
test = torch.utils.data.TensorDataset(
    torch.Tensor(param_test.to_numpy()).to(device=device),
    torch.Tensor(target_test[TARGET_PARAMS[0]].to_numpy()).reshape(-1, 1).to(device=device),
)

train_loader = torch.utils.data.DataLoader(dataset=train, batch_size=BATCH_SIZE, shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test, batch_size=500, shuffle=True)


In [None]:
len(TRAIN_PARAMS)

In [None]:
# one input per feature column; every hidden layer is twice as wide as the input
INPUT_SIZE = len(TRAIN_PARAMS)
HIDDEN_WIDTH = INPUT_SIZE*2 # INPUT_SIZE

# fully connected network: three Tanh-activated layers followed by a single linear output
model = nn.Sequential(
    nn.Linear(INPUT_SIZE, HIDDEN_WIDTH), nn.Tanh(),
    # Hidden Layers
    # nn.Linear(HIDDEN_WIDTH, HIDDEN_WIDTH), 
    # nn.BatchNorm1d(HIDDEN_WIDTH),
    # nn.LeakyReLU(),
    nn.Linear(HIDDEN_WIDTH, HIDDEN_WIDTH), nn.Tanh(),
    nn.Linear(HIDDEN_WIDTH, HIDDEN_WIDTH), nn.Tanh(),
    # Output Layer
    nn.Linear(HIDDEN_WIDTH, 1),
    # nn.Sigmoid(),
)

# move the parameters to the device the tensors of the data sets live on
model.to(device=device)

In [None]:
def validate(_model: nn.Module, _test_loader: torch.utils.data.DataLoader, _loss_function: nn.Module) -> float:
    """
    Compute the mean loss of a model over all batches of a data loader.

    The model is put into evaluation mode and gradient tracking is disabled while
    the batches are processed; afterwards the previous train/eval mode is restored.

    @param _model: model to evaluate
    @param _test_loader: iterator over test data, yielding (input, target) batches
    @param _loss_function: loss applied to (prediction, target) of every batch

    @return: loss averaged over the number of batches (nan if the loader yields no batch)
    """
    was_training = _model.training
    _model.eval()
    total_loss = 0.0
    n_batches = 0
    try:
        with torch.no_grad():
            for x, y in _test_loader:
                # .item() yields a plain float, nothing of the batch is kept alive
                total_loss += _loss_function(_model(x), y).item()
                n_batches += 1
    finally:
        _model.train(was_training)

    loss = total_loss / n_batches if n_batches else float("nan")
    print(f"test loss: {loss}")
    return loss


def run_training(_model: nn.Module, _train_loader: torch.utils.data.DataLoader, _optimizer: torch.optim.Optimizer, _loss_function: nn.Module, _n_epochs: int, _scheduler: torch.optim.lr_scheduler = None, _test_loader: torch.utils.data.DataLoader = None) -> tuple[list[float], list[float]]:
    """
    Run training

    @param _model: torch model to train
    @param _train_loader: iterator over training data, yielding (input, target) batches
    @param _optimizer: optimizer holding the parameters of the model
    @param _loss_function: loss applied to (prediction, target) of every batch
    @param _n_epochs: number of epochs to run training
    @param _scheduler: optional; adjusts learning rate, stepped once per epoch
    @param _test_loader: optional iterator over test data; without it no validation is run

    @return: mean training error per epoch and test error per epoch
             (the second list stays empty if no test loader is given)
    """
    train_errors = []
    test_errors = []
    for i_epoch in range(_n_epochs):
        print('Start epoch %d' % i_epoch)
        # set training mode once per epoch; validation at the end of an epoch may leave another mode
        _model.train()
        epoch_loss = 0.0
        n_batches = 0
        for idx, (x, y) in enumerate(_train_loader):
            _optimizer.zero_grad()

            # Execute model
            x_pred = _model(x)

            # Derive Losses
            loss = _loss_function(x_pred, y)

            # Derive gradients
            loss.backward()

            # Apply optimizer
            _optimizer.step()

            # accumulate a plain float so that no autograd graph is kept alive over the epoch
            batch_loss = loss.item()
            epoch_loss += batch_loss
            n_batches += 1

            if (idx % 100) == 0:
                print('[epoch=%d, it=%d] loss = %.4f'
                      % (i_epoch, idx, batch_loss))
        train_errors.append(epoch_loss / n_batches if n_batches else float("nan"))
        if _scheduler is not None:
            _scheduler.step()
            print(f"learning rate: {_scheduler.get_last_lr()}")

        if _test_loader is not None:
            test_errors.append(validate(_model, _test_loader, _loss_function))

    return train_errors, test_errors

In [None]:
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
loss = nn.MSELoss(reduction="mean") # nn.L1Loss(reduction="mean")
# exponential decay: the learning rate is multiplied by 0.995 on every scheduler step
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=(1 - 5e-3))

# model = torch.load(Path(DATA_PATH.absolute(), "critcs_regression2.model"))

# take the epoch count from the global NEPOCHS constant instead of a second literal
n_epochs = NEPOCHS
train_errors, test_errors = run_training(model, train_loader, optimizer, loss, n_epochs, scheduler, test_loader)
# label the two curves so training and test loss can be told apart
plt.plot(train_errors, label="train loss")
plt.plot(test_errors, label="test loss")
plt.xlabel("epoch")
plt.legend()

In [None]:
# Mean absolute error of the network on the test set; every prediction is printed next to its target
error = 0.0
counter = 0
with torch.no_grad():  # pure evaluation, no autograd graph needed
  for x, y in test_loader:
    for _x, _y in zip(model(x), y):
      counter += 1
      prediction, expected = float(_x), float(_y)
      _x_r = round(prediction, 2)
      _y_r = round(expected, 2)
      # accumulate plain floats so the MAE prints as a number instead of a tensor
      error += abs(prediction - expected)
      print(f"preciction: {_x_r}; target: {_y_r}")
# an empty test loader yields nan instead of a division by zero
error = error / counter if counter else float("nan")
print(f"MAE: {error}")

In [None]:
# Persist the whole model object.
# NOTE(review): the file name says "audience" while TARGET_PARAMS is ["critics_score"] — confirm which model this is.
# NOTE(review): DATA_PATH is relative, so .absolute() resolves it against the current working
# directory (compare the doubled path in the load error near the top) — confirm the target directory.
torch.save(model, Path(DATA_PATH.absolute(), "audience_regression_alt.model"))

### Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

# 1-D target columns for the single entry of TARGET_PARAMS
score_target_train = target_train[TARGET_PARAMS[0]]
score_target_test = target_test[TARGET_PARAMS[0]]
# ordinary least squares fit on the training rows
liner_reg = LinearRegression().fit(param_train, score_target_train)  # critics score
target_predict = liner_reg.predict(param_train)  # predictions on the training rows
test_predict = liner_reg.predict(param_test)

# per-sample absolute errors in ascending order (not yet averaged, despite the "mae" in the name)
mae_sorted_train = sorted(abs(target_predict - score_target_train.to_numpy()))
mae_sorted_test = sorted(abs(test_predict - score_target_test.to_numpy()))

plt.plot(mae_sorted_train)
plt.plot(mae_sorted_test)
# LinearRegression.score reports the coefficient of determination (R^2)
print("Training score:", liner_reg.score(param_train, score_target_train))
print("Test score:", liner_reg.score(param_test, score_target_test))

In [None]:
plt.plot(sorted(target_predict))

In [None]:
plt.hist(mae_sorted_train, bins=50, rwidth=0.85)

In [None]:
plt.hist(mae_sorted_test, bins=50, rwidth=0.85)

In [None]:
param_train

In [None]:
import pandas
# Discretize the critics score into 10 equal-width classes.
# The bin edges are derived from the training targets only and reused for the test
# targets, so a class label denotes the same score interval in both sets; binning
# each set on its own would give every set its own edges.
target_bins_train, _score_bin_edges = pandas.cut(target_train['critics_score'], 10, labels=False, retbins=True)
# open the two outer bins so that test scores outside the training range still get a class
_score_bin_edges[0], _score_bin_edges[-1] = -np.inf, np.inf
target_bins_test = pandas.cut(target_test['critics_score'], _score_bin_edges, labels=False)

In [None]:
target_bins_train.value_counts()

In [None]:
# Gradient-boosted classifier on the binned critics score.
# NOTE: this rebinds `model`, which held the torch network in the cells above.
model = CatBoostClassifier(
    iterations=1000,
    learning_rate=0.01,
    depth = 8,
    #l2_leaf_reg = 0.02,
    random_strength=2,
)
  
# the test split is passed as evaluation set; use_best_model=False keeps the model of the final iteration
model.fit(
      param_train, target_bins_train,
      eval_set=(param_test, target_bins_test),
      verbose=10,
      plot=True,
      use_best_model=False,
);

In [None]:
target_bins_train

In [None]:
pd.Series(model.predict(param_train).flatten()).value_counts()

In [None]:
sklearn.metrics.accuracy_score(model.predict(param_train), target_bins_train)

In [None]:
sklearn.metrics.accuracy_score(model.predict(param_test), target_bins_test)

In [None]:
(model.predict(param_test).flatten()-target_bins_test).transform(lambda x: abs(x)<=1).value_counts()

In [None]:
target_test

In [None]:
# Gradient-boosted regressor on the (standardized) critics score itself.
# NOTE: this rebinds `model` once more, replacing the classifier from above.
model = CatBoostRegressor(
    iterations=30000,
    learning_rate=0.01,
    depth = 8,
    #l2_leaf_reg = 0.02,
    random_strength=2,
)
  
# the test split is passed as evaluation set; use_best_model=False keeps the model of the final iteration
model.fit(
      param_train, target_train,
      eval_set=(param_test, target_test),
      verbose=10,
      plot=True,
      use_best_model=False,
);

In [None]:
sklearn.metrics.mean_absolute_error(model.predict(param_train),target_train['critics_score'])

In [None]:
sklearn.metrics.mean_absolute_error(model.predict(param_test),target_test['critics_score'])

In [None]:
model.predict(param_test)

- Evaluate the error (evaluation MAE)
- No normalization of the targets
- Plot the train and test results (together?)