# Import Modules

In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import torch 
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder

# Import Dataset

In [82]:
df = pd.read_csv("movies_new.csv")
df = df.sort_values(by="startYear")

## Data Preprocessing 

In [None]:
df['numVotes_log'] = np.log1p(df['numVotes'])
numeric_columns = ['averageRating', 'HitScore', 'numVotes', 'numVotes_log', 'runtimeMinutes', 'startYear']
df[numeric_columns] = df[numeric_columns].apply(pd.to_numeric, errors='coerce')
df.drop(columns=["originalTitle"], inplace=True)

# Define features and target
X= df[['averageRating', 'HitScore', 'numVotes_log', 'runtimeMinutes']].to_numpy()
y = df['HitScore'].reshape(-1,1).to_numpy()

# Split data into training (80%) and temp (20%)
X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=0.2, random_state=42)

# Further split temp (20%) into validation (10%) and test (10%)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, random_state=42)

print(f"{X_train.shape[0] + X_val.shape[0] + X_test.shape[0] == df.shape[0]}")

SyntaxError: invalid syntax (1986502979.py, line 8)

### Normalize the dataset

In [77]:
scalerX = StandardScaler()
X_train_scaled = scalerX.fit_transform(X_train)  # Fit on training set
X_test_scaled = scalerX.transform(X_test)
X_val_scaled = scalerX.transform(X_val)  # Apply on test set


scalerY = StandardScaler()
y_train_scaled = scalerY.fit_transform(y_train.reshape(-1, 1))  # Fit on training set
y_test_scaled = scalerY.transform(y_test.reshape(-1, 1))  # Apply on test set
y_val_scaled = scalerY.transform(y_val.reshape(-1, 1))  # Apply on validation test set

### Frame as a Time Series problem
* Convert the data into a sequential type

In [78]:
# Define time step (e.g., 5 years)
TIME_STEP = 5  

def create_sequences(X, y, time_step):
    Xs, ys = [], []
    for i in range(len(X) - time_step):
        Xs.append(X[i:i+time_step])  # Past 5 years
        ys.append(y[i+time_step])    # Target is the next year
    return np.array(Xs), np.array(ys)

# Convert datasets into 3D shape (samples, time steps, features)
X_train_seq, y_train_seq = create_sequences(X_train_scaled, y_train, TIME_STEP)
X_val_seq, y_val_seq = create_sequences(X_val_scaled, y_val, TIME_STEP)
X_test_seq, y_test_seq = create_sequences(X_test_scaled, y_test, TIME_STEP)

# Check shapes
print("Train shape:", X_train_seq.shape)  # (samples, time_steps, features)
print("Val shape:", X_val_seq.shape)
print("Test shape:", X_test_seq.shape)

Train shape: (159671, 5, 4)
Val shape: (19954, 5, 4)
Test shape: (19955, 5, 4)


### Convert to Tensors

In [79]:
# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train_seq, dtype=torch.float32)
X_val_tensor = torch.tensor(X_val_seq, dtype=torch.float32)
X_test_tensor = torch.tensor(X_test_seq, dtype=torch.float32)

y_train_tensor = torch.tensor(y_train_seq, dtype=torch.float32)
y_val_tensor = torch.tensor(y_val_seq, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test_seq, dtype=torch.float32)

# Check shapes
print(X_train_tensor.shape, y_train_tensor.shape)
print(X_val_tensor.shape, y_val_tensor.shape)
print(X_test_tensor.shape, y_test_tensor.shape)

torch.Size([159671, 5, 4]) torch.Size([159671])
torch.Size([19954, 5, 4]) torch.Size([19954])
torch.Size([19955, 5, 4]) torch.Size([19955])


# Optimizer