# fit a Neural Network to Myket Animation data

Our Data consists mainly of 2lvl binary factors, and our task is regression. 

In [14]:
import numpy as np
import matplotlib.pyplot as plt

import pandas as pd
import seaborn as sns

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

import tensorflow as tf
from tensorflow.keras.utils import set_random_seed, split_dataset

### Loading the Data

In [154]:
file = 'clean_data.csv'

def load_data(file):
    try:
        df = pd.read_csv(file)
        assert df.isna().sum().sum() == 0 # first sum() to sum on each column, second to sum() on all columns
    except FileNotFoundError:
        msg = "Expected data file not found, you must run Data Collection and Data Cleaning procedures. follow instructions in README.txt"
        raise FileNotFoundError(msg)
    except AssertionError:
        msg = 'Data has missing values, Run Cleaning notebooks first maybe there is a problem there.'
        raise AssertionError(msg)
    return df

df = load_data(file)

### Data Preperation And Train/Test Split 


Important considerations:
- Tensorflow/keras only accepts numerical features. onehot encoding can be performed categories.
- `URL` and `Name` must not be given to the model.
- `Year` is categorical or numerical?


In [157]:
df = load_data(file)

# Identify categorical columns (excluding URL & Name)
categorical_cols = [col for col in df.columns if (col not in ['URL', 'Name']) and (df[col].dtype == 'object')]

# One-hot encode categorical features
encoder = OneHotEncoder(sparse_output=False, drop='first')
encoded_categories = encoder.fit_transform(df[categorical_cols])

# Convert to DataFrame
enc_df = pd.DataFrame(encoded_categories, columns=encoder.get_feature_names_out(categorical_cols), index=df.index)

# Drop original categorical columns and merge encoded features
df = df.drop(columns=categorical_cols).join(enc_df)

# Convert numeric categorical features (2-level factors) to integer
binary_cols = ['Num_Seasons', 'Total_Episodes', 'Publication', 'VoiceActors', 'Review', 'Tips', 'End', 'Description', 
               'Characters', 'InformativeMessages', 'PositiveAndNegative', 'SummaryStory', 'Screening', 'Critics', 
               'Conclusion', 'Introduction', 'Is_Doblele', 'Series', 'Animation', 'Western', 'Adventure', 'Comedy', 
               'Family', 'Fantasy', 'Mystery', 'Action', 'Romance', 'Drama', 'SciFi', 'ShortFilm', 'Crime', 
               'Musical', 'Korean', 'Thriller', 'Anime', 'Music']

df[binary_cols] = df[binary_cols].astype(np.int64)

# Drop unnecessary columns
df = df.drop(columns=['Unnamed: 0', 'URL', 'Name'])

# Scale continuous features
x_scaler = StandardScaler()
y_scaler = StandardScaler()
continuous_x_cols = ['IMDB_Link', 'Number_People', 'Total_Words', 'Num_Titles', 'Total_Target_Words', 
                   'About_Words', 'Story_Words', 'Release_Date_Words', 'Review_Words', 'Final_Words', 
                   'Informative_Words', 'Positive_Negative_Words', 'Summary_Words', 'Screening_Words', 
                   'Critics_Words', 'Conclusion_Words', 'Introduction_Words', 'Voice_Actor_Words']


df[continuous_x_cols] = x_scaler.fit_transform(df[continuous_x_cols])
df['Amtiaz'] = y_scaler.fit_transform(df['Amtiaz'].to_numpy().reshape((-1,1)))

# Set random seed
set_random_seed(14)

# Train-test split
train_df, test_df = train_test_split(df, test_size=0.2, random_state=14)

# Define target and feature sets
y_train = train_df.pop('Amtiaz')
y_test = test_df.pop('Amtiaz')
x_train = train_df
x_test = test_df

### Model Specification

In [158]:
# Hyperparameters
L1 = 60  # nodes in layer 1
L2 = 30  # nodes in layer 2
L3 = 5  # nodes in layer 2
learning_rate = 0.001  # upper limit for Adam optimizer
lmbda = 0.0001

# Define the model
model = tf.keras.Sequential([
    tf.keras.layers.Input(shape =(x_train.shape[1],)),
    tf.keras.layers.Dense(units=L1, activation='relu'),
    tf.keras.layers.Dense(units=L2, activation='relu'),
    tf.keras.layers.Dense(units=L3, activation='relu'),
    tf.keras.layers.Dense(units=1)  # Regression task
])

# Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
mse = tf.keras.losses.MeanSquaredError()
mae = tf.keras.losses.MeanAbsoluteError()
model.compile(optimizer=optimizer, loss=mse, metrics=['mae'])


model.summary()


In [160]:
# Train the model
history = model.fit(
    x_train, y_train,
    validation_data=(x_test, y_test),
    epochs=50,  # You can increase epochs for better performance
    batch_size=8
)

# Evaluate the model
test_loss, test_mae = model.evaluate(x_test, y_test)
print(f"Test Loss: {test_loss:.4f}, Test MAE: {test_mae:.4f}")


Epoch 1/50
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 1.0153 - mae: 0.7804 - val_loss: 2.5198 - val_mae: 1.4948
Epoch 2/50
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1.0303 - mae: 0.7883 - val_loss: 2.6686 - val_mae: 1.5433
Epoch 3/50
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1.0426 - mae: 0.7959 - val_loss: 2.7695 - val_mae: 1.5746
Epoch 4/50
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step - loss: 1.0536 - mae: 0.8015 - val_loss: 2.7837 - val_mae: 1.5779
Epoch 5/50
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1.0589 - mae: 0.8035 - val_loss: 2.7052 - val_mae: 1.5519
Epoch 6/50
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1.0592 - mae: 0.8012 - val_loss: 2.6014 - val_mae: 1.5170
Epoch 7/50
[1m64/64[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 4ms/step - loss: 1.0627 -

This at last is the minimum MSE obtained. I changed `L1`, `L2`, `L3`, `learning_rate` by hand, no grid search for tuning. 

In [161]:
from sklearn.metrics import r2_score

def adjusted_r2(y_true, y_pred, n, p):
    r2 = r2_score(y_true, y_pred)
    return 1 - ((1 - r2) * (n - 1) / (n - p - 1))

# Example usage:
y_pred = model.predict(x_test)
r2 = r2_score(y_test, y_pred)
adj_r2 = adjusted_r2(y_test, y_pred, n=x_test.shape[0], p=x_test.shape[1])

print(f"R²: {r2:.4f}")
print(f"Adjusted R²: {adj_r2:.4f}")

[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 11ms/step 
R²: 0.0409
Adjusted R²: -2.2054


Surely you are joking Mr.Feynman! This means what our model is predicting is no better than predicting the mean value! let's examine `y_pred`

In [165]:
y_pred = y_pred.reshape(-1, 1)  # Reshape to (n_samples, 1)

# Inverse transform to get original scale
yhat = y_scaler.inverse_transform(y_pred)
y = y_scaler.inverse_transform(y_test.to_numpy().reshape(-1, 1))  # Reshape y_test to (n_samples, 1)

# Print first 10 predictions vs actual values
for i, j in zip(y[:20], yhat[:20]):
    print(f"Actual: {i}, Predicted: {j}")


Actual: [96.], Predicted: [83.22892]
Actual: [90.], Predicted: [91.00413]
Actual: [84.], Predicted: [77.41638]
Actual: [75.], Predicted: [80.172226]
Actual: [94.], Predicted: [78.40123]
Actual: [85.], Predicted: [87.41065]
Actual: [86.], Predicted: [83.06726]
Actual: [78.], Predicted: [79.49491]
Actual: [89.], Predicted: [86.618195]
Actual: [88.], Predicted: [87.37692]
Actual: [92.], Predicted: [75.05681]
Actual: [96.], Predicted: [91.544205]
Actual: [70.], Predicted: [74.79234]
Actual: [83.], Predicted: [80.23523]
Actual: [86.], Predicted: [76.48962]
Actual: [84.], Predicted: [84.60321]
Actual: [82.], Predicted: [76.87033]
Actual: [87.], Predicted: [80.84146]
Actual: [75.], Predicted: [81.8219]
Actual: [88.], Predicted: [82.12972]


### Hyperparameter tuning

we would like to proceed but `GridSearchCV` from `sklearn` has some compatibility issues with recent versions of t

In [166]:
from tensorflow.keras import regularizers
from sklearn.model_selection import GridSearchCV
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor

# Define the model
def create_model(l1=0.01, l2=0.01, learning_rate=0.001):
    model = tf.keras.Sequential([
        tf.keras.layers.Input(shape=(x_train.shape[1],)),
        tf.keras.layers.Dense(128, activation='relu', 
                              kernel_regularizer=regularizers.l1_l2(l1=l1, l2=l2)),
        tf.keras.layers.Dense(64, activation='relu'),
        tf.keras.layers.Dense(1)
    ])
    
    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=optimizer, loss='mse', metrics=['mae'])
    
    return model

# Wrap the model for use with GridSearchCV
model = KerasRegressor(build_fn=create_model, epochs=50, batch_size=32, verbose=0)

# Define grid search parameters
param_grid = {
    'l1': [0.0001, 0.001, 0.01, 0.1],
    'l2': [0.0001, 0.001, 0.01, 0.1],
    'learning_rate': [0.0001, 0.001, 0.01]
}

# GridSearchCV to tune hyperparameters
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=1, cv=3)
grid_result = grid.fit(x_train, y_train)

# Get the best parameters and performance
print("Best parameters found: ", grid_result.best_params_)
print("Best cross-validation score: ", grid_result.best_score_)


ModuleNotFoundError: No module named 'tensorflow.keras.wrappers.scikit_learn'