Training the model:
- Before training the model, clubbing all the output label as 'others' for which that has less than 300 records in the souce data
- This is to improve the overall accuracy and also each output labels' precision

In [165]:
import numpy as np
from PIL import Image
import pandas as pd
import re
from sklearn.decomposition import PCA
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV, train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from transformers import BlipModel, BlipProcessor, ViTModel, ViTFeatureExtractor, BertTokenizer, BertModel
from transformers import ViTModel, ViTImageProcessor, BertTokenizer, BertModel

# Reading the pre-processed source data from csv into a DF
data_with_genreAndPosters = pd.read_csv("/Users/ajayrahulraja/Library/CloudStorage/OneDrive-UniversityofHertfordshire/Data Science Project/Data_Science_Project/data_science_project/Code/dataset_with_oneGenre_nonNull_features.csv")
data_with_genreAndPosters.head(2)

Unnamed: 0.1,Unnamed: 0,id,original_title,overview,tagline,genres,budget,popularity,release_date,genre_label,poster_paths
0,4,11862,Father of the Bride Part II,"Just when George Banks has recovered from his daughter's wedding, he receives the news that she's pregnant ... and that George's wife, Nina, is expecting too. He was planning on selling their home, but that's a plan that -- like George -- will have to change with the arrival of both a grandchild and a kid of his own.",Just When His World Is Back To Normal... He's In For The Surprise Of His Life!,"[{'id': 35, 'name': 'Comedy'}]",0,8.387519,1995-02-10,Comedy,/Users/ajayrahulraja/Library/CloudStorage/OneDrive-UniversityofHertfordshire/Data Science Project/Data_Science_Project/data_science_project/Dataset/downloaded_posters_oneGenre/Father of the Bride Part II.jpg
1,25,16420,Othello,The evil Iago pretends to be friend of Othello in order to manipulate him to serve his own end in the film version of this Shakespeare classic.,"Envy, greed, jealousy and love.","[{'id': 18, 'name': 'Drama'}]",0,1.845899,1995-12-15,Drama,/Users/ajayrahulraja/Library/CloudStorage/OneDrive-UniversityofHertfordshire/Data Science Project/Data_Science_Project/data_science_project/Dataset/downloaded_posters_oneGenre/Othello.jpg


In [166]:
# Poster_paths - this column has the downloaded movie posters' paths.
# The poster paths are local system's file location - as each movie's images are downloaded already on the local system

# 'Poster Paths' column have characters that does not make the location a valid location
data_with_genreAndPosters['poster_paths'] = data_with_genreAndPosters['poster_paths'].str.replace(':', '_').str.replace('?', '_').str.replace('*', '_')

  data_with_genreAndPosters['poster_paths'] = data_with_genreAndPosters['poster_paths'].str.replace(':', '_').str.replace('?', '_').str.replace('*', '_')


In [None]:
# To get a better view on the notebook - setting the column width to maximum 
pd.set_option('max_colwidth', 2000)

In [168]:
## Replacing if the poster path still has characters that should not be

# data_with_genreAndPosters.loc[
#     data_with_genreAndPosters['poster_paths'].str.contains('Where Is Parsifal?'), 
#     'poster_paths'
# ] = data_with_genreAndPosters.loc[
#     data_with_genreAndPosters['poster_paths'].str.contains('Where Is Parsifal?'), 
#     'poster_paths'
# ].str.replace('?', '_')

In [169]:
# Splitting the data into 80-20 for test and train
train, test = train_test_split(data_with_genreAndPosters, test_size=0.2)
train, val = train_test_split(train, test_size=0.1)
print(type(train), type(test), type(val))

# splitting the processed data df into test, train and validation
val.to_csv("/Users/ajayrahulraja/Library/CloudStorage/OneDrive-UniversityofHertfordshire/Data Science Project/Data_Science_Project/data_science_project/Code/val_data.csv")
test.to_csv("/Users/ajayrahulraja/Library/CloudStorage/OneDrive-UniversityofHertfordshire/Data Science Project/Data_Science_Project/data_science_project/Code/test_data.csv")
train.to_csv("/Users/ajayrahulraja/Library/CloudStorage/OneDrive-UniversityofHertfordshire/Data Science Project/Data_Science_Project/data_science_project/Code/train_data.csv")

<class 'pandas.core.frame.DataFrame'> <class 'pandas.core.frame.DataFrame'> <class 'pandas.core.frame.DataFrame'>


In [170]:
# Fixing a threshold value of 300 and anything under this threshold will be labels as 'others'
genreLables_under300 = ['History', 'War', 'Fantasy', 'Mystery', 'Family', 'Music', 'Adventure', 'Romance', \
                        'Crime', 'Science Fiction', 'Animation']
data_with_genreAndPosters['genre_label_with_threshold'] = data_with_genreAndPosters['genre_label'].apply(lambda x: 'Others' if x in genreLables_under300 else x)

In [171]:
# checking the new output labels and their counts - after applying threshold
data_with_genreAndPosters.groupby('genre_label_with_threshold').count()

Unnamed: 0_level_0,Unnamed: 0,id,original_title,overview,tagline,genres,budget,popularity,release_date,genre_label,poster_paths
genre_label_with_threshold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
Action,318,318,318,318,135,318,318,318,318,318,318
Comedy,3429,3429,3429,3429,1279,3429,3429,3429,3429,3429,3429
Documentary,2589,2589,2589,2589,749,2589,2589,2589,2585,2589,2589
Drama,5229,5229,5229,5229,1667,5229,5229,5229,5227,5229,5229
Horror,1014,1014,1014,1014,677,1014,1014,1014,1014,1014,1014
Others,1172,1172,1172,1172,400,1172,1172,1172,1172,1172,1172
Thriller,486,486,486,486,221,486,486,486,486,486,486
Western,323,323,323,323,202,323,323,323,323,323,323


In [172]:
# cleaning the text feature before embedding and training
def clean_text(text):
    text = text.lower()
    text = re.sub(r'-', ' ', text)
    text = re.sub(r'[^\w\s\.!?]', '', text)
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Loading train, test and val from preprocessed data 'i.e., from preprocessing_movieWithOneGenre.ipynb notebook' of 80-20 ratio
val = pd.read_csv("/Users/ajayrahulraja/Library/CloudStorage/OneDrive-UniversityofHertfordshire/Data Science Project/Data_Science_Project/data_science_project/Code/val_data.csv")
test = pd.read_csv("/Users/ajayrahulraja/Library/CloudStorage/OneDrive-UniversityofHertfordshire/Data Science Project/Data_Science_Project/data_science_project/Code/test_data.csv")
train = pd.read_csv("/Users/ajayrahulraja/Library/CloudStorage/OneDrive-UniversityofHertfordshire/Data Science Project/Data_Science_Project/data_science_project/Code/train_data.csv")

# applying the threshold for train, test and vall
genreLables_under300 = ['History', 'War', 'Fantasy', 'Mystery', 'Family', 'Music', 'Adventure', \
            'Romance', 'Crime', 'Science Fiction', 'Animation']
val['genre_label_with_threshold'] = val['genre_label'].apply(lambda x: 'Others' if x in genreLables_under300 else x)
test['genre_label_with_threshold'] = test['genre_label'].apply(lambda x: 'Others' if x in genreLables_under300 else x)
train['genre_label_with_threshold'] = train['genre_label'].apply(lambda x: 'Others' if x in genreLables_under300 else x)

# textual feature of train, test and val are being cleaned - before extracting the feature & training the model
val['cleaned_overview'] = val['overview'].apply(clean_text)
test['cleaned_overview'] = test['overview'].apply(clean_text)
train['cleaned_overview'] = train['overview'].apply(clean_text)

In [None]:
# Resizing the visual input feature
def resize_image(image_path):
    try:
        image = Image.open(image_path).convert("RGB")
        image = image.resize((224, 224))
        return np.array(image)
    except Exception as e:
        print(f"Error with image {image_path}: {e}")
        return None

# Function to extract visual features using ViT
def vit_features(image_list, model, processor):
    feature_list = []
    valid_index = []
    for index, image_path in enumerate(image_list):
        image = resize_image(image_path)
        if image is None:
            continue
        inputs = processor(images=image, return_tensors="pt")
        outputs = model(**inputs)

        visual_features = outputs.last_hidden_state.mean(dim=1).detach().numpy()
        feature_list.append(visual_features)
        valid_indices.append(index)

        print(f"Extracted feature shape: {features.shape}")

    if feature_list:
        return np.vstack(feature_list), valid_index
    else:
        return np.array([]), []

# Function to extract textual features using BERT
def bert_features(text_list, valid_indices, tokenizer, model):
    feature_list = []
    for index in valid_indices:
        text = text_list[index]
        if not isinstance(text, str):
            print(f"Invalid text: {text}")
            continue
        inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True)
        outputs = model(**inputs)

        textual_features = outputs.last_hidden_state.mean(dim=1).detach().numpy()
        feature_list.append(textual_features)

    if feature_list:
        return np.vstack(feature_list)
    else:
        return np.array([])

In [132]:
# Extracting visual features from train dataset and test dataset
vit_features_train, valid_indices_train = vit_features(train['poster_paths'].tolist(), vit_model, vit_processor)
vit_features_test, valid_indices_test = vit_features(test['poster_paths'].tolist(), vit_model, vit_processor)
print(f'ViT features (train): {vit_features_train.shape}')
print(f'ViT features (test): {vit_features_test.shape}')

# Extracting textual features from train dataset and test dataset
bert_features_train = bert_features(train['cleaned_overview'].tolist(), valid_indices_train, bert_tokenizer, bert_model)
bert_features_test = bert_features(test['cleaned_overview'].tolist(), valid_indices_test, bert_tokenizer, bert_model)
print(f'BERT features (train): {bert_features_train.shape}')
print(f'BERT features (test): {bert_features_test.shape}')

# Combining features together
if vit_features_train.size > 0 and bert_features_train.size > 0:
    feature_train = np.hstack((vit_features_train, bert_features_train))
    feature_test = np.hstack((vit_features_test, bert_features_test))
else:
    feature_train = np.array([])
    feature_test = np.array([])

# Using classifier to get the prediction
if ('genre_label_with_threshold' in test.columns) and ('genre_label_with_threshold' in train.columns):
    feature_label_test = test['genre_label_with_threshold'].iloc[valid_indices_test]
    feature_label_train = train['genre_label_with_threshold'].iloc[valid_indices_train]

    if (feature_test.size > 0) and (feature_train.size > 0):
        classifier = SVC()
        classifier.fit(feature_train, feature_label_train)
        prediction_result = classifier.predict(feature_test)
        accuracy = accuracy_score(feature_label_test, prediction_result)
        print(f'Accuracy: {accuracy}')
        test['predicted_genre'] = np.nan
        test.loc[valid_indices_test, 'predicted_genre'] = prediction_result
    else:
        print("Empty training and test data after extracting the features.")
else:
    print("Output labels (genre_label_with_threshold) are not found in the dataset.")

Extracted feature shape: (1, 768)
Extracted feature shape: (1, 768)
Extracted feature shape: (1, 768)
Extracted feature shape: (1, 768)
Extracted feature shape: (1, 768)
Extracted feature shape: (1, 768)
Extracted feature shape: (1, 768)
Extracted feature shape: (1, 768)
Extracted feature shape: (1, 768)
Extracted feature shape: (1, 768)
Extracted feature shape: (1, 768)
Extracted feature shape: (1, 768)
Extracted feature shape: (1, 768)
Extracted feature shape: (1, 768)
Extracted feature shape: (1, 768)
Extracted feature shape: (1, 768)
Extracted feature shape: (1, 768)
Extracted feature shape: (1, 768)
Extracted feature shape: (1, 768)
Extracted feature shape: (1, 768)
Extracted feature shape: (1, 768)
Extracted feature shape: (1, 768)
Extracted feature shape: (1, 768)
Extracted feature shape: (1, 768)
Extracted feature shape: (1, 768)
Extracted feature shape: (1, 768)
Extracted feature shape: (1, 768)
Extracted feature shape: (1, 768)
Extracted feature shape: (1, 768)
Extracted feat

In [133]:
print('The total number of records in test: ', len(test))
test_subset = test[['id', 'original_title', 'genre_label', 'predicted_genre']]
test_subset.loc[test_subset['genre_label'] != test_subset['predicted_genre']]

The total number of records in test:  2912


Unnamed: 0,id,original_title,genre_label,predicted_genre
4,160859,Elena,Documentary,Drama
9,18281,Side Street,Crime,Drama
13,38008,Ghosts of Cité Soleil,Documentary,Action
15,101858,The Movies,Comedy,Drama
16,35418,Nightmare,Crime,Drama
...,...,...,...,...
2886,37820,Death in Brunswick,Drama,Comedy
2887,5168,Être et avoir,Documentary,Drama
2899,41486,子連れ狼 地獄へ行くぞ!大五郎,Action,Others
2907,4256,Scary Movie 3,Comedy,Others


In [134]:
test_subset.loc[test_subset['genre_label'] == test_subset['predicted_genre']]

Unnamed: 0,id,original_title,genre_label,predicted_genre
0,128206,Shahid,Drama,Drama
1,359108,Viva,Drama,Drama
2,42495,King Lear,Drama,Drama
3,42612,Medium Cool,Drama,Drama
5,199291,The Dark Matter of Love,Documentary,Documentary
...,...,...,...,...
2905,760,Les aventures de Rabbi Jacob,Comedy,Comedy
2906,73311,Szelíd teremtés - A Frankenstein-terv,Drama,Drama
2908,13636,Bigger Stronger Faster*,Documentary,Documentary
2909,83479,Ultrasuede: In Search of Halston,Documentary,Documentary


* Final try of training the model to improve the accuracy with precision scores *

In [144]:
# Loading train, test and val from preprocessed data 'i.e., from preprocessing_movieWithOneGenre.ipynb notebook' of 80-20 ratio
val = pd.read_csv("/Users/ajayrahulraja/Library/CloudStorage/OneDrive-UniversityofHertfordshire/Data Science Project/Data_Science_Project/data_science_project/Code/val_data.csv")
test = pd.read_csv("/Users/ajayrahulraja/Library/CloudStorage/OneDrive-UniversityofHertfordshire/Data Science Project/Data_Science_Project/data_science_project/Code/test_data.csv")
train = pd.read_csv("/Users/ajayrahulraja/Library/CloudStorage/OneDrive-UniversityofHertfordshire/Data Science Project/Data_Science_Project/data_science_project/Code/train_data.csv")

# Threshold is 300. Meaning output labels under 300 records is classified as 'Others'
genreLables_under300 = ['History', 'War', 'Fantasy', 'Mystery', 'Family', 'Music', 'Adventure',
                        'Romance', 'Crime', 'Science Fiction', 'Animation']
val['genre_label_with_threshold'] = val['genre_label'].apply(lambda x: 'Others' if x in genreLables_under300 else x)
test['genre_label_with_threshold'] = test['genre_label'].apply(lambda x: 'Others' if x in genreLables_under300 else x)
train['genre_label_with_threshold'] = train['genre_label'].apply(lambda x: 'Others' if x in genreLables_under300 else x)

# Combining the features together
if vit_features_train.size > 0 and bert_features_train.size > 0:
    feature_train = np.hstack((vit_features_train, bert_features_train))
    feature_test = np.hstack((vit_features_test, bert_features_test))
else:
    feature_train = np.array([])
    feature_test = np.array([])

feature_label_train = train['genre_label_with_threshold'].iloc[valid_indices_train].values
feature_label_test = test['genre_label_with_threshold'].iloc[valid_indices_test].values

if (feature_test.size > 0) and (feature_train.size > 0):
    # Standardizing both the features - to have 'mean' of 0 and a 'standard deviation' of 1.
    scaler = StandardScaler()
    feature_train_scaled = scaler.fit_transform(feature_train)
    feature_test_scaled = scaler.transform(feature_test)
    # Standardization is done - so that the model receives data that is consistent in scale, which can lead to better and more reliable model performance
    
    # Applying PCA to reduce dimensionality
    pca = PCA(n_components=0.95)  # 95% of the variance
    feature_train_pca = pca.fit_transform(feature_train_scaled)
    feature_test_pca = pca.transform(feature_test_scaled)
    
    print(f'PCA features (train): {feature_train_pca.shape}')
    print(f'PCA features (test): {feature_test_pca.shape}')
    
    # using Grid Search to find the optimal value for hyperparameter tuning - applying the default values
    param_grid = {
        'C': [0.1, 1, 10], # regularization parameter - trade-off between maximizing the margin (distance between the decision boundary and the closest data points) and minimizing classification error
        'gamma': [1, 0.1, 0.01], # kernel coefficient for non-linear hyperplanes
        'kernel': ['rbf', 'linear'] # type of decision boundary used
    }
    grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=2, cv=3, n_jobs=-1)
    grid.fit(feature_train_pca, feature_label_train)
    print(f"Best parameters: {grid.best_params_}")
    
    # Predicting genres using Classifier
    classifier = grid.best_estimator_
    prediction_result = classifier.predict(feature_test_pca)
    accuracy = accuracy_score(feature_label_test, prediction_result)
    print(f'Accuracy: {accuracy}')
    test['predicted_genre'] = np.nan
    test.loc[valid_indices_test, 'predicted_genre'] = prediction_result
else:
    print("Empty training and test data after extracting the features.")


PCA features (train): (10479, 849)
PCA features (test): (2912, 849)
Fitting 3 folds for each of 18 candidates, totalling 54 fits
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time= 1.0min
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time= 1.1min
[CV] END ......................C=0.1, gamma=1, kernel=linear; total time= 1.1min
[CV] END ....................C=0.1, gamma=0.1, kernel=linear; total time= 1.0min
[CV] END ....................C=0.1, gamma=0.1, kernel=linear; total time= 1.1min
[CV] END ....................C=0.1, gamma=0.1, kernel=linear; total time= 1.1min
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 3.4min
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 3.4min
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf; total time= 3.4min
[CV] END .........................C=0.1, gamma=1, kernel=rbf; total time= 3.4min
[CV] END .......................C=0.1, gamma=0.1, kernel=rbf;

In [145]:
# Standardizing the features using scaler
scaler = StandardScaler()
feature_train_scaled = scaler.fit_transform(feature_train)
feature_test_scaled = scaler.transform(feature_test)

# using Grid Search for hyperparameter tuning - using common ranges for SVM
param_grid = {
    'C': [0.1, 1, 10], # Regularization parameter
    'gamma': ['scale', 'auto'], # Kernel coefficient for 'rbf', 'poly' and 'sigmoid'
    'kernel': ['linear', 'rbf', 'poly', 'sigmoid'] # kernel type to be used in the algorithm
}

# Initializing GridSearchCV with SVC
grid_search = GridSearchCV(SVC(), param_grid, refit=True, verbose=2, cv=2, n_jobs=-1)
grid_search.fit(feature_train_scaled, feature_label_train)
print(f"Best parameters: {grid_search.best_params_}")

# Prediction using the best model
best_model = grid_search.best_estimator_
prediction_result = best_model.predict(feature_test_scaled)

# Calculating and printing various metrics
accuracy = accuracy_score(feature_label_test, prediction_result)
precision = precision_score(feature_label_test, prediction_result, average='weighted')
recall = recall_score(feature_label_test, prediction_result, average='weighted')
f1 = f1_score(feature_label_test, prediction_result, average='weighted')

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1 Score: {f1}")

# Detailed classification report
print(classification_report(feature_label_test, prediction_result))

# Confusion matrix
conf_matrix = confusion_matrix(feature_label_test, prediction_result)
print(conf_matrix)

# Cross-validation scores for robustness
cv_scores = cross_val_score(best_model, feature_train_scaled, feature_label_train, cv=2, scoring='accuracy', n_jobs=-1)
print(f"Cross-validation accuracy scores: {cv_scores}")
print(f"Mean cross-validation accuracy: {cv_scores.mean()}")


Fitting 2 folds for each of 24 candidates, totalling 48 fits
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time= 1.3min
[CV] END ..................C=0.1, gamma=scale, kernel=linear; total time= 1.3min
[CV] END .................C=0.1, gamma=scale, kernel=sigmoid; total time= 1.7min
[CV] END .................C=0.1, gamma=scale, kernel=sigmoid; total time= 1.7min
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time= 2.5min
[CV] END ...................C=0.1, gamma=auto, kernel=linear; total time= 1.2min
[CV] END .....................C=0.1, gamma=scale, kernel=rbf; total time= 2.5min
[CV] END ...................C=0.1, gamma=auto, kernel=linear; total time= 1.2min
[CV] END ....................C=0.1, gamma=scale, kernel=poly; total time= 3.2min
[CV] END ....................C=0.1, gamma=scale, kernel=poly; total time= 3.3min
[CV] END ..................C=0.1, gamma=auto, kernel=sigmoid; total time= 1.7min
[CV] END ......................C=0.1, gamma=auto