In [8]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error, r2_score, f1_score, classification_report
from sklearn.multiclass import OneVsRestClassifier
from sklearn.linear_model import Ridge, LogisticRegression
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re
import joblib
import warnings

# Suppress warnings for cleaner output
warnings.filterwarnings("ignore")

# Download necessary NLTK data
nltk.download(['stopwords', 'wordnet'], quiet=True)
STOP_WORDS = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    """
    Preprocess the input text by:
    - Lowercasing
    - Removing URLs, mentions, hashtags, digits, and non-alphanumeric characters
    - Tokenizing
    - Removing stopwords
    - Lemmatizing
    """
    text = re.sub(r'http\S+|www\S+|https\S+|\@\w+|\#|\d+|[^\w\s]', '', str(text).lower())
    tokens = [lemmatizer.lemmatize(word) for word in text.split() if word not in STOP_WORDS]
    return ' '.join(tokens)

def predict_rating_and_genres(combined_text):
    """
    Predict rating and genres based on the combined title and description.
    """
    # Load the saved models and MultiLabelBinarizer
    rating_model = joblib.load('rating_model.joblib')
    genre_model = joblib.load('genre_model.joblib')

    # Preprocess input text
    combined_processed = preprocess_text(combined_text)

    # Predict rating
    rating = rating_model.predict([combined_processed])[0]

    # Predict genre probabilities
    genre_proba = genre_model.decision_function([combined_processed])

    # Define a probability threshold
    threshold = 0.5

    # Apply threshold to get genre predictions
    genres = mlb.classes_[genre_proba[0] >= threshold]

    # If no genres meet the threshold, select top 3
    if len(genres) == 0:
        top_indices = np.argsort(genre_proba[0])[::-1][:3]
        genres = mlb.classes_[top_indices]

    # Convert to list
    genres = genres.tolist()

    return rating, genres

def load_and_preprocess_data(file_path):
    """
    Load the dataset from a CSV file and preprocess the text fields.
    """
    df = pd.read_csv(file_path)

    # Rename columns to lowercase
    df = df.rename(columns={
        'Title': 'title',
        'Description': 'description',
        'Rating': 'rating',
        'Genres': 'genres'
    })

    required_columns = ['title', 'description', 'rating', 'genres']
    for col in required_columns:
        if col not in df.columns:
            raise ValueError(f"Required column '{col}' not found in the dataset. Please check your CSV file.")

    # Handle missing values
    df['title'] = df['title'].fillna('')
    df['description'] = df['description'].fillna('')
    df['rating'] = pd.to_numeric(df['rating'], errors='coerce')
    df['genres'] = df['genres'].fillna('').apply(
        lambda x: [genre.strip() for genre in str(x).split(',') if genre.strip()]
    )

    # Remove rows with NaN ratings
    df = df.dropna(subset=['rating'])

    # Preprocess text fields and combine title and description
    df['combined_text'] = (df['title'] + ' ' + df['description']).apply(preprocess_text)

    return df

def create_text_pipeline(model):
    """
    Create a text processing and modeling pipeline.
    """
    return Pipeline([
        ('tfidf', TfidfVectorizer(max_features=5000, ngram_range=(1, 2))),
        ('model', model)
    ])

def train_and_evaluate_model(X, y, model_type, model_name, mlb=None):
    """
    Train the model and evaluate its performance.
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    pipeline = create_text_pipeline(model_type)
    pipeline.fit(X_train, y_train)

    y_pred = pipeline.predict(X_test)

    if isinstance(model_type, Ridge):
        mse = mean_squared_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        print(f"Evaluation for {model_name}: Mean Squared Error: {mse:.4f}, R2 Score: {r2:.4f}")
    else:
        if mlb:
            print(f"Evaluation for {model_name}: F1 Score (Micro): {f1_score(y_test, y_pred, average='micro'):.4f}, F1 Score (Macro): {f1_score(y_test, y_pred, average='macro'):.4f}")
            print("\nClassification Report:")
            print(classification_report(y_test, y_pred, target_names=mlb.classes_, zero_division=0))
        else:
            print("Multi-label classification evaluation requires a MultiLabelBinarizer.")

    return pipeline

if __name__ == "__main__":
    try:
        # Load and preprocess data
        df = load_and_preprocess_data('16k_Movies.csv')

        # Prepare labels for genre prediction
        mlb = MultiLabelBinarizer()
        genre_labels = mlb.fit_transform(df['genres'])

        # Train Rating Model using combined_text
        rating_model = train_and_evaluate_model(
            df['combined_text'],
            df['rating'],
            Ridge(alpha=1.0),
            "Rating Model (Combined Title & Description)"
        )

        # Train Genre Model using combined_text with OneVsRestClassifier
        genre_classifier = OneVsRestClassifier(
            LogisticRegression(C=1.0, solver='saga', max_iter=1000, n_jobs=-1)
        )

        genre_model = train_and_evaluate_model(
            df['combined_text'],
            genre_labels,
            genre_classifier,
            "Genre Model (Combined Title & Description)",
            mlb=mlb
        )

        # Save the trained models
        joblib.dump(rating_model, 'rating_model.joblib')
        joblib.dump(genre_model, 'genre_model.joblib')

    except Exception as e:
        print(f"An error occurred: {str(e)}")

Evaluation for Rating Model (Combined Title & Description): Mean Squared Error: 2.1587, R2 Score: -0.0632
Evaluation for Genre Model (Combined Title & Description): F1 Score (Micro): 0.5350, F1 Score (Macro): 0.2784

Classification Report:
              precision    recall  f1-score   support

      Action       0.88      0.40      0.55       451
   Adventure       0.82      0.24      0.37       333
   Animation       1.00      0.02      0.04       101
   Biography       0.74      0.09      0.16       227
      Comedy       0.81      0.50      0.62       847
       Crime       0.79      0.34      0.47       438
 Documentary       0.92      0.33      0.48       252
       Drama       0.75      0.86      0.80      1506
      Family       1.00      0.12      0.21       194
     Fantasy       0.97      0.14      0.25       269
   Film-Noir       0.00      0.00      0.00         0
     History       0.00      0.00      0.00       147
      Horror       0.91      0.24      0.38       300
   

In [9]:
def run_custom_inference(title, description):
    """
    Run custom inference to predict rating and genres.
    """
    combined_text = title + " " + description
    rating, genres = predict_rating_and_genres(combined_text)
    return rating, genres

print("\n")
print(f"Real Movie Test:")
# Real Movie Example 1
title = "The Godfather"
description = "The aging patriarch of an organized crime dynasty transfers control of his clandestine empire to his reluctant son."
rating, genres = run_custom_inference(title, description)
print(f"Predicted Rating for '{title}': {rating}")
print(f"Predicted Genres for '{title}': {genres}")

# Real Movie Example 2
title = "Titanic"
description = "A young man and woman from different social classes fall in love aboard the ill-fated voyage of the RMS Titanic."
rating, genres = run_custom_inference(title, description)
print(f"Predicted Rating for '{title}': {rating}")
print(f"Predicted Genres for '{title}': {genres}")

print("\n")
print(f"Fake Movie Test:")
# Fake Movie Example 1
title = "Shadow Man"
description = "This psychological horror is super spooky, diving deep into the mind of a man haunted by his past and an entity that follows him everywhere."
rating, genres = run_custom_inference(title, description)
print(f"Predicted Rating for '{title}': {rating}")
print(f"Predicted Genres for '{title}': {genres}")

# Fake Movie Example 2
title = "Galactic Wars"
description = "In the distant future, humanity fights for survival in an intergalactic war against an ancient alien species determined to wipe out all life in the galaxy."
rating, genres = run_custom_inference(title, description)
print(f"Predicted Rating for '{title}': {rating}")
print(f"Predicted Genres for '{title}': {genres}")

# Fake Movie Example 3
title = "The Lost City of Andoria"
description = "A team of archaeologists embarks on a treacherous journey to uncover a long-lost civilization hidden deep within the uncharted forests of South America."
rating, genres = run_custom_inference(title, description)
print(f"Predicted Rating for '{title}': {rating}")
print(f"Predicted Genres for '{title}': {genres}")

# Fake Movie Example 4
title = "Cyber Revolution"
description = "In a world dominated by AI, a group of renegade hackers attempt to dismantle the oppressive global system run by superintelligent machines."
rating, genres = run_custom_inference(title, description)
print(f"Predicted Rating for '{title}': {rating}")
print(f"Predicted Genres for '{title}': {genres}")





Real Movie Test:
Predicted Rating for 'The Godfather': 7.563060124019376
Predicted Genres for 'The Godfather': ['Drama']
Predicted Rating for 'Titanic': 7.646985307405162
Predicted Genres for 'Titanic': ['Drama', 'Romance']


Fake Movie Test:
Predicted Rating for 'Shadow Man': 6.340331114215586
Predicted Genres for 'Shadow Man': ['Thriller', 'Horror', 'Mystery']
Predicted Rating for 'Galactic Wars': 6.887879120924816
Predicted Genres for 'Galactic Wars': ['Action', 'Sci-Fi']
Predicted Rating for 'The Lost City of Andoria': 6.198276867857527
Predicted Genres for 'The Lost City of Andoria': ['Thriller', 'Adventure', 'Drama']
Predicted Rating for 'Cyber Revolution': 5.5109351360257195
Predicted Genres for 'Cyber Revolution': ['Sci-Fi', 'Drama', 'Action']
