In [None]:
# Install PySpark and required libraries
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!pip install -q pyspark findspark nltk spacy

# Install English model for SpaCy
!python -m spacy download en_core_web_sm


Collecting en-core-web-sm==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m58.8 MB/s[0m eta [36m0:00:00[0m
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [None]:
# Import required libraries
import ast
import os

import findspark
import joblib
import nltk
import pandas as pd
import pyspark
import spacy
from pyspark.ml.feature import RegexTokenizer, StopWordsRemover
from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, IntegerType, StringType
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MultiLabelBinarizer

In [None]:
# Initialize Spark.
# The session is built directly from the builder so that the memory settings
# are part of the configuration the Spark context is started with.  Creating a
# bare SparkContext first would start Spark with default settings, and
# getOrCreate() would then simply reuse that already-running context.
# getOrCreate() also makes this cell safe to re-run in the same kernel.
findspark.init()
spark = (
    SparkSession.builder
    .appName("GenrePrediction")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.memory", "4g")
    .getOrCreate()
)
# Expose the session's context under the name the notebook has always used.
sc = spark.sparkContext

In [None]:
# Download NLTK stopwords
# The corpus is downloaded first; the `stopwords` reader imported on the next
# line is what the NLTK stopword-removal cell reads its English word list from.
nltk.download("stopwords")
from nltk.corpus import stopwords

# Load SpaCy model
# `nlp` is the shared pipeline object used by `remove_stopwords_spacy` below.
nlp = spacy.load("en_core_web_sm")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
# Load the training set and the genre mapping with pandas, in that order.
train_dframe, map_dframe = (
    pd.read_csv(csv_path)
    for csv_path in ('/content/train.csv', '/content/mapping.csv')
)


# Mirror both pandas frames as Spark DataFrames; the mapping frame is given
# explicit column names.
train_data = spark.createDataFrame(data=train_dframe)
mapping_data = spark.createDataFrame(data=map_dframe, schema=['id', 'genre'])


In [None]:
# Tokenization using RegexTokenizer.
# With gaps=False the pattern describes the tokens themselves (runs of word
# characters) rather than the separators between them.
tokenizer = RegexTokenizer(inputCol="plot", outputCol="ptok", pattern="\\w+", gaps=False)
# Drop a "ptok" column left over from a previous run of this cell (dropping a
# column that does not exist is a no-op) so that re-running the cell does not
# fail because the output column already exists.
train_data = tokenizer.transform(train_data.drop("ptok"))


In [None]:
# Remove stopwords using Spark's StopWordsRemover
spark_remover = StopWordsRemover(inputCol="ptok", outputCol="spark_clr")
# Drop a "spark_clr" column left over from a previous run of this cell
# (a no-op when it is absent) so the cell can be re-run without failing on an
# already-existing output column.
train_data = spark_remover.transform(train_data.drop("spark_clr"))


In [None]:
# NLTK's English stopword list, held as a set for membership tests.
nltk_stopwords = {*stopwords.words("english")}

def remove_stopwords_nltk(tokens):
    """Return the tokens whose lowercased form is not an NLTK English stopword.

    The original casing and order of the surviving tokens are preserved.
    """
    kept = []
    for word in tokens:
        if word.lower() in nltk_stopwords:
            continue
        kept.append(word)
    return kept


In [None]:
# Register UDF for NLTK stopword removal.
# remove_stopwords_nltk returns a list of token strings, so the declared
# return type must be an array of strings; declaring an array of integers
# does not match the values the function produces.
nltk_udf = udf(remove_stopwords_nltk, ArrayType(StringType()))
train_data = train_data.withColumn("nltk_clr", nltk_udf(train_data["ptok"]))


In [None]:
# Function to remove stopwords using SpaCy
def remove_stopwords_spacy(tokens):
    """Remove spaCy stopwords from a list of tokens.

    The tokens are joined with single spaces and re-tokenized by spaCy, so the
    returned pieces follow spaCy's tokenization of the joined text.  Only the
    tokenizer is run (``nlp.make_doc``): the two attributes read here,
    ``token.text`` and ``token.is_stop``, are available straight after
    tokenization, so running the rest of the pipeline for every row is
    unnecessary work.

    Args:
        tokens: list of token strings; may be ``None`` or empty.

    Returns:
        list of str: texts of the spaCy tokens that are not stopwords, or an
        empty list when ``tokens`` is ``None`` or empty.
    """
    # A missing token list would otherwise raise inside str.join.
    if not tokens:
        return []
    doc = nlp.make_doc(" ".join(tokens))
    return [token.text for token in doc if not token.is_stop]


In [None]:
# Register UDF for SpaCy stopword removal.
# remove_stopwords_spacy returns a list of token strings, so the declared
# return type must be an array of strings; declaring an array of integers
# does not match the values the function produces.
spacy_udf = udf(remove_stopwords_spacy, ArrayType(StringType()))
train_data = train_data.withColumn("spacy_clr", spacy_udf(train_data["ptok"]))

# Select the "spacy_clr" column for further processing
train_data = train_data.withColumn("final_clr", train_data["spacy_clr"])

# Convert Spark DataFrame to Pandas for further processing
train_data_pd = train_data.select("plot", "genre", "final_clr").toPandas()


In [None]:
# Bag-of-words features: learn a vocabulary capped at 1000 terms from the raw
# plot text, then encode the same plots with it.
vectorizer = CountVectorizer(max_features=1000).fit(train_data_pd.loc[:, "plot"])
X_train = vectorizer.transform(train_data_pd.loc[:, "plot"])


In [None]:
# Genre lookup tables built from the mapping file.
# Number of distinct genre ids.
genre_count = map_dframe["id"].nunique()
# Genre name -> genre id (for a repeated name the last row wins).
mapper_details = dict(zip(map_dframe["genre"], map_dframe["id"]))

def genreMapSplit(mapper_obj):
    """Convert a stringified list of genre names into a sorted list of genre ids.

    Args:
        mapper_obj: text of a bracketed, quoted, comma-separated list of genre
            names (presumably the ``repr`` of a Python list such as
            ``"['Drama', 'Comedy']"`` -- confirm against the training file).
            Non-string values (e.g. a missing cell) give an empty result.

    Returns:
        list: the ids that ``mapper_details`` assigns to the listed names, in
        ascending order.  Names without an entry in ``mapper_details`` are
        skipped rather than recorded as ``None``, which would break both the
        sort and the label binarization that consumes this list.
    """
    if not isinstance(mapper_obj, str):
        return []

    try:
        # Proper literal parsing copes with commas or quotes inside a name.
        parsed = ast.literal_eval(mapper_obj)
        names = [parsed] if isinstance(parsed, str) else list(parsed)
    except (ValueError, SyntaxError, TypeError):
        # Not a valid Python literal: fall back to stripping the outer
        # brackets, splitting on commas and removing one quote character from
        # each end of every piece.
        names = [element.strip()[1:-1] for element in mapper_obj[1:-1].split(",")]

    result = []
    for name in names:
        genre_id = mapper_details.get(name) if isinstance(name, str) else None
        if genre_id is not None:
            result.append(genre_id)
    result.sort()
    return result

# Translate every row's genre string into its sorted list of genre ids.
train_data_pd["mapped_genres"] = train_data_pd["genre"].map(genreMapSplit)


In [None]:
# Convert mapped genres into a multilabel binarized format.
# The class order is pinned to the sorted genre ids from the mapping file so
# that y_train always has one column per mapped genre, in id order, even when
# a genre never occurs in the training data.  Left to infer the classes, the
# binarizer would only create columns for labels that actually occur, and the
# column positions could drift out of step with the per-genre indices used by
# the training, prediction and evaluation cells.
mlb = MultiLabelBinarizer(classes=sorted(map_dframe["id"].unique().tolist()))
y_train = mlb.fit_transform(train_data_pd["mapped_genres"])


In [None]:
# Train and save Logistic Regression models for each genre
model_dir = "lr_models"
os.makedirs(model_dir, exist_ok=True)

# Column i of y_train is used as the label vector of genre i, so the label
# matrix must have exactly one column per genre; otherwise models would be
# trained against the wrong genre (or the loop would run out of columns).
if y_train.shape[1] != genre_count:
    raise ValueError(
        f"y_train has {y_train.shape[1]} label columns but the mapping "
        f"defines {genre_count} genres; columns cannot be matched to genres."
    )

for index in range(genre_count):
    y_train_genre = y_train[:, index]

    # A classifier cannot be fitted when every sample carries the same label.
    # No model file is written in that case; predict_genres already skips
    # genres whose model file is missing.
    if y_train_genre.min() == y_train_genre.max():
        print(f"Skipping genre {index}: training labels contain a single class.")
        continue

    lr_model = LogisticRegression(max_iter=10000)
    lr_model.fit(X_train, y_train_genre)

    # Save the model
    model_path = f"{model_dir}/lr_model_{index}.pkl"
    joblib.dump(lr_model, model_path)
    print(f"Model for genre {index} saved successfully.")

# Logistic Regression Vectorizer
# Persist the very vectorizer that produced X_train instead of fitting a
# second, identically configured one: the saved vocabulary is then guaranteed
# to be the one the models were trained on.  The two names below are kept for
# any later cell that refers to them.
lr_vectorizer = vectorizer
X_train_lr = X_train

# Save the Logistic Regression vectorizer
joblib.dump(lr_vectorizer, "lr_vectorizer.pkl")



Model for genre 0 saved successfully.
Model for genre 1 saved successfully.
Model for genre 2 saved successfully.
Model for genre 3 saved successfully.
Model for genre 4 saved successfully.
Model for genre 5 saved successfully.
Model for genre 6 saved successfully.
Model for genre 7 saved successfully.
Model for genre 8 saved successfully.
Model for genre 9 saved successfully.
Model for genre 10 saved successfully.
Model for genre 11 saved successfully.
Model for genre 12 saved successfully.
Model for genre 13 saved successfully.
Model for genre 14 saved successfully.
Model for genre 15 saved successfully.
Model for genre 16 saved successfully.
Model for genre 17 saved successfully.
Model for genre 18 saved successfully.
Model for genre 19 saved successfully.


['lr_vectorizer.pkl']

In [None]:
# Function to predict genres for a given movie plot
def predict_genres(description):
    """Predict the genre names for a single plot description.

    The description is encoded with the notebook-level ``vectorizer``; then,
    for each genre index below ``genre_count``, the model saved under
    ``model_dir`` is loaded and asked for a prediction.

    Args:
        description: plot text to classify.

    Returns:
        list: names of the genres whose model predicts label 1, in genre-index
        order.  Genres with no saved model file, or with no name in
        ``mapper_details``, are skipped.
    """
    X_input = vectorizer.transform([description])

    # Invert the name -> id mapping once instead of scanning it for every
    # genre.  setdefault keeps the first name seen for an id.
    id_to_genre = {}
    for name, genre_id in mapper_details.items():
        id_to_genre.setdefault(genre_id, name)

    predicted_genres = []
    for index in range(genre_count):
        model_path = f"{model_dir}/lr_model_{index}.pkl"
        if not os.path.exists(model_path):
            continue

        # NOTE: joblib.load unpickles arbitrary objects; only load model files
        # written by this notebook.
        lr_model = joblib.load(model_path)
        if lr_model.predict(X_input)[0] != 1:
            continue

        # An index without a genre name is skipped instead of raising
        # IndexError.
        genre_name = id_to_genre.get(index)
        if genre_name is not None:
            predicted_genres.append(genre_name)

    return predicted_genres


In [None]:
# User input for movie description
# NOTE(review): input() blocks until text is typed, so this cell needs an
# interactive session and its result depends on what the user enters.
movie_plot = input("Enter the movie description (plot): ")

# Predict genres for user input
# predict_genres returns a list of genre names, printed as-is below.
predicted_genres = predict_genres(movie_plot)
print(f"Predicted genres for the movie plot: {predicted_genres}")

Predicted genres for the movie plot: ['Thriller', 'Action', 'Horror', 'Action/Adventure']


In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix
import numpy as np

# Calculate accuracy and confusion matrix for the training data
def evaluate_model(X_train, y_train):
    """Evaluate the saved per-genre models and print the results.

    For every genre index below ``genre_count`` the model stored under
    ``model_dir`` is loaded, column ``index`` of ``y_train`` is taken as the
    true labels, and predictions are made for ``X_train``.  The mean of the
    per-genre accuracies is printed first, followed by one confusion matrix
    per evaluated genre.

    Args:
        X_train: feature matrix accepted by the saved models' ``predict``.
        y_train: 2-D binary label array with one column per genre index.

    Returns:
        None.  All results are printed.
    """
    accuracies = []
    matrices = []  # (genre index, confusion matrix) pairs, in genre order

    for index in range(genre_count):
        # Genres without a saved model are skipped, as predict_genres does.
        model_path = f"{model_dir}/lr_model_{index}.pkl"
        if not os.path.exists(model_path):
            print(f"No saved model for genre {index}; skipping.")
            continue

        y_true = y_train[:, index]
        lr_model = joblib.load(model_path)
        y_pred = lr_model.predict(X_train)

        accuracies.append(accuracy_score(y_true, y_pred))
        # labels=[0, 1] keeps every matrix 2x2 even when one of the classes
        # is absent from both the true labels and the predictions.
        matrices.append((index, confusion_matrix(y_true, y_pred, labels=[0, 1])))

    if not accuracies:
        print("No trained models found; nothing to evaluate.")
        return

    # "Overall" accuracy is the unweighted mean of the per-genre accuracies.
    overall_accuracy = np.mean(accuracies)

    print(f"Overall Accuracy: {overall_accuracy:.4f}")

    # Confusion Matrix for each genre
    for i, cm in matrices:
        print(f"\nConfusion Matrix for Genre {i}:\n{cm}")

# Evaluate the model
# NOTE(review): X_train / y_train are the same arrays the models were fitted
# on, so the printed figures describe fit to the training data rather than
# performance on unseen plots.
evaluate_model(X_train, y_train)
