In [22]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.utils.class_weight import compute_class_weight
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import tensorflow as tf

# Load the training data
train_data = pd.read_csv("train.csv")
validation_data = pd.read_csv("validate.csv")
test_data = pd.read_csv("test.csv")

# Prepare data
X_train = train_data.drop(columns=["genre"])
y_train = train_data["genre"]
X_val = validation_data.drop(columns=["genre"])
y_val = validation_data["genre"]
X_test = test_data.drop(columns=["ID"])

# Normalize input features
scaler = StandardScaler()
X_train_normalized = scaler.fit_transform(X_train)
X_val_normalized = scaler.transform(X_val)

# Define label encoder
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.transform(y_val)

# Compute class weights
class_weights = compute_class_weight(
    class_weight="balanced", classes=np.unique(y_train_encoded), y=y_train_encoded
)


# Define model
def build_model(
    input_shape, output_classes, hidden_layers, hidden_nodes, learning_rate
):
    model = tf.keras.Sequential()
    model.add(
        tf.keras.layers.Dense(hidden_nodes, activation="relu", input_shape=input_shape)
    )
    for _ in range(hidden_layers - 1):
        model.add(tf.keras.layers.Dense(hidden_nodes, activation="relu"))
    model.add(tf.keras.layers.Dense(output_classes, activation="softmax"))

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(
        optimizer=optimizer,
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )
    return model


# Evaluate model
def evaluate_model(model, X_train, y_train, X_val, y_val):
    y_train_pred = np.argmax(model.predict(X_train), axis=1)
    y_val_pred = np.argmax(model.predict(X_val), axis=1)
    y_val_test = np.argmax(model.predict(X_test), axis=1)

    train_accuracy = accuracy_score(y_train, y_train_pred)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    test_accuracy = accuracy_score(y_val, y_val_test)

    print("Accuracy Scores:")
    print(f"Training Accuracy: {train_accuracy}")
    print(f"Validation Accuracy: {val_accuracy}")
    print(f"Test Accuracy: {test_accuracy}")

    print("\nClassification Report:")
    print(classification_report(y_val, y_val_pred))


# Model parameters
input_shape = (X_train.shape[1],)
output_classes = len(label_encoder.classes_)
hidden_layers = 1
hidden_nodes = 64
learning_rate = 0.001
epochs = 100

# Build and train the model
model = build_model(
    input_shape, output_classes, hidden_layers, hidden_nodes, learning_rate
)
history = model.fit(
    X_train,
    y_train_encoded,
    epochs=epochs,
    validation_data=(X_val, y_val_encoded),
    verbose=1,
    # class_weight=dict(enumerate(class_weights)),
)

# Evaluate the model
evaluate_model(model, X_train, y_train_encoded, X_val, y_val_encoded)

# Make predictions on test set
test_predictions = np.argmax(model.predict(X_test), axis=1)

# Create submission DataFrame
submission_df = pd.DataFrame({"ID": test_data["ID"], "label": test_predictions})

# Save submission to CSV
submission_df.to_csv("submission.csv", index=False)

Epoch 1/100


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m601/601[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 601us/step - accuracy: 0.3230 - loss: 1.6156 - val_accuracy: 0.3509 - val_loss: 1.4786
Epoch 2/100
[1m601/601[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 516us/step - accuracy: 0.4683 - loss: 1.3839 - val_accuracy: 0.4412 - val_loss: 1.3090
Epoch 3/100
[1m601/601[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 541us/step - accuracy: 0.5219 - loss: 1.2452 - val_accuracy: 0.5663 - val_loss: 1.1812
Epoch 4/100
[1m601/601[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 551us/step - accuracy: 0.5643 - loss: 1.1588 - val_accuracy: 0.5615 - val_loss: 1.1285
Epoch 5/100
[1m601/601[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 556us/step - accuracy: 0.5886 - loss: 1.1055 - val_accuracy: 0.6084 - val_loss: 1.0708
Epoch 6/100
[1m601/601[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 537us/step - accuracy: 0.5941 - loss: 1.0674 - val_accuracy: 0.5894 - val_loss: 1.0954
Epoch 7/100
[1m601/60

In [23]:
# Load the training data
train_data = pd.read_csv("train.csv")

# Count the occurrences of each genre
genre_counts = train_data["genre"].value_counts()


# Plot the distribution of genres
plt.figure(figsize=(10, 6))
plt.bar(genre_counts.index, genre_counts.values, color="skyblue")
plt.xlabel("Genre")
plt.ylabel("Count")
plt.title("Distribution of Movie Genres")
plt.xticks(genre_counts.index, genre_counts.index.tolist())
plt.show()

NameError: name 'plt' is not defined

Drama and documentary genres are the most common, with 5483 and 4861
occurrences, respectively. Comedy is the third most common genre with 3896
occurrences.Horror, thriller, and action genres have fewer occurrences compared
to drama, documentary, and comedy, with 2104, 1568, and 1312 occurrences,
respectively. The distribution of the dataset is imbalanced, with drama and
documentary genres dominating the dataset, while horror, thriller, and action
genres are underrepresented. This class imbalance can potentially affect the
performance of machine learning models, particularly for genres with fewer
occurrences, as the model may struggle to generalize well to these classes.

In [None]:
validation_data = pd.read_csv("validate.csv")
test_data = pd.read_csv("test.csv")

In [None]:
X_train = train_data.drop(columns=["genre"])
y_train = train_data["genre"]
X_val = validation_data.drop(columns=["genre"])
y_val = validation_data["genre"]
X_test = test_data

In [None]:
# # Normalize input features
# scaler = StandardScaler()
# X_train_normalized = scaler.fit_transform(X_train)
# X_val_normalized = scaler.transform(X_val)

In [None]:
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_val_encoded = label_encoder.transform(y_val)
class_weights = compute_class_weight(
    class_weight="balanced", classes=np.unique(y_train_encoded), y=y_train_encoded
)

In [None]:
# # Tokenize text data to get vocabulary size and maximum sequence length
# tokenizer = Tokenizer()
# tokenizer.fit_on_texts(X_train)
# vocab_size = len(tokenizer.word_index) + 1
# max_seq_length = max([len(text.split()) for text in X_train])

# # Set embedding dimension
# embedding_dim = 100  # Example dimension, you can adjust it based on your preference

In [None]:
def build_model(hidden_layers, hidden_nodes, learning_rate):
    model = tf.keras.Sequential()
    model.add(
        tf.keras.layers.Dense(
            hidden_nodes, activation="relu", input_shape=(X_train.shape[1],)
        )
    )
    for _ in range(hidden_layers - 1):
        model.add(tf.keras.layers.Dense(hidden_nodes, activation="relu"))
    model.add(tf.keras.layers.Dense(len(label_encoder.classes_), activation="softmax"))

    optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(
        optimizer=optimizer,
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )
    return model

In [None]:
def plot_training_history(history, title):
    plt.plot(history.history["loss"], label="Train")
    plt.plot(history.history["val_loss"], label="Validation")
    plt.xlabel("Epoch")
    plt.ylabel("Error")
    plt.title(title)
    plt.legend()
    plt.show()

In [None]:
def evaluate_model(model, X_train, y_train, X_val, y_val):
    y_train_pred_data = model.predict(X_train)
    y_train_pred = np.argmax(y_train_pred_data, axis=1)
    y_val_pred_data = model.predict(X_val)
    y_val_pred = np.argmax(y_val_pred_data, axis=1)

    train_accuracy = accuracy_score(y_train, y_train_pred)
    val_accuracy = accuracy_score(y_val, y_val_pred)

    conf_matrix = confusion_matrix(y_val, y_val_pred)
    class_wise_accuracy = conf_matrix.diagonal() / conf_matrix.sum(axis=1)

    print("Accuracy Scores:")
    print(f"Training Accuracy: {train_accuracy}")
    print(f"Validation Accuracy: {val_accuracy}")

    print("\nClassification Report:")
    print(classification_report(y_val, y_val_pred, target_names=label_encoder.classes_))

    print("\nClass-wise Accuracy:")
    for genre, acc in zip(label_encoder.classes_, class_wise_accuracy):
        print(f"{genre}: {acc}")

In [None]:
model_structures = [
    {"hidden_layers": 1, "hidden_nodes": 64},
    # {"hidden_layers": 2, "hidden_nodes": 32},
    # {"hidden_layers": 2, "hidden_nodes": 64},
]

for idx, structure in enumerate(model_structures, start=1):
    hidden_layers = structure["hidden_layers"]
    hidden_nodes = structure["hidden_nodes"]
    learning_rate = 0.001
    epochs = 100

    print(
        f"\nModel {idx}: Hidden Layers: {hidden_layers}, Hidden Nodes: {hidden_nodes}"
    )
    model = build_model(hidden_layers, hidden_nodes, learning_rate)
    history = model.fit(
        X_train,
        y_train_encoded,
        epochs=epochs,
        validation_data=(X_val, y_val_encoded),
        verbose=0,
        # class_weight=dict(enumerate(class_weights))
    )

    plot_training_history(history, f"Model {idx}: Training vs Validation Error")

    evaluate_model(model, X_train, y_train_encoded, X_val, y_val_encoded)


Model 1: Hidden Layers: 1, Hidden Nodes: 64


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# # Define custom neural network model
# def build_custom_model(input_shape, num_classes):
#     input_layer = Input(input_shape)
#     x = Dense(128, activation="relu")(input_layer)
#     x = Dense(64, activation="relu")(x)
#     output_layer = tf.keras.layers.Dense((num_classes), activation="sigmoid")(x)
#     model = Model(inputs=input_layer, outputs=output_layer)
#     return model

In [None]:
# # Instantiate the model
# model = build_custom_model(
#     input_shape=(X_train.shape[1],), num_classes=len(label_encoder.classes_)
# )

# # Compile the model
# model.compile(
#     optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
# )

# # Train the model
# history = model.fit(
#     X_train,
#     y_train_encoded,
#     epochs=100,
#     batch_size=256,
#     validation_data=(X_val, y_val_encoded),
#     verbose=1,
# )

In [None]:
# # Evaluate the model
# train_loss, train_accuracy = model.evaluate(X_train, y_train_encoded, verbose=0)
# val_loss, val_accuracy = model.evaluate(X_val, y_val_encoded, verbose=0)

# # Predict on validation set
# y_val_pred = np.argmax(model.predict(X_val), axis=-1)

# # Compute class-wise accuracy
# conf_matrix = confusion_matrix(y_val_encoded, y_val_pred)
# class_wise_accuracy = conf_matrix.diagonal() / conf_matrix.sum(axis=1)

# # Compute F1 score
# f1_scores = []
# for genre, acc in zip(label_encoder.classes_, class_wise_accuracy):
#     precision = (
#         conf_matrix[label_encoder.transform([genre]), label_encoder.transform([genre])]
#         / conf_matrix[:, label_encoder.transform([genre])].sum()
#     )
#     recall = (
#         conf_matrix[label_encoder.transform([genre]), label_encoder.transform([genre])]
#         / conf_matrix[label_encoder.transform([genre]), :].sum()
#     )
#     f1_score = 2 * (precision * recall) / (precision + recall)
#     f1_scores.append(f1_score)

In [None]:
# # Print evaluation metrics
# print("Training Accuracy:", train_accuracy)
# print("Validation Accuracy:", val_accuracy)
# print("\nClass-wise Accuracy:")
# for genre, acc in zip(label_encoder.classes_, class_wise_accuracy):
#     print(f"{genre}: {acc}")
# print("\nF1 Scores:")
# for genre, f1 in zip(label_encoder.classes_, f1_scores):
#     print(f"{genre}: {f1}")

# # Plot training history
# plt.plot(history.history["loss"], label="Train")
# plt.plot(history.history["val_loss"], label="Validation")
# plt.xlabel("Epoch")
# plt.ylabel("Loss")
# plt.title("Training vs Validation Loss")
# plt.legend()
# plt.show()

In [None]:
def train_and_evaluate_model(
    build_model_func,
    X_train,
    y_train,
    X_val,
    y_val,
    label_encoder,
    epochs=100,
    batch_size=64,
):
    # Instantiate the model
    model = build_model_func(
        input_shape=X_train.shape[1:], num_classes=len(label_encoder.classes_)
    )

    # Compile the model
    model.compile(
        optimizer="adam", loss="sparse_categorical_crossentropy", metrics=["accuracy"]
    )

    # Train the model
    history = model.fit(
        X_train,
        y_train,
        epochs=epochs,
        batch_size=batch_size,
        validation_data=(X_val, y_val),
        verbose=1,
        # class_weight=dict(enumerate(class_weights)),
    )

    # Evaluate the model
    train_loss, train_accuracy = model.evaluate(X_train, y_train, verbose=0)
    val_loss, val_accuracy = model.evaluate(X_val, y_val, verbose=0)

    # Predict on validation set
    y_val_pred = np.argmax(model.predict(X_val), axis=-1)

    # Compute class-wise accuracy
    conf_matrix = confusion_matrix(y_val, y_val_pred)
    class_wise_accuracy = conf_matrix.diagonal() / conf_matrix.sum(axis=1)

    # Compute F1 score
    f1_scores = []
    for genre, acc in zip(label_encoder.classes_, class_wise_accuracy):
        precision = (
            conf_matrix[
                label_encoder.transform([genre]), label_encoder.transform([genre])
            ]
            / conf_matrix[:, label_encoder.transform([genre])].sum()
        )
        recall = (
            conf_matrix[
                label_encoder.transform([genre]), label_encoder.transform([genre])
            ]
            / conf_matrix[label_encoder.transform([genre]), :].sum()
        )
        f1_score = 2 * (precision * recall) / (precision + recall)
        f1_scores.append(f1_score)

    # Print evaluation metrics
    print("Training Accuracy:", train_accuracy)
    print("Validation Accuracy:", val_accuracy)
    print("\nClass-wise Accuracy:")
    for genre, acc in zip(label_encoder.classes_, class_wise_accuracy):
        print(f"{genre}: {acc}")
    print("\nF1 Scores:")
    for genre, f1 in zip(label_encoder.classes_, f1_scores):
        print(f"{genre}: {f1}")

    # Plot training history
    plt.plot(history.history["loss"], label="Train")
    plt.plot(history.history["val_loss"], label="Validation")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.title("Training vs Validation Loss")
    plt.legend()
    plt.show()
    # Predict using the model
    predictions = model.predict(X_test)

    submission_df = pd.DataFrame({"ID": test_data['ID'], "label": np.argmax(predictions, axis=1)})

    submission_df.to_csv("submission.csv", index=False)

In [None]:
# Define custom neural network model
def build_custom_model(input_shape, num_classes):
    input_layer = Input(input_shape)
    x = Dense(128, activation="relu")(input_layer)
    x = Dense(64, activation="relu")(x)
    output_layer = tf.keras.layers.Dense((num_classes), activation="sigmoid")(x)
    model = Model(inputs=input_layer, outputs=output_layer)
    # predictions = model.predict(X_test)

    return model

In [None]:
def build_cnn_model(input_shape, num_classes):
    input_layer = Input(shape=input_shape)
    x = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(input_layer)
    x = Conv1D(128, 5, activation="relu")(x)
    x = GlobalMaxPooling1D()(x)
    x = Dense(128, activation="relu")(x)
    output_layer = Dense(num_classes, activation="softmax")(x)
    model = Model(inputs=input_layer, outputs=output_layer)
    return model

In [None]:
def build_bidirectional_lstm(input_shape, num_classes):
    input_layer = Input(shape=input_shape)
    x = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(input_layer)
    x = Bidirectional(LSTM(64, return_sequences=True))(x)
    x = Bidirectional(LSTM(32))(x)
    x = Dense(64, activation="relu")(x)
    output_layer = Dense(num_classes, activation="softmax")(x)
    model = Model(inputs=input_layer, outputs=output_layer)
    return model

In [None]:
def build_lstm_with_attention(input_shape, num_classes):
    input_layer = Input(shape=input_shape)
    x = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(input_layer)
    x = Bidirectional(LSTM(64, return_sequences=True))(x)
    attention = Attention()([x, x])
    x = Dense(64, activation="relu")(attention)
    output_layer = Dense(num_classes, activation="softmax")(x)
    model = Model(inputs=input_layer, outputs=output_layer)
    return model

In [None]:
def build_ensemble_model(input_shape, num_classes):
    input_layer = Input(shape=input_shape)

    # CNN branch
    cnn_branch = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(input_layer)
    cnn_branch = Conv1D(128, 5, activation="relu")(cnn_branch)
    cnn_branch = GlobalMaxPooling1D()(cnn_branch)
    cnn_branch = Dense(64, activation="relu")(cnn_branch)

    # LSTM branch
    lstm_branch = Embedding(input_dim=vocab_size, output_dim=embedding_dim)(input_layer)
    lstm_branch = Bidirectional(LSTM(64, return_sequences=True))(lstm_branch)
    lstm_branch = Bidirectional(LSTM(32))(lstm_branch)
    lstm_branch = Dense(64, activation="relu")(lstm_branch)

    # Concatenate both branches
    concatenated = Concatenate()([cnn_branch, lstm_branch])
    output_layer = Dense(num_classes, activation="softmax")(concatenated)

    model = Model(inputs=input_layer, outputs=output_layer)
    return model

In [None]:
train_and_evaluate_model(
    build_custom_model, X_train, y_train_encoded, X_val, y_val_encoded, label_encoder
)
# train_and_evaluate_model(
#     build_cnn_model, X_train, y_train_encoded, X_val, y_val_encoded, label_encoder
# )
# train_and_evaluate_model(
#     build_bidirectional_lstm,
#     X_train,
#     y_train_encoded,
#     X_val,
#     y_val_encoded,
#     label_encoder,
# )
# train_and_evaluate_model(
#     build_lstm_with_attention,
#     X_train,
#     y_train_encoded,
#     X_val,
#     y_val_encoded,
#     label_encoder,
# )
# train_and_evaluate_model(
#     build_ensemble_model,
#     X_train,
#     y_train_encoded,
#     X_val,
#     y_val_encoded,
#     label_encoder,
# )