In [None]:
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

In [None]:
# Load the training set. Each line is split on " ::: "; only lines that yield
# exactly four fields are kept, stored as (field 3, field 2) pairs, i.e.
# (description, genre).
with open("/content/drive/MyDrive/Genre_Classification_Dataset/train_data.txt", "r", encoding="utf-8") as train_file:
    split_rows = (raw_line.strip().split(" ::: ") for raw_line in train_file)
    # The generator is consumed here, while the file is still open.
    train_data = [(fields[3], fields[2]) for fields in split_rows if len(fields) == 4]

# Unzip the pairs into two parallel tuples.
descriptions, genres = zip(*train_data)

In [None]:
# Turn the genre strings into integer class ids
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(genres)

# Build the TF-IDF representation of the training descriptions
# (vocabulary capped at 5000 terms, English stop words removed)
tfidf_vectorizer = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = tfidf_vectorizer.fit_transform(descriptions)

# Fit a linear-kernel SVM on the TF-IDF features; fit() returns the estimator
svm_classifier = SVC(kernel='linear').fit(X_train_tfidf, y_train_encoded)

In [None]:
# Load the test descriptions together with their ground-truth genres.
# The two files are read in lockstep, so entry i of test_data and entry i of
# test_solutions always come from the same line number.
test_data = []
test_solutions = []
with open("/content/drive/MyDrive/Genre_Classification_Dataset/test_data.txt", "r", encoding="utf-8") as file, \
        open("/content/drive/MyDrive/Genre_Classification_Dataset/test_data_solution.txt", "r", encoding="utf-8") as solution_file:
    for line, solution in zip(file, solution_file):
        parts = line.strip().split(" ::: ")
        if len(parts) == 3:
            solution_text = solution.strip()
            solution_parts = solution_text.split(" ::: ")
            # NOTE(review): presumably the solution file uses the same
            # "ID ::: TITLE ::: GENRE ::: DESCRIPTION" layout as the training
            # file - confirm. In that case only the third field is the genre;
            # storing the whole line would never match a predicted genre name.
            # Lines without that layout are kept as-is (already a bare label).
            if len(solution_parts) >= 4:
                genre = solution_parts[2].strip()
            else:
                genre = solution_text
            test_data.append(parts[2])  # Description
            test_solutions.append(genre)  # Genre from the solution file

In [None]:
# Project the test descriptions into the TF-IDF space learned from the training
# data (transform only, no refitting), then predict an encoded genre id per row.
X_test_tfidf = tfidf_vectorizer.transform(test_data)
y_pred_encoded = svm_classifier.predict(X_test_tfidf)

In [None]:
# Decode the predicted class ids back into genre names.
# A single vectorised call is enough: the classifier was fitted on
# label_encoder's output, so every id it predicts is one the encoder knows.
# The former per-element retry loop was dead code - it repeated this same call
# one item at a time and caught KeyError, which inverse_transform does not
# raise (unseen labels raise ValueError).
y_pred = label_encoder.inverse_transform(y_pred_encoded)

In [None]:
# Evaluate the model on the test data.
# The metrics must compare ground truth with predictions; scoring the
# predictions against themselves always reports a perfect 1.0.
# Ground truth comes from test_solutions. If an entry is still a raw
# " ::: "-separated record (presumably "ID ::: TITLE ::: GENRE ::: DESCRIPTION"
# - confirm against the solution file), keep only its third field, the genre.
y_true = [
    entry.split(" ::: ")[2].strip() if entry.count(" ::: ") >= 3 else entry
    for entry in test_solutions
]

# Compare genre names directly (not encoded ids) so that a genre present only
# in the test set cannot break the evaluation.
accuracy = accuracy_score(y_true, y_pred)
# zero_division=0 reports 0 (without warnings) for genres never predicted.
classification_rep = classification_report(y_true, y_pred, zero_division=0)

print(f"Accuracy: {accuracy}")
print(classification_rep)

Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1092
           1       1.00      1.00      1.00       377
           2       1.00      1.00      1.00       330
           3       1.00      1.00      1.00       150
           5       1.00      1.00      1.00      8318
           6       1.00      1.00      1.00       103
           7       1.00      1.00      1.00     16048
           8       1.00      1.00      1.00     18613
           9       1.00      1.00      1.00       178
          10       1.00      1.00      1.00        80
          11       1.00      1.00      1.00       143
          13       1.00      1.00      1.00      2038
          14       1.00      1.00      1.00       540
          15       1.00      1.00      1.00        37
          16       1.00      1.00      1.00        18
          17       1.00      1.00      1.00        25
          18       1.00      1.00      1.00       417
          19 

In [None]:
# Helper that maps one free-text description to a genre name
def classify_movie_genre(user_input):
    """Predict the genre of a single movie description.

    The text is vectorised with the module-level ``tfidf_vectorizer``,
    classified with ``svm_classifier`` and decoded with ``label_encoder``.

    Args:
        user_input: The movie description as a string.

    Returns:
        The decoded label for ``user_input`` (first and only prediction).
    """
    # transform() expects an iterable of documents, hence the one-item list.
    features = tfidf_vectorizer.transform([user_input])
    encoded_prediction = svm_classifier.predict(features)
    return label_encoder.inverse_transform(encoded_prediction)[0]

# Interactive prompt: classify descriptions until the user declines to continue.
while True:
    # Get a movie description from the user (surrounding whitespace removed)
    user_description = input("Enter a movie description: ").strip()

    # A blank description carries no words to classify, so any prediction
    # would be meaningless; ask again instead.
    if not user_description:
        print("Please enter a non-empty description.")
        continue

    # Classify the movie genre
    predicted_genre = classify_movie_genre(user_description)

    print(f"Predicted Genre: {predicted_genre}")

    # Ask the user if they want to classify another movie; strip the answer so
    # that stray spaces (e.g. "yes ") do not end the session unexpectedly.
    another_movie = input("Do you want to classify another movie? (yes/no): ").strip().lower()

    if another_movie not in ('yes', 'y'):
        break



Enter a movie description: The story revolves around three main characters: Baburao Ganpatrao Apte (played by Paresh Rawal), Raju (played by Akshay Kumar), and Shyam (played by Suniel Shetty). Baburao is a landlord who is struggling to collect rent from his tenants. Raju and Shyam are two friends who are desperately seeking employment.  One day, Raju and Shyam stumble upon a wrong-number call that offers them the opportunity to make a lot of money. They mistakenly believe it's a job offer, which sets off a series of comical events. Soon, they find themselves in a web of confusion and chaos involving a kidnapping and ransom demand.
Predicted Genre: comedy
Do you want to classify another movie? (yes/no): yes
Enter a movie description: the Warrens are called to assist in the exorcism of a young boy who appears to be possessed by a malevolent entity. However, the case takes a dark turn when the boy's possession leads to a gruesome murder, and the Warrens find themselves at the center of a 