In [2]:
# FINAL MODULE
# Import necessary libraries
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import nltk

# Ensure you have downloaded the necessary NLTK data files
nltk.download('stopwords')
nltk.download('wordnet')

# Load the training dataset
data = pd.read_csv('/content/kaggle_movie_train.csv')  # Adjust the path to your training dataset

# Print column names to verify if 'synopsis' and 'genre' exist
print("Column names in the training dataset:", data.columns)

# Initialize stopwords and lemmatizer
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# Preprocessing function
def preprocess_text(text):
    if isinstance(text, str):  # Ensure the input is a string
        text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
        text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
        text = text.lower()  # Convert to lowercase
        tokens = text.split()  # Tokenize
        tokens = [lemmatizer.lemmatize(token) for token in tokens if token not in stop_words]  # Lemmatize and remove stopwords
        return ' '.join(tokens)
    return ''

# Ensure the necessary columns are present in the training data
if 'text' not in data.columns or 'genre' not in data.columns:
    raise ValueError("The training dataset must contain 'text' and 'genre' columns.")

# Apply preprocessing to the 'synopsis' column
data['cleaned_text'] = data['genre'].apply(preprocess_text)

# Display the first few rows to verify preprocessing
print(data[['genre', 'cleaned_text']].head())

# Feature extraction using TF-IDF
tfidf = TfidfVectorizer(max_features=5000)  # Adjust the number of features as needed
X = tfidf.fit_transform(data['genre']).toarray()

# The target variable (genre)
y = data['genre']

# Split the data into training and testing sets (80% training, 20% testing)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print("Shape of X_train:", X_train.shape)
print("Shape of X_test:", X_test.shape)
print("Number of classes:", len(set(y)))

# Initialize the Multinomial Naive Bayes model
model = MultinomialNB()

# Train the model on the training data
model.fit(X_train, y_train)

# Make predictions on the test data
y_pred = model.predict(X_test)

# Evaluate the model
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy * 100:.2f}%")

# Print the classification report
print("Classification Report:")
print(classification_report(y_test, y_pred))

# Print the confusion matrix
print("Confusion Matrix:")
print(confusion_matrix(y_test, y_pred))

# --- Prediction on test data from CSV file ---

# Load the test data CSV file
test_data = pd.read_csv('/content/kaggle_movie_test.csv')  # Adjust the path to your test dataset

# Check if 'synopsis' column exists in the test dataset
if 'text' not in test_data.columns:
    raise ValueError("The test dataset must contain a 'text' column.")

# Preprocess the test data
test_data['cleaned_text'] = test_data['text'].apply(preprocess_text)

# Transform the cleaned test data using the same TF-IDF vectorizer
X_test_new = tfidf.transform(test_data['cleaned_text']).toarray()

# Make predictions using the trained model
test_data['predicted_genre'] = model.predict(X_test_new)

# Display the predictions
print("Predictions for the test data:")
print(test_data[['text', 'predicted_genre']])





[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Column names in the training dataset: Index(['id', 'text', 'genre'], dtype='object')
      genre cleaned_text
0  thriller     thriller
1    comedy       comedy
2     drama        drama
3  thriller     thriller
4     drama        drama
Shape of X_train: (18063, 10)
Shape of X_test: (4516, 10)
Number of classes: 9
Accuracy: 100.00%
Classification Report:
              precision    recall  f1-score   support

      action       1.00      1.00      1.00       473
   adventure       1.00      1.00      1.00        25
      comedy       1.00      1.00      1.00       635
       drama       1.00      1.00      1.00      1728
      horror       1.00      1.00      1.00        84
       other       1.00      1.00      1.00        66
     romance       1.00      1.00      1.00        12
      sci-fi       1.00      1.00      1.00       106
    thriller       1.00      1.00      1.00      1387

    accuracy                           1.00      4516
   macro avg       1.00      1.00      1.00      