In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

In [None]:
# Step 1: Parse the train_data.txt file
def load_dataset(file_path):
    """Parse a ' ::: '-delimited text file into a DataFrame.

    Every usable line must split into exactly four fields on ' ::: '.
    The first field is discarded; the remaining three are stored as
    ``title``, ``genre`` and ``plot``. Lines with any other number of
    fields are skipped.

    Parameters
    ----------
    file_path : str or path-like
        Location of the text file to read.

    Returns
    -------
    pandas.DataFrame
        One row per accepted line with the columns ``title``, ``genre``
        and ``plot`` (in that order). The columns are present even when
        no line was accepted, so downstream column lookups do not fail.

    Raises
    ------
    OSError
        If the file cannot be opened.
    UnicodeDecodeError
        If the file content is not valid UTF-8.
    """
    columns = ['title', 'genre', 'plot']
    data = []
    # Decode explicitly as UTF-8 so the result does not depend on the
    # platform's locale default (assumes the file is UTF-8 encoded).
    with open(file_path, 'r', encoding='utf-8') as file:
        for line in file:
            parts = line.strip().split(' ::: ')
            if len(parts) == 4:
                _, title, genre, plot = parts
                data.append({'title': title, 'genre': genre, 'plot': plot})
    # Passing the column list keeps the schema stable for empty input.
    return pd.DataFrame(data, columns=columns)
# Location of the raw training file; parse it into the working DataFrame.
train_data_path = "/content/train_data.txt"
df = load_dataset(file_path=train_data_path)

In [None]:
# Step 2: Pick the model input (plot text) and the target (genre label)
texts, labels = df['plot'], df['genre']

In [None]:
# Step 3: Hold out 20% of the samples for testing (fixed seed for a repeatable split)
split_parts = train_test_split(texts, labels, test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = split_parts

In [None]:
# Step 4: Turn the plot text into TF-IDF features.
# The vocabulary is learned from the training split only; the test split
# is transformed with that same fitted vectorizer.
tfidf_options = {'max_features': 5000, 'stop_words': 'english'}
tfidf_vectorizer = TfidfVectorizer(**tfidf_options)
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
# Step 5: Fit a logistic-regression classifier on the TF-IDF training features
# (iteration cap raised to 1000)
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X=X_train_tfidf, y=y_train)

In [None]:
# Step 6: Evaluate the model on the held-out split
y_pred = classifier.predict(X_test_tfidf)

# Print accuracy and classification report.
# Genres that are never predicted have an undefined precision. Passing
# zero_division=0 reports them as 0.0 (same numbers as before) without
# emitting an UndefinedMetricWarning for each affected metric.
print("Accuracy:", accuracy_score(y_test, y_pred))
print(
    "\nClassification Report:\n",
    classification_report(y_test, y_pred, zero_division=0),
)

Accuracy: 0.5794521811306834

Classification Report:
               precision    recall  f1-score   support

      action       0.53      0.27      0.35       263
       adult       0.71      0.21      0.33       112
   adventure       0.42      0.14      0.21       139
   animation       0.61      0.11      0.18       104
   biography       0.00      0.00      0.00        61
      comedy       0.51      0.58      0.55      1443
       crime       0.43      0.03      0.05       107
 documentary       0.66      0.85      0.74      2659
       drama       0.54      0.78      0.64      2697
      family       0.41      0.07      0.12       150
     fantasy       0.00      0.00      0.00        74
   game-show       0.94      0.42      0.59        40
     history       0.00      0.00      0.00        45
      horror       0.63      0.56      0.59       431
       music       0.63      0.47      0.54       144
     musical       1.00      0.02      0.04        50
     mystery       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [None]:
# Step 7: Classify an unseen plot summary with the fitted vectorizer and model
new_text = ["A young wizard discovers his magical heritage and battles dark forces."]
new_text_tfidf = tfidf_vectorizer.transform(new_text)
prediction = classifier.predict(new_text_tfidf)
top_genre = prediction[0]
print("Predicted Genre:", top_genre)


Predicted Genre: fantasy
