In [2]:
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
import joblib

train_data_path = 'E:/BCA/PROJECT/Task 1/Genre Classification Dataset/train_data.txt'
test_data_solution_path = 'E:/BCA/PROJECT/Task 1/Genre Classification Dataset/test_data_solution.txt'
test_data_path = 'E:/BCA/PROJECT/Task 1/Genre Classification Dataset/test_data.txt'


train_data = pd.read_csv(train_data_path, delimiter=':::', engine='python', names=['ID', 'TITLE', 'GENRE', 'DESCRIPTION'])
test_data_solution = pd.read_csv(test_data_solution_path, delimiter=':::', engine='python', names=['ID', 'TITLE', 'GENRE', 'DESCRIPTION'])
test_data = pd.read_csv(test_data_path, delimiter=':::', engine='python', names=['ID', 'TITLE', 'DESCRIPTION'])


In [3]:

def preprocess_text(text):
    text = text.lower()  # Convert to lowercase
    text = re.sub(r'[^a-z\s]', '', text)  # Remove special characters
    return text


train_data['DESCRIPTION'] = train_data['DESCRIPTION'].apply(preprocess_text)
test_data_solution['DESCRIPTION'] = test_data_solution['DESCRIPTION'].apply(preprocess_text)
test_data['DESCRIPTION'] = test_data['DESCRIPTION'].apply(preprocess_text)

train_data.dropna(inplace=True)
test_data_solution.dropna(inplace=True)


In [4]:

combined_data = pd.concat([train_data['DESCRIPTION'], test_data_solution['DESCRIPTION']])
tfidf = TfidfVectorizer(max_features=5000)
tfidf.fit(combined_data)

X_train = tfidf.transform(train_data['DESCRIPTION'])
y_train = train_data['GENRE']

X_test = tfidf.transform(test_data_solution['DESCRIPTION'])
y_test = test_data_solution['GENRE']


In [5]:
# Define models and parameters for GridSearchCV
models = {
    'Naive Bayes': {
        'model': MultinomialNB(),
        'params': {
            'alpha': [0.5, 1.0, 1.5, 2.0]
        }
    },
    'Logistic Regression': {
        'model': LogisticRegression(max_iter=200),
        'params': {
            'C': [0.1, 1, 10, 100],
            'solver': ['liblinear', 'lbfgs']
        }
    }
}

best_model = None
best_accuracy = 0

for name, model_info in models.items():
    grid = GridSearchCV(model_info['model'], model_info['params'], cv=5, n_jobs=-1)
    grid.fit(X_train, y_train)
    
    predictions = grid.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)
    print(f"{name} Accuracy: {accuracy}")
    print(classification_report(y_test, predictions))
    
    if accuracy > best_accuracy:
        best_model = grid.best_estimator_
        best_accuracy = accuracy

print(f"Best Model: {best_model}")
print(f"Best Accuracy: {best_accuracy}")


Naive Bayes Accuracy: 0.5271771217712177


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


               precision    recall  f1-score   support

      action        0.56      0.13      0.22      1314
       adult        0.48      0.08      0.13       590
   adventure        0.76      0.07      0.13       775
   animation        1.00      0.00      0.00       498
   biography        0.00      0.00      0.00       264
      comedy        0.53      0.42      0.47      7446
       crime        0.00      0.00      0.00       505
 documentary        0.57      0.88      0.69     13096
       drama        0.46      0.82      0.59     13612
      family        0.71      0.01      0.01       783
     fantasy        0.00      0.00      0.00       322
   game-show        0.97      0.36      0.52       193
     history        0.00      0.00      0.00       243
      horror        0.70      0.35      0.47      2204
       music        0.69      0.27      0.39       731
     musical        0.00      0.00      0.00       276
     mystery        0.00      0.00      0.00       318
        n

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Accuracy: 0.5884317343173432


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


               precision    recall  f1-score   support

      action        0.47      0.28      0.35      1314
       adult        0.60      0.23      0.33       590
   adventure        0.59      0.15      0.25       775
   animation        0.58      0.06      0.10       498
   biography        0.00      0.00      0.00       264
      comedy        0.53      0.59      0.56      7446
       crime        0.38      0.03      0.05       505
 documentary        0.68      0.86      0.76     13096
       drama        0.55      0.77      0.64     13612
      family        0.48      0.08      0.13       783
     fantasy        0.65      0.03      0.06       322
   game-show        0.88      0.50      0.64       193
     history        0.00      0.00      0.00       243
      horror        0.64      0.57      0.60      2204
       music        0.66      0.46      0.54       731
     musical        0.25      0.02      0.03       276
     mystery        0.25      0.00      0.01       318
        n

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [6]:

X_test_final = tfidf.transform(test_data['DESCRIPTION'])
predicted_genres = best_model.predict(X_test_final)

joblib.dump(best_model, 'best_genre_model.joblib')
joblib.dump(tfidf, 'tfidf_vectorizer.joblib')

test_data['PREDICTED_GENRE'] = predicted_genres
test_data[['ID', 'TITLE', 'PREDICTED_GENRE']].to_csv('predicted_genres.csv', index=False)
