In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, accuracy_score

# Function to parse the training and test data files
def parse_train_file(filename):
    """Read (genre, description) pairs from a ' ::: '-delimited text file.

    Every line is stripped of surrounding whitespace and split on ' ::: '.
    Only lines that split into exactly four fields are kept; any other line
    is skipped silently.

    Args:
        filename: Path of the UTF-8 encoded text file to read.

    Returns:
        A list of ``(genre, description)`` tuples, in file order, built from
        the third and fourth fields of each accepted line.
    """
    delimiter = ' ::: '
    with open(filename, 'r', encoding='utf-8') as handle:
        # Lazily split each stripped line; the comprehension below consumes
        # the generator while the file is still open.
        split_rows = (raw.strip().split(delimiter) for raw in handle)
        return [(fields[2], fields[3]) for fields in split_rows if len(fields) == 4]

def parse_test_file(filename):
    """Read the description field from a ' ::: '-delimited test file.

    Every line is stripped of surrounding whitespace and split on ' ::: '.
    Only lines that split into exactly three fields are kept; any other line
    is skipped silently.

    Args:
        filename: Path of the UTF-8 encoded text file to read.

    Returns:
        A list of description strings (the third field of each accepted
        line), in file order.
    """
    delimiter = ' ::: '
    with open(filename, 'r', encoding='utf-8') as handle:
        # Lazily split each stripped line; the comprehension below consumes
        # the generator while the file is still open.
        split_rows = (raw.strip().split(delimiter) for raw in handle)
        return [fields[2] for fields in split_rows if len(fields) == 3]

def parse_test_solution_file(filename):
    """Read the genre field from a ' ::: '-delimited solution file.

    Every line is stripped of surrounding whitespace and split on ' ::: '.
    Only lines that split into exactly four fields are kept; any other line
    is skipped silently.

    Args:
        filename: Path of the UTF-8 encoded text file to read.

    Returns:
        A list of genre strings (the third field of each accepted line), in
        file order.
    """
    delimiter = ' ::: '
    with open(filename, 'r', encoding='utf-8') as handle:
        # Lazily split each stripped line; the comprehension below consumes
        # the generator while the file is still open.
        split_rows = (raw.strip().split(delimiter) for raw in handle)
        return [fields[2] for fields in split_rows if len(fields) == 4]

# Load training data as a list of (genre, description) tuples
train_data = parse_train_file('train_data.txt')

# Load test data: descriptions and their genres live in two separate files
test_data = parse_test_file('test_data.txt')
test_solutions = parse_test_solution_file('test_data_solution.txt')

# The two test files are filtered independently by their parsers and are then
# paired purely by position. Fail early, with a message that names both files,
# if the counts disagree (pandas would otherwise raise a generic ValueError
# about array lengths when building test_df).
if len(test_data) != len(test_solutions):
    raise ValueError(
        f"test_data.txt yielded {len(test_data)} descriptions but "
        f"test_data_solution.txt yielded {len(test_solutions)} genres; "
        "the two files cannot be paired row by row"
    )

# Convert to DataFrame
train_df = pd.DataFrame(train_data, columns=['genre', 'description'])
test_df = pd.DataFrame({'description': test_data, 'genre': test_solutions})

# Display first few rows of the training and test data
print(train_df.head())
print(test_df.head())


      genre                                        description
0     drama  Listening in to a conversation between his doc...
1  thriller  A brother and sister with a past incestuous re...
2     adult  As the bus empties the students for their fiel...
3     drama  To help their unemployed father make ends meet...
4     drama  The film's title refers not only to the un-rec...
                                         description        genre
0  L.R. Brane loves his life - his car, his apart...     thriller
1  Spain, March 1964: Quico is a very naughty chi...       comedy
2  One year in the life of Albin and his family o...  documentary
3  His father has died, he hasn't spoken with his...        drama
4  Before he was known internationally as a marti...        drama


In [None]:
# TF-IDF vectorization: English stop words are removed and max_df=0.7 caps the
# document frequency of the terms that are kept.
vectorizer = TfidfVectorizer(max_df=0.7, stop_words='english')

# Text columns to encode
train_descriptions, test_descriptions = train_df['description'], test_df['description']

# Fit on the training descriptions only, then encode the test descriptions
# with that same fitted vectorizer.
X_train_tfidf = vectorizer.fit_transform(train_descriptions)
X_test_tfidf = vectorizer.transform(test_descriptions)


In [None]:
# Train three classifiers on the TF-IDF features and report test-set metrics.
def _fit_and_report(label, model):
    """Fit `model`, predict the test set and print accuracy plus a class report.

    Reads the module-level `X_train_tfidf`, `X_test_tfidf`, `train_df` and
    `test_df` built in the earlier cells.

    Args:
        label: Human-readable model name used as the prefix of the printed
            accuracy line.
        model: An unfitted estimator exposing `fit` and `predict`; it is
            fitted in place.

    Returns:
        The predictions of the fitted model for `X_test_tfidf`.
    """
    model.fit(X_train_tfidf, train_df['genre'])
    predictions = model.predict(X_test_tfidf)
    print(f"{label} Accuracy:", accuracy_score(test_df['genre'], predictions))
    # Some genres are never predicted, so their precision is undefined.
    # zero_division=0 reports those cells as 0 (the same value the default
    # produces) without emitting a warning for every affected average.
    print(classification_report(test_df['genre'], predictions, zero_division=0))
    return predictions


# Naive Bayes
nb_model = MultinomialNB()
nb_predictions = _fit_and_report("Naive Bayes", nb_model)

# Logistic Regression
lr_model = LogisticRegression(max_iter=1000)
lr_predictions = _fit_and_report("Logistic Regression", lr_model)

# Support Vector Machine
# NOTE(review): scikit-learn documents that SVC fit time scales at least
# quadratically with the number of samples and suggests LinearSVC for large
# datasets. Swapping the estimator would change the reported numbers, so it is
# intentionally left as is here.
svm_model = SVC(kernel='linear')
svm_predictions = _fit_and_report("SVM", svm_model)

Naive Bayes Accuracy: 0.44474169741697417


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

      action       0.00      0.00      0.00      1314
       adult       0.00      0.00      0.00       590
   adventure       0.00      0.00      0.00       775
   animation       0.00      0.00      0.00       498
   biography       0.00      0.00      0.00       264
      comedy       0.72      0.05      0.09      7446
       crime       0.00      0.00      0.00       505
 documentary       0.53      0.90      0.66     13096
       drama       0.38      0.88      0.53     13612
      family       0.00      0.00      0.00       783
     fantasy       0.00      0.00      0.00       322
   game-show       0.00      0.00      0.00       193
     history       0.00      0.00      0.00       243
      horror       0.00      0.00      0.00      2204
       music       0.00      0.00      0.00       731
     musical       0.00      0.00      0.00       276
     mystery       0.00      0.00      0.00       318
        news       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

      action       0.52      0.26      0.35      1314
       adult       0.62      0.21      0.31       590
   adventure       0.69      0.14      0.24       775
   animation       0.60      0.02      0.05       498
   biography       0.00      0.00      0.00       264
      comedy       0.55      0.59      0.57      7446
       crime       0.42      0.02      0.04       505
 documentary       0.66      0.87      0.75     13096
       drama       0.54      0.80      0.64     13612
      family       0.58      0.08      0.14       783
     fantasy       0.71      0.02      0.03       322
   game-show       0.93      0.48      0.63       193
     history       0.00      0.00      0.00       243
      horror       0.66      0.57      0.61      2204
       music       0.70      0.39      0.50       731
     musical       1.00      0.01      0.01       276
     mystery       1.00      0.00      0.01       318
        news       0.80    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

      action       0.48      0.36      0.41      1314
       adult       0.64      0.41      0.50       590
   adventure       0.60      0.23      0.33       775
   animation       0.49      0.11      0.18       498
   biography       0.00      0.00      0.00       264
      comedy       0.55      0.60      0.58      7446
       crime       0.32      0.04      0.07       505
 documentary       0.69      0.86      0.76     13096
       drama       0.55      0.78      0.65     13612
      family       0.51      0.12      0.19       783
     fantasy       0.37      0.06      0.10       322
   game-show       0.85      0.63      0.72       193
     history       0.00      0.00      0.00       243
      horror       0.66      0.62      0.64      2204
       music       0.69      0.51      0.59       731
     musical       0.32      0.02      0.04       276
     mystery       0.50      0.02      0.04       318
        news       0.64    

  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# Evaluate all models side by side, reusing the predictions computed earlier
# (nothing is refitted here).
models = {
    'Naive Bayes': (nb_model, nb_predictions),
    'Logistic Regression': (lr_model, lr_predictions),
    'Support Vector Machine': (svm_model, svm_predictions)
}

for model_name, (model, predictions) in models.items():
    print(f"=== {model_name} ===")
    print(f"Accuracy: {accuracy_score(test_df['genre'], predictions)}")
    # Genres that a model never predicts have undefined precision.
    # zero_division=0 reports them as 0 (the same value the default produces)
    # without emitting a warning for every affected average.
    print(classification_report(test_df['genre'], predictions, zero_division=0))
    print()

=== Naive Bayes ===
Accuracy: 0.44474169741697417


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

      action       0.00      0.00      0.00      1314
       adult       0.00      0.00      0.00       590
   adventure       0.00      0.00      0.00       775
   animation       0.00      0.00      0.00       498
   biography       0.00      0.00      0.00       264
      comedy       0.72      0.05      0.09      7446
       crime       0.00      0.00      0.00       505
 documentary       0.53      0.90      0.66     13096
       drama       0.38      0.88      0.53     13612
      family       0.00      0.00      0.00       783
     fantasy       0.00      0.00      0.00       322
   game-show       0.00      0.00      0.00       193
     history       0.00      0.00      0.00       243
      horror       0.00      0.00      0.00      2204
       music       0.00      0.00      0.00       731
     musical       0.00      0.00      0.00       276
     mystery       0.00      0.00      0.00       318
        news       0.00    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

      action       0.52      0.26      0.35      1314
       adult       0.62      0.21      0.31       590
   adventure       0.69      0.14      0.24       775
   animation       0.60      0.02      0.05       498
   biography       0.00      0.00      0.00       264
      comedy       0.55      0.59      0.57      7446
       crime       0.42      0.02      0.04       505
 documentary       0.66      0.87      0.75     13096
       drama       0.54      0.80      0.64     13612
      family       0.58      0.08      0.14       783
     fantasy       0.71      0.02      0.03       322
   game-show       0.93      0.48      0.63       193
     history       0.00      0.00      0.00       243
      horror       0.66      0.57      0.61      2204
       music       0.70      0.39      0.50       731
     musical       1.00      0.01      0.01       276
     mystery       1.00      0.00      0.01       318
        news       0.80    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

      action       0.48      0.36      0.41      1314
       adult       0.64      0.41      0.50       590
   adventure       0.60      0.23      0.33       775
   animation       0.49      0.11      0.18       498
   biography       0.00      0.00      0.00       264
      comedy       0.55      0.60      0.58      7446
       crime       0.32      0.04      0.07       505
 documentary       0.69      0.86      0.76     13096
       drama       0.55      0.78      0.65     13612
      family       0.51      0.12      0.19       783
     fantasy       0.37      0.06      0.10       322
   game-show       0.85      0.63      0.72       193
     history       0.00      0.00      0.00       243
      horror       0.66      0.62      0.64      2204
       music       0.69      0.51      0.59       731
     musical       0.32      0.02      0.04       276
     mystery       0.50      0.02      0.04       318
        news       0.64    

  _warn_prf(average, modifier, msg_start, len(result))
