In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import LabelEncoder

def load_data(file_path, is_train=True):
    with open(file_path, 'r', encoding='utf-8') as file:
        lines = file.readlines()

    data = []
    for line in lines:
        parts = line.strip().split(' ::: ')
        if is_train and len(parts) == 4:
            data.append((parts[0], parts[1], parts[2], parts[3]))
        elif not is_train and len(parts) == 3:
            data.append((parts[0], parts[1], parts[2]))

    if is_train:
        df = pd.DataFrame(data, columns=['ID', 'Title', 'Genre', 'Description'])
    else:
        df = pd.DataFrame(data, columns=['ID', 'Title', 'Description'])

    return df

train_data_path = 'train_data.txt'  # Replace with actual path
test_data_path = 'test_data.txt'  # Replace with actual path
test_data_solution_path = 'test_data_solution.txt'  # Replace with actual path

train_df = load_data(train_data_path, is_train=True)
test_df = load_data(test_data_path, is_train=False)
test_solution_df = load_data(test_data_solution_path, is_train=True)

label_encoder = LabelEncoder()
train_df['Genre'] = label_encoder.fit_transform(train_df['Genre'])
test_solution_df['Genre'] = label_encoder.transform(test_solution_df['Genre'])

X_train, X_val, y_train, y_val = train_test_split(train_df['Description'], train_df['Genre'], test_size=0.2, random_state=42)

def create_model_pipeline(model):
    return Pipeline([
        ('tfidf', TfidfVectorizer(stop_words='english', max_df=0.95, min_df=2)),
        ('clf', model)
    ])

def evaluate_model(model, X_train, y_train, X_val, y_val):
    pipeline = create_model_pipeline(model)
    try:
        pipeline.fit(X_train, y_train)
        y_pred = pipeline.predict(X_val)
        print(f'Accuracy: {accuracy_score(y_val, y_pred)}')
        print(classification_report(y_val, y_pred, target_names=label_encoder.classes_))
        return pipeline
    except Exception as e:
        print(f"Error during model evaluation: {e}")
        return None

models = [
    ('Naive Bayes', MultinomialNB()),
    ('Logistic Regression', LogisticRegression(max_iter=1000)),
    ('SVM', SVC(kernel='linear'))
]

best_model = None
best_accuracy = 0
for model_name, model in models:
    print(f'Evaluating {model_name}...')
    trained_model = evaluate_model(model, X_train, y_train, X_val, y_val)
    if trained_model is not None:
        val_accuracy = accuracy_score(y_val, trained_model.predict(X_val))
        if val_accuracy > best_accuracy:
            best_accuracy = val_accuracy
            best_model = trained_model

if best_model is None:
    print("No best model found, trying GridSearch for SVM optimization...")
    svm_pipeline = create_model_pipeline(SVC())
    param_grid = {
        'clf__C': [0.1, 1, 10],
        'clf__kernel': ['linear', 'rbf'],
        'clf__gamma': ['scale', 'auto']
    }
    grid_search = GridSearchCV(svm_pipeline, param_grid, cv=5, verbose=2, n_jobs=-1)
    grid_search.fit(X_train, y_train)
    best_model = grid_search.best_estimator_
    best_accuracy = accuracy_score(y_val, best_model.predict(X_val))
    print(f"Best SVM model parameters: {grid_search.best_params_}")
    print(f"Best SVM model validation accuracy: {best_accuracy}")

if best_model is not None:
    test_predictions = best_model.predict(test_df['Description'])
    test_df['Genre'] = label_encoder.inverse_transform(test_predictions)

    output_file_path = 'test_predictions.txt'
    test_df[['ID', 'Title', 'Genre']].to_csv(output_file_path, sep=',', index=False, header=False)
    print(f'Predictions saved to {output_file_path}')
else: 
    print("No model was successfully trained and evaluated.")


The history saving thread hit an unexpected error (OperationalError('database or disk is full')).History will not be written to the database.
Evaluating Naive Bayes...
Accuracy: 0.459098035599004
              precision    recall  f1-score   support

      action       0.00      0.00      0.00       263
       adult       0.00      0.00      0.00       112
   adventure       0.00      0.00      0.00       139
   animation       0.00      0.00      0.00       104
   biography       0.00      0.00      0.00        61
      comedy       0.62      0.14      0.24      1443
       crime       0.00      0.00      0.00       107
 documentary       0.54      0.90      0.67      2659
       drama       0.39      0.87      0.54      2697
      family       0.00      0.00      0.00       150
     fantasy       0.00      0.00      0.00        74
   game-show       0.00      0.00      0.00        40
     history       0.00      0.00      0.00        45
      horror       1.00      0.01      0.02    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


Evaluating Logistic Regression...
