In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
import joblib
import logging

from features import calculate_text_features_gbc_cv as calculate_text_features
from shared_functions import update_accuracy_in_config

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def load_data(file_path):
    """Load the labelled text dataset from a header-less CSV file.

    The file is read with two columns, named ``'text'`` and ``'label'``,
    using ``,`` as the delimiter. Quote characters are treated as ordinary
    text (``csv.QUOTE_NONE``) instead of as field-quoting markers.

    Args:
        file_path: Path (or any source accepted by ``pandas.read_csv``)
            of the CSV file to load.

    Returns:
        pandas.DataFrame: The loaded data with columns ``text`` and ``label``.

    Raises:
        Exception: Any error raised by ``pandas.read_csv`` is logged and
            then re-raised unchanged.
    """
    # Local import keeps this function self-contained; csv.QUOTE_NONE is the
    # named equivalent of the bare ``3`` pandas accepts for ``quoting``.
    import csv

    try:
        df = pd.read_csv(
            file_path,
            header=None,
            names=['text', 'label'],
            delimiter=',',
            quoting=csv.QUOTE_NONE,
        )
        logging.info(f"Data loaded successfully. Shape: {df.shape}")
        return df
    except Exception as e:
        # Log for the run record, then let the caller decide how to react.
        logging.error(f"Error loading data: {str(e)}")
        raise

def preprocess_data(df, test_size=0.2, random_state=42):
    """Split the dataset into training and test portions.

    Args:
        df: DataFrame containing a ``'text'`` column (model input) and a
            ``'label'`` column (target).
        test_size: Value forwarded to ``train_test_split`` as ``test_size``.
            Defaults to 0.2, the value previously hard-coded here.
        random_state: Seed forwarded to ``train_test_split`` so the split is
            reproducible. Defaults to 42, the value previously hard-coded.

    Returns:
        tuple: ``(X_train, X_test, y_train, y_test)`` as returned by
        ``train_test_split``.
    """
    X = df['text']
    y = df['label']
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    logging.info(f"Data split into training and test sets. Training set size: {len(X_train)}, Test set size: {len(X_test)}")
    return X_train, X_test, y_train, y_test

def create_model():
    """Build the (unfitted) text-classification pipeline.

    Returns:
        Pipeline: A two-step pipeline. The ``'tfidf'`` step is a
        ``TfidfVectorizer`` capped at 5000 features with English stop words;
        the ``'classifier'`` step is a ``GradientBoostingClassifier`` seeded
        with ``random_state=42``.
    """
    # Step names matter: the hyper-parameter grid addresses them by prefix
    # (e.g. ``classifier__n_estimators``).
    steps = [
        ('tfidf', TfidfVectorizer(stop_words='english', max_features=5000)),
        ('classifier', GradientBoostingClassifier(random_state=42)),
    ]
    return Pipeline(steps)

def train_model(model, X_train, y_train):
    """Tune and fit the model with an exhaustive grid search.

    Eight hyper-parameter combinations of the ``'classifier'`` step are
    evaluated with 3-fold cross-validation, using all available cores.

    Args:
        model: Estimator to tune; it must expose a step named
            ``'classifier'`` for the grid keys to resolve.
        X_train: Training inputs.
        y_train: Training labels.

    Returns:
        The ``best_estimator_`` found by the search.
    """
    search = GridSearchCV(
        estimator=model,
        param_grid={
            'classifier__n_estimators': [100, 200],
            'classifier__learning_rate': [0.01, 0.1],
            'classifier__max_depth': [3, 5],
        },
        cv=3,
        n_jobs=-1,
        verbose=1,
    )
    search.fit(X_train, y_train)

    logging.info(f"Best parameters: {search.best_params_}")
    return search.best_estimator_

def evaluate_model(model, X_test, y_test):
    """Score the model on held-out data and log the results.

    Args:
        model: Fitted estimator exposing ``predict``.
        X_test: Held-out inputs.
        y_test: True labels for ``X_test``.

    Returns:
        The accuracy computed by ``accuracy_score`` on the predictions.
    """
    predictions = model.predict(X_test)

    score = accuracy_score(y_test, predictions)
    logging.info(f'Accuracy: {score}')

    logging.info('Classification Report:')
    report = classification_report(y_test, predictions)
    logging.info(report)

    return score

def save_model(vectorizer, model, file_path):
    """Persist the vectorizer and trained model together with joblib.

    The two objects are stored as a single ``(vectorizer, model)`` tuple, so
    loading the file yields them in that order.

    Args:
        vectorizer: Vectorizer object to store alongside the model.
        model: Trained model to store.
        file_path: Destination passed to ``joblib.dump``. When it is a
            string or path-like object, its parent directory is created
            first if it does not already exist.
    """
    import os

    # joblib.dump does not create missing directories; do it up front so a
    # fresh checkout without the output folder does not lose a trained model.
    # Non-path targets (e.g. open file objects) are passed through untouched.
    if isinstance(file_path, (str, os.PathLike)):
        parent_dir = os.path.dirname(os.fspath(file_path))
        if parent_dir:
            os.makedirs(parent_dir, exist_ok=True)

    joblib.dump((vectorizer, model), file_path)
    logging.info(f"Model and vectorizer saved to {file_path}")

# Load data
df = load_data('../sampled_data.csv')

# Preprocess data (train/test split)
X_train, X_test, y_train, y_test = preprocess_data(df)

# Create and train model
model = create_model()
trained_model = train_model(model, X_train, y_train)

# Evaluate model on the held-out test set
accuracy = evaluate_model(trained_model, X_test, y_test)

# Save model and vectorizer.
# The vectorizer must be the *fitted* one that lives inside the trained
# pipeline; constructing a fresh TfidfVectorizer here would pickle an
# unfitted object that raises NotFittedError on transform(). The saved tuple
# keeps its (vectorizer, pipeline) layout. Note the pipeline itself still
# expects raw text, since it contains its own 'tfidf' step.
vectorizer = trained_model.named_steps['tfidf']
save_model(vectorizer, trained_model, '../trained_models/GBC_CV.pkl')

# Update accuracy in config
update_accuracy_in_config(accuracy, 'count_gradient_boosting')


2024-08-02 14:55:28,363 - INFO - Data loaded successfully. Shape: (3000, 2)
2024-08-02 14:55:28,369 - INFO - Data split into training and test sets. Training set size: 2400, Test set size: 600


Fitting 3 folds for each of 8 candidates, totalling 24 fits


2024-08-02 14:55:53,620 - INFO - Best parameters: {'classifier__learning_rate': 0.1, 'classifier__max_depth': 5, 'classifier__n_estimators': 200}
2024-08-02 14:55:53,638 - INFO - Accuracy: 0.8916666666666667
2024-08-02 14:55:53,639 - INFO - Classification Report:
2024-08-02 14:55:53,647 - INFO -               precision    recall  f1-score   support

       anger       0.95      0.89      0.92       118
        fear       0.94      0.83      0.88        99
         joy       0.79      0.91      0.84        96
        love       0.90      0.96      0.93       101
     sadness       0.92      0.77      0.84        91
    surprise       0.86      0.99      0.92        95

    accuracy                           0.89       600
   macro avg       0.89      0.89      0.89       600
weighted avg       0.90      0.89      0.89       600

2024-08-02 14:55:53,703 - INFO - Model and vectorizer saved to ../trained_models/GBC_CV.pkl
