In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report
import joblib
import logging

from shared_functions import update_accuracy_in_config

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

def load_data(file_path):
    """Load data from CSV file."""
    try:
        df = pd.read_csv(file_path, header=None, names=['text', 'label'], delimiter=',', quoting=3)
        logging.info(f"Data loaded successfully. Shape: {df.shape}")
        return df
    except Exception as e:
        logging.error(f"Error loading data: {str(e)}")
        raise

def preprocess_data(df):
    """Preprocess the data."""
    X = df['text']
    y = df['label']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    logging.info(f"Data split into training and test sets. Training set size: {len(X_train)}, Test set size: {len(X_test)}")
    return X_train, X_test, y_train, y_test

def create_tfidf_vectorizer():
    """Create a TfidfVectorizer."""
    vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
    return vectorizer

def create_model():
    """Create a GradientBoostingClassifier."""
    model = GradientBoostingClassifier(random_state=42)
    return model

def train_model(vectorizer, model, X_train, y_train):
    """Train the model with GridSearchCV."""
    X_train_tfidf = vectorizer.fit_transform(X_train)
    
    param_grid = {
        'n_estimators': [100, 200],
        'learning_rate': [0.01, 0.1],
        'max_depth': [3, 5]
    }

    grid_search = GridSearchCV(model, param_grid, cv=3, n_jobs=-1, verbose=1)
    grid_search.fit(X_train_tfidf, y_train)
    
    logging.info(f"Best parameters: {grid_search.best_params_}")
    return vectorizer, grid_search.best_estimator_

def evaluate_model(vectorizer, model, X_test, y_test):
    """Evaluate the model and print results."""
    X_test_tfidf = vectorizer.transform(X_test)
    y_pred = model.predict(X_test_tfidf)
    accuracy = accuracy_score(y_test, y_pred)
    logging.info(f'Accuracy: {accuracy}')
    logging.info('Classification Report:')
    logging.info(classification_report(y_test, y_pred))
    return accuracy

def save_model(vectorizer, model, file_path):
    """Save the vectorizer and model together in a single file."""
    joblib.dump((vectorizer, model), file_path)
    logging.info(f"Vectorizer and model saved to {file_path}")

# Load data
df = load_data('../sampled_data.csv')

# Preprocess data
X_train, X_test, y_train, y_test = preprocess_data(df)

# Create and train model
vectorizer = create_tfidf_vectorizer()
model = create_model()
vectorizer, trained_model = train_model(vectorizer, model, X_train, y_train)

# Evaluate model
accuracy = evaluate_model(vectorizer, trained_model, X_test, y_test)

# Save model
save_model(vectorizer, trained_model, '../trained_models/GBC_TF.pkl')

# Update accuracy in config
update_accuracy_in_config(accuracy, 'tfidf_gradient_boosting')


2024-08-04 13:54:20,449 - INFO - Data loaded successfully. Shape: (3000, 2)
2024-08-04 13:54:20,452 - INFO - Data split into training and test sets. Training set size: 2400, Test set size: 600


Fitting 3 folds for each of 8 candidates, totalling 24 fits


2024-08-04 13:55:06,365 - INFO - Best parameters: {'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 200}
2024-08-04 13:55:06,377 - INFO - Accuracy: 0.8883333333333333
2024-08-04 13:55:06,377 - INFO - Classification Report:
2024-08-04 13:55:06,386 - INFO -               precision    recall  f1-score   support

       anger       0.95      0.88      0.92       118
        fear       0.95      0.83      0.89        99
         joy       0.77      0.90      0.83        96
        love       0.91      0.96      0.93       101
     sadness       0.91      0.76      0.83        91
    surprise       0.86      1.00      0.93        95

    accuracy                           0.89       600
   macro avg       0.89      0.89      0.89       600
weighted avg       0.89      0.89      0.89       600

2024-08-04 13:55:06,439 - INFO - Vectorizer and model saved to ../trained_models/Tfidf_vectorizer_model.pkl


In [2]:
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

def plot_confusion_matrix(y_predicted, y_true, labels):
    cm = confusion_matrix(y_true, y_predicted, normalize='true')
    fig, ax = plt.subplots(figsize=(6, 6))
    disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=labels)
    disp.plot(cmap='Blues', values_format='.2f', ax=ax, colorbar=False)
    plt.title('Normalized Confusion Matrix')
    plt.show()

# Example usage
labels = df['label'].unique()  # Extract unique labels from the DataFrame
plot_confusion_matrix(y_pred, y_test, labels)


NameError: name 'y_pred' is not defined

In [None]:
# import os

# # Define the directory and file names
# output_dir = '../python_models'
# notebook_filename = 'GBC_TF.ipynb'
# python_filename = notebook_filename.replace('.ipynb', '.py')

# # Ensure the output directory exists
# os.makedirs(output_dir, exist_ok=True)

# # Convert the notebook to a Python script
# !jupyter nbconvert --to script --output-dir {output_dir} {notebook_filename}

[NbConvertApp] Converting notebook GBC_TF.ipynb to script
[NbConvertApp] Writing 6922 bytes to ..\python_models\GBC_TF.py
