In [1]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from shared_functions import update_accuracy_in_config

# Train and persist a bag-of-words + Multinomial Naive Bayes text classifier.
#
# Steps: load the labelled CSV, hold out 30% for testing, fit a CountVectorizer
# and a MultinomialNB on the training split, report accuracy / per-class
# metrics, run a few demo predictions, then save the fitted vectorizer and
# model together and record the accuracy via update_accuracy_in_config.

# Paths are named once so the save call and the status message cannot drift apart.
DATA_PATH = '../sampled_data.csv'
MODEL_PATH = '../trained_models/NB_CV.pkl'

# Load the dataset. The file has no header row, so column names are supplied.
# quoting=3 is csv.QUOTE_NONE: quote characters are treated as ordinary text.
df = pd.read_csv(DATA_PATH, header=None, names=['text', 'label'], delimiter=',', quoting=3)

# Split data into features and labels
X = df['text']
y = df['label']

# Split data into training and testing sets (fixed seed for a repeatable split)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Convert text data to numerical data using CountVectorizer.
# The vocabulary is learned from the training split only and reused for the
# test split, so no test-set information leaks into the features.
vectorizer = CountVectorizer(max_features=5000)
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# Train a Naive Bayes model
model = MultinomialNB()
model.fit(X_train_vect, y_train)

# Evaluate the model on the held-out split
y_pred = model.predict(X_test_vect)
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(classification_report(y_test, y_pred))

# Predicting new sentences
sentences = [
    "I love programming and data science.",
    "The weather is terrible today.",
    "Let's have a meeting to discuss the project."
]

# Convert new sentences to numerical data with the already-fitted vectorizer
sentences_vect = vectorizer.transform(sentences)
predictions = model.predict(sentences_vect)

for sentence, prediction in zip(sentences, predictions):
    print(f'Sentence: "{sentence}"')
    print(f'Predicted Label: {prediction}')
    print()

# Persist the vectorizer and model as a single (vectorizer, model) tuple;
# whoever loads this file must unpack it in the same order.
joblib.dump((vectorizer, model), MODEL_PATH)

# Record the test accuracy under the 'count_naivebase' key.
# NOTE(review): update_accuracy_in_config is defined in shared_functions, not
# shown here — presumably it writes to a shared config file; confirm.
update_accuracy_in_config(accuracy, 'count_naivebase')

# Report the path that was actually written (the previous message named a
# different file, text_classification_model.pkl, which is never created).
print(f"Model saved to {MODEL_PATH}")


Accuracy: 0.66
Classification Report:
              precision    recall  f1-score   support

       anger       0.82      0.57      0.67       162
        fear       0.63      0.63      0.63       151
         joy       0.58      0.61      0.60       148
        love       0.65      0.74      0.69       143
     sadness       0.64      0.62      0.63       141
    surprise       0.68      0.79      0.73       155

    accuracy                           0.66       900
   macro avg       0.67      0.66      0.66       900
weighted avg       0.67      0.66      0.66       900

Sentence: "I love programming and data science."
Predicted Label: love

Sentence: "The weather is terrible today."
Predicted Label: sadness

Sentence: "Let's have a meeting to discuss the project."
Predicted Label: fear

Model saved to text_classification_model.pkl
