In [2]:
import pandas as pd
import joblib
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.metrics import accuracy_score, classification_report
from shared_functions import update_accuracy_in_config

# Load the dataset: a headerless, comma-separated file with one
# (text, label) pair per row.
df = pd.read_csv(
    '../sampled_data.csv',
    header=None,
    names=['text', 'label'],
    delimiter=',',
    quoting=3,  # 3 is csv.QUOTE_NONE: quote characters are kept as literal text
)

# Separate the raw text (features) from the target labels
X, y = df['text'], df['label']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

# Turn the text into TF-IDF vectors. The vocabulary (capped at 5000 terms)
# is learned from the training split only and then reused for the test split.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# Feature Selection using Chi-Square
def select_features_chi2(X_train, y_train, X_test, k=1000):
    """Keep the k features with the highest chi-square score against the labels.

    The selector is fitted on the training split only and then applied
    unchanged to the test split, so both outputs share the same columns.

    Args:
        X_train: Training feature matrix. chi2 expects non-negative values
            (e.g. TF-IDF weights or counts).
        y_train: Training labels.
        X_test: Test feature matrix with the same columns as X_train.
        k: Number of features to keep, or 'all'. An integer larger than the
            number of columns in X_train is clamped to that number.

    Returns:
        Tuple of (fitted SelectKBest, reduced X_train, reduced X_test).
    """
    # Asking for more features than exist is an error on older scikit-learn
    # releases and a warning on newer ones; clamp so a small vocabulary
    # simply keeps every column. Non-integer k (such as 'all') and inputs
    # without a 2-D shape are passed through untouched.
    shape = getattr(X_train, 'shape', None)
    if isinstance(k, int) and shape is not None and len(shape) == 2:
        k = min(k, shape[1])

    chi2_selector = SelectKBest(chi2, k=k)
    X_train_selected = chi2_selector.fit_transform(X_train, y_train)
    X_test_selected = chi2_selector.transform(X_test)
    return chi2_selector, X_train_selected, X_test_selected

# Fit a Multinomial Naive Bayes classifier on the selected features and score it
def train_and_evaluate(X_train, X_test, y_train, y_test, feature_selector):
    """Reduce the features, fit a MultinomialNB model and report test metrics.

    Prints the accuracy and the per-class classification report.

    Args:
        X_train: Vectorized training features.
        X_test: Vectorized test features.
        y_train: Training labels.
        y_test: Test labels.
        feature_selector: Callable invoked as
            ``feature_selector(X_train, y_train, X_test)`` that returns
            ``(selector, X_train_selected, X_test_selected)``.

    Returns:
        Tuple ``(model, vectorizer, selector, accuracy)``.

    Note:
        ``vectorizer`` is not a parameter; the module-level object of that
        name is returned as-is, so it must exist before this is called.
    """
    selector, train_reduced, test_reduced = feature_selector(X_train, y_train, X_test)

    # Fit the classifier on the reduced training matrix (fit() returns the estimator)
    model = MultinomialNB().fit(train_reduced, y_train)

    # Score on the held-out split
    predicted = model.predict(test_reduced)
    accuracy = accuracy_score(y_test, predicted)
    print(f'Accuracy: {accuracy}')
    print('Classification Report:')
    print(classification_report(y_test, predicted))

    return model, vectorizer, selector, accuracy

# Perform feature selection and model training
def feature_selector(X_train, y_train, X_test):
    """Run chi-square selection keeping 1000 features, using the three-argument form train_and_evaluate calls."""
    return select_features_chi2(X_train, y_train, X_test, k=1000)


model, vectorizer, selector, accuracy = train_and_evaluate(
    X_train_vect, X_test_vect, y_train, y_test, feature_selector
)

# Example inputs used to try out the trained pipeline
sentences = [
    "I love programming and data science.",
    "The weather is terrible today.",
    "Let's have a meeting to discuss the project."
]

# Run the sentences through the same stages as the training data:
# TF-IDF vectorization, then chi-square feature selection, then the classifier
sentences_vect = vectorizer.transform(sentences)
sentences_selected = selector.transform(sentences_vect)
predictions = model.predict(sentences_selected)

# Show each sentence with its predicted label, followed by a blank line
for sentence, prediction in zip(sentences, predictions):
    print(f'Sentence: "{sentence}"', f'Predicted Label: {prediction}', '', sep='\n')

# Persist the fitted vectorizer, classifier and feature selector together
# as a single tuple; loaders must unpack it in the order
# (vectorizer, model, selector).
joblib.dump(
    (vectorizer, model, selector),
    '../trained_models/NB_TF.pkl',
)

# Hand the test accuracy to the shared helper under the 'tfidf_naivebase' key.
# NOTE(review): presumably this writes to a project config file - confirm in
# shared_functions.
update_accuracy_in_config(accuracy, 'tfidf_naivebase')

print("Model saved to NB_TF.pkl")


Accuracy: 0.7755555555555556
Classification Report:
              precision    recall  f1-score   support

       anger       0.92      0.74      0.82       162
        fear       0.86      0.75      0.80       151
         joy       0.85      0.58      0.69       148
        love       0.67      0.93      0.78       143
     sadness       0.69      0.72      0.71       141
    surprise       0.75      0.93      0.83       155

    accuracy                           0.78       900
   macro avg       0.79      0.78      0.77       900
weighted avg       0.79      0.78      0.77       900

Sentence: "I love programming and data science."
Predicted Label: love

Sentence: "The weather is terrible today."
Predicted Label: sadness

Sentence: "Let's have a meeting to discuss the project."
Predicted Label: fear

Model saved to NB_TF.pkl
