In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import confusion_matrix, accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB
import joblib  # for saving the model
import numpy as np
# Load the dataset and clean it in a single chained expression so that the
# cell is idempotent (no inplace mutation of an already-loaded frame):
#   1. drop every row that has a missing value in any column;
#   2. parse 'Rating' once - entries that cannot be parsed become NaN;
#   3. drop the rows whose rating could not be parsed.
# Building the column with .assign() (instead of writing through .loc into a
# boolean-filtered frame) avoids SettingWithCopyWarning and ensures 'Rating'
# really has a float dtype rather than possibly remaining an object column.
df = (
    pd.read_csv('drugs.csv')
    .dropna()
    .assign(
        Rating=lambda frame: pd.to_numeric(frame['Rating'], errors='coerce').astype(float)
    )
    .dropna(subset=['Rating'])
)

# Convert text data into TF-IDF features (at most 5000 vocabulary terms).
# The result is deliberately kept as a SciPy sparse matrix: train_test_split
# and MultinomialNB both accept sparse input, whereas .toarray() would
# allocate a dense (n_documents x 5000) float array that is almost entirely
# zeros and can exhaust memory on a large corpus.
# NOTE(review): the vectorizer is fitted on the full corpus before the
# train/test split below, so the vocabulary and IDF weights are influenced by
# test documents - consider fitting on the training portion only.
vectorizer = TfidfVectorizer(max_features=5000)
X = vectorizer.fit_transform(df['Content'])

# Binary target for classification: a rating of 6 or above is labelled
# positive (1), anything lower is labelled negative (0).
y = (df['Rating'].values >= 6).astype(int)

# Hold out 20% of the samples for evaluation; the fixed seed keeps the
# partition reproducible between runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit a multinomial Naive Bayes classifier on the training portion
# (fit() returns the estimator itself, so it can be chained).
nb_model = MultinomialNB().fit(X_train, y_train)

# Save the trained Naive Bayes model together with the fitted TF-IDF
# vectorizer. The classifier alone cannot score new text: raw text has to be
# transformed with the same fitted vectorizer before it can be passed to
# nb_model.predict().
joblib.dump(nb_model, 'naive_bayes_model.joblib')
joblib.dump(vectorizer, 'tfidf_vectorizer.joblib')

# Load the Naive Bayes model and vectorizer (if needed).
# NOTE: joblib files are pickle-based - only load artifacts you trust.
# nb_model = joblib.load('naive_bayes_model.joblib')
# vectorizer = joblib.load('tfidf_vectorizer.joblib')

# Score the held-out samples with the trained Naive Bayes model.
y_pred_nb = nb_model.predict(X_test)

# Compare predictions against the true labels: first the confusion matrix,
# then the scalar metrics, evaluated in the order
# accuracy -> precision -> recall -> F1.
conf_matrix = confusion_matrix(y_test, y_pred_nb)
accuracy, precision, recall, f1 = (
    metric(y_test, y_pred_nb)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

# Report the evaluation results: the confusion matrix first, then one
# "label value" line per scalar metric.
print("Confusion Matrix (Naive Bayes):")
print(conf_matrix)
print("\nMetrics (Naive Bayes):")
for metric_label, metric_value in (
    ("Accuracy:", accuracy),
    ("Precision:", precision),
    ("Recall:", recall),
    ("F1-score:", f1),
):
    print(metric_label, metric_value)


Confusion Matrix (Naive Bayes):
[[ 9549  9519]
 [ 2017 30744]]

Metrics (Naive Bayes):
Accuracy: 0.7774219066545757
Precision: 0.7635794650175098
Recall: 0.938432892768841
F1-score: 0.8420245398773006
