# In [None]:
import pandas as pd
import re
import joblib
from nltk.corpus import stopwords
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Read the SMS dataset; the path is resolved relative to the current working directory
file_path = 'spam.csv'
df = pd.read_csv(file_path, encoding='latin-1')

# Keep only the label/text columns and give them descriptive names
df = df[['v1', 'v2']].rename(columns={'v1': 'label', 'v2': 'message'})

# Turn the string labels into integers: ham -> 0, spam -> 1
label_codes = {'ham': 0, 'spam': 1}
df['label'] = df['label'].map(label_codes)

# Fetch the NLTK stop-word corpus and build the English stop-word set
# that preprocess_text consults when filtering tokens
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Normalize a raw message before it is vectorized.

    Each character the regex engine does not treat as a word character is
    replaced with a space, runs of whitespace are collapsed, the text is
    lower-cased, and tokens present in the module-level ``stop_words`` set
    are removed.

    Args:
        text: The message string to clean.

    Returns:
        The surviving tokens joined by single spaces.
    """
    spaced = re.sub(r'\W', ' ', text)
    cleaned = re.sub(r'\s+', ' ', spaced).lower()
    kept_tokens = filter(lambda token: token not in stop_words, cleaned.split())
    return ' '.join(kept_tokens)

# Clean every message with the same preprocessing that is applied at inference time
df['message'] = df['message'].apply(preprocess_text)
y = df['label']

# Train-test split
# Split the raw text BEFORE fitting the vectorizer: fitting TF-IDF on the full
# dataset would let test-set vocabulary and IDF statistics leak into training
# and make the reported accuracy optimistic.
X_train_text, X_test_text, y_train, y_test = train_test_split(
    df['message'], y, test_size=0.2, random_state=42
)

# Feature extraction
# Learn the vocabulary and IDF weights from the training messages only, then
# apply the fitted vectorizer to the held-out messages. The matrices are kept
# sparse: MultinomialNB accepts sparse input, so no dense copy is needed.
tfidf = TfidfVectorizer(max_features=3000)
X_train = tfidf.fit_transform(X_train_text)
X_test = tfidf.transform(X_test_text)

# Train the Naive Bayes model
model = MultinomialNB()
model.fit(X_train, y_train)
print('Naive Bayes model trained.')

# Evaluate the model on the held-out split
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Naive Bayes Accuracy: {accuracy * 100:.2f}%')  # Convert accuracy to percentage and print

# Persist the fitted classifier and vectorizer so they can be reused without retraining
model_path = 'sms_spam_classifier_naive_bayes.pkl'
vectorizer_path = 'tfidf_vectorizer.pkl'
joblib.dump(model, model_path)
joblib.dump(tfidf, vectorizer_path)
print("Naive Bayes model and TF-IDF vectorizer saved successfully.")

# Read both artifacts back from disk; from here on `model` and `tfidf`
# refer to the reloaded objects
model = joblib.load(model_path)
tfidf = joblib.load(vectorizer_path)

# Function for text preprocessing and classification
def classify_message(input_message):
    """Classify a single message as ``'ham'`` or ``'spam'``.

    The message is cleaned with ``preprocess_text``, vectorized with the
    module-level ``tfidf`` vectorizer, and scored by the module-level
    ``model``.

    Args:
        input_message: Raw message text.

    Returns:
        ``'ham'`` when the model predicts 0, ``'spam'`` when it predicts 1.
    """
    cleaned = preprocess_text(input_message)
    features = tfidf.transform([cleaned]).toarray()
    predicted_class = model.predict(features)[0]
    return {0: 'ham', 1: 'spam'}[predicted_class]

# Example of using the function with user input.
# Guarded so the blocking prompt only runs when this file is executed directly
# (as a script or in a notebook cell), not when it is imported.
if __name__ == "__main__":
    user_input = input("Enter a message: ")
    classification_result = classify_message(user_input)
    print(f"The message '{user_input}' is classified as: {classification_result}")
