In [4]:
import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier, VotingClassifier
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Read the cleaned spam dataset from disk.
df = pd.read_csv("D:/Master/Master Dissertation/Dissertation/cleaned_spamdata.csv")

# Normalise the message column in two steps: missing entries become empty
# strings first, then everything is lower-cased.
messages = df['v2'].fillna('')
df['v2'] = messages.str.lower()

# 'v2' holds the message text (model input); 'v1' holds the label (target).
X_text, y = df['v2'], df['v1']

# Feature Engineering
# TF-IDF over unigrams and bigrams, capped at 5000 features; max_df / min_df
# drop terms that appear in more than 95% or fewer than 1% of the messages.
vectorizer = TfidfVectorizer(max_features=5000, ngram_range=(1, 2), max_df=0.95, min_df=0.01)
X_tfidf = vectorizer.fit_transform(X_text).toarray()
# NOTE(review): the vectorizer is still fitted on every message (train and test
# alike) because the fitted object is what gets pickled for prediction. For a
# strictly leak-free evaluation, fit it on the training messages only.

# Split data into training and test sets BEFORE resampling.
# Oversampling first would let SMOTE interpolate new rows from samples that
# later end up in the test set, and would put synthetic rows into the test set
# itself, so any score computed on X_test would be optimistic. `stratify=y`
# keeps the original class ratio in both partitions.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y, test_size=0.2, random_state=5, stratify=y
)

# Resampling to balance the classes -- training portion only.
smote = SMOTE(random_state=42)
X_resampled, y_resampled = smote.fit_resample(X_train, y_train)

# The balanced training data is what the classifiers are fitted on; X_test and
# y_test stay untouched, real, and at the original class distribution.
X_train, y_train = X_resampled, y_resampled

# Base learners: two tree ensembles with the same tree count and seed.
extra_trees = ExtraTreesClassifier(random_state=50, n_estimators=1500)
random_forest = RandomForestClassifier(
    class_weight='balanced',
    n_estimators=1500,
    random_state=50,
)

# Combine both base learners in a soft-voting ensemble.
ensemble_members = [
    ('extra_trees', extra_trees),
    ('random_forest', random_forest),
]
voting_classifier = VotingClassifier(estimators=ensemble_members, voting='soft')

# Fit the ensemble on the training split.
voting_classifier.fit(X_train, y_train)

# Persist the fitted ensemble and the fitted TF-IDF vectorizer as pickle files,
# model first, vectorizer second.
for artifact, destination in (
    (voting_classifier, r'D:\Master\Master Dissertation\Dissertation\voting_classifier_model_soft.pkl'),
    (vectorizer, r'D:\Master\Master Dissertation\Dissertation\tfidf_vectorizer.pkl'),
):
    with open(destination, 'wb') as handle:
        pickle.dump(artifact, handle)

print("Model and vectorizer have been saved successfully as pickle files.")

# Locations of the pickled artifacts that predict_spam reads.
_MODEL_PATH = r'D:\Master\Master Dissertation\Dissertation\voting_classifier_model_soft.pkl'
_VECTORIZER_PATH = r'D:\Master\Master Dissertation\Dissertation\tfidf_vectorizer.pkl'

# Unpickled artifacts keyed by file path, so repeated predictions do not
# re-read and re-deserialise the files on every call. Re-running this code
# resets the cache; if the pickle files are rewritten without re-running it,
# call _ARTIFACT_CACHE.clear() to pick up the new versions.
_ARTIFACT_CACHE = {}


def _load_artifact(path):
    """Return the object pickled at *path*, unpickling it only on first use."""
    if path not in _ARTIFACT_CACHE:
        # SECURITY: pickle.load can execute arbitrary code. Only point this at
        # files produced by a trusted training run.
        with open(path, 'rb') as artifact_file:
            _ARTIFACT_CACHE[path] = pickle.load(artifact_file)
    return _ARTIFACT_CACHE[path]


# Function to predict if a message is spam or ham
def predict_spam(message):
    """Classify a single message with the saved voting classifier.

    Args:
        message: The message text. ``None`` and NaN are treated as an empty
            message, mirroring the ``fillna('')`` applied to the training data.

    Returns:
        The predicted label for the message, i.e. one of the values the model
        was trained on (the captured output shows 'spam' / 'ham').

    Raises:
        FileNotFoundError: If a pickle file is missing on first use.
        AttributeError: If ``message`` is neither a string nor None/NaN.
    """
    # Apply the same preprocessing the training text received: missing values
    # become '' and the text is lower-cased. `message != message` is only true
    # for NaN.
    if message is None or message != message:
        message = ''
    text = message.lower()

    model = _load_artifact(_MODEL_PATH)
    vectorizer = _load_artifact(_VECTORIZER_PATH)

    # The vectorizer expects an iterable of documents, hence the one-item list.
    message_vectorized = vectorizer.transform([text])

    # predict() returns one label per input row; there is exactly one row.
    prediction = model.predict(message_vectorized)
    return prediction[0]

# Hand-written sample messages used to spot-check the saved model.
new_data = [
    "Congratulations! You've won a prize. Call now.",
    "Hey, are we still meeting for coffee later?",
    "Hi, you are ok?",
    "Hey, are we still meeting at 3 PM today?",
    "I need help with my homework. Can you explain this math problem?",
    "Dinner at my place tonight? I’ll make spaghetti!",
    "Just wanted to check in and see how you're doing.",
    "Reminder: Your appointment with the dentist is tomorrow at 10 AM.",
    "Congratulations! You've won a $1000 gift card. Click here to claim it now!",
    "Limited time offer! Buy one, get one free on all products. Hurry, act now!",
    "Urgent! Your account has been compromised. Please log in immediately to secure it.",
    "You have a special offer waiting! Call now for a free consultation.",
    "Get rich quick! Invest in this cryptocurrency and double your money in days!"
]

# Classify each sample in order and print it next to its predicted label.
# map() is lazy, so each prediction is still computed right before its print.
for message, result in zip(new_data, map(predict_spam, new_data)):
    print(f"Message: {message}\nPrediction: {result}\n")







Model and vectorizer have been saved successfully as pickle files.
Message: Congratulations! You've won a prize. Call now.
Prediction: spam

Message: Hey, are we still meeting for coffee later?
Prediction: ham

Message: Hi, you are ok?
Prediction: ham

Message: Hey, are we still meeting at 3 PM today?
Prediction: ham

Message: I need help with my homework. Can you explain this math problem?
Prediction: ham

Message: Dinner at my place tonight? I’ll make spaghetti!
Prediction: ham

Message: Just wanted to check in and see how you're doing.
Prediction: ham

Message: Reminder: Your appointment with the dentist is tomorrow at 10 AM.
Prediction: ham

Message: Congratulations! You've won a $1000 gift card. Click here to claim it now!
Prediction: spam

Message: Limited time offer! Buy one, get one free on all products. Hurry, act now!
Prediction: spam

Message: Urgent! Your account has been compromised. Please log in immediately to secure it.
Prediction: spam

Message: You have a special offe