# Notebook cell In [5]:
import os
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
import google.generativeai as genai
import joblib

# Configure the Gemini client from the environment.
#
# The key must be supplied through the GEMINI_API_KEY environment variable
# (e.g. `export GEMINI_API_KEY=...` before launching). Secrets must never be
# embedded in source code: any key that was ever committed in this file
# should be treated as leaked and revoked.
_gemini_api_key = os.environ.get("GEMINI_API_KEY")
if _gemini_api_key:
    genai.configure(api_key=_gemini_api_key)

def load_and_train_model(data_path='./mail_data.csv',
                         model_path='spam_model.pkl',
                         vectorizer_path='tfidf_vectorizer.pkl'):
    """Train the spam classifier from a labelled CSV and persist it to disk.

    The CSV must contain a ``Category`` column holding ``'spam'`` / ``'ham'``
    and a ``Message`` column holding the mail text. Labels are encoded as
    spam -> 0 and ham -> 1.

    Args:
        data_path: Path of the CSV file to train on.
        model_path: Where the fitted ``LogisticRegression`` is dumped.
        vectorizer_path: Where the fitted ``TfidfVectorizer`` is dumped.

    Returns:
        A ``(model, vectorizer)`` tuple with the fitted estimator and the
        fitted TF-IDF vectorizer.

    Raises:
        ValueError: If ``Category`` contains values that cannot be encoded
            as integer labels (anything other than ``'spam'`` / ``'ham'``
            or integer-like values).
    """
    # Load the data; missing cells become empty strings so no NaN reaches
    # the vectorizer.
    raw_mail_data = pd.read_csv(data_path)
    mail_data = raw_mail_data.where(pd.notnull(raw_mail_data), '')

    # Label spam mail as 0; ham mail as 1
    mail_data.loc[mail_data['Category'] == 'spam', 'Category'] = 0
    mail_data.loc[mail_data['Category'] == 'ham', 'Category'] = 1

    # Separate the data
    X = mail_data['Message']
    try:
        Y = mail_data['Category'].astype('int')
    except ValueError as err:
        # Surface unexpected labels (including blanks) with a clear message
        # instead of an opaque int() conversion error.
        raise ValueError(
            "Column 'Category' must contain only 'spam' or 'ham' labels"
        ) from err

    # The 20% hold-out is not used in this function; the split is kept so
    # the model is fitted on the same training subset as before.
    X_train, _X_test, Y_train, _Y_test = train_test_split(
        X, Y, test_size=0.2, random_state=3
    )

    # Feature extraction
    feature_extraction = TfidfVectorizer(min_df=1, stop_words='english', lowercase=True)
    X_train_features = feature_extraction.fit_transform(X_train)

    # Train the model
    model = LogisticRegression()
    model.fit(X_train_features, Y_train)

    # Save the model and vectorizer
    joblib.dump(model, model_path)
    joblib.dump(feature_extraction, vectorizer_path)

    return model, feature_extraction

def _gemini_says_spam(answer_text):
    """Return True when Gemini's free-text answer labels the email as spam.

    A plain ``"spam" in answer`` check is wrong because negated answers such
    as "not spam" also contain the word, so negations are ruled out first.
    """
    # Collapse punctuation to spaces so "Not spam.", "NOT_SPAM" and
    # "isn't spam" all normalise to space-separated lowercase words.
    words = "".join(
        ch if ch.isalnum() else " " for ch in answer_text.lower()
    ).split()
    flattened = " ".join(words)
    padded = f" {flattened} "

    negations = (
        " not spam ",
        " not a spam ",
        " isn t spam ",
        " isn t a spam ",
        " ham ",
    )
    if any(phrase in padded for phrase in negations):
        return False
    return "spam" in flattened


def predict_spam(mail_content, model, vectorizer):
    """Classify one email with the local model and with Gemini.

    Args:
        mail_content: Raw text of the email to classify.
        model: Fitted classifier exposing ``predict``, ``predict_proba`` and
            ``classes_``, with label 1 meaning ham and any other label spam.
        vectorizer: Fitted vectorizer exposing ``transform``.

    Returns:
        A dict with keys ``'model_prediction'`` and ``'ai_prediction'``
        (each ``'spam'`` or ``'not spam'``) and ``'confidence'`` (the local
        model's probability for its predicted class), or ``None`` if any
        step fails (the error is printed).
    """
    try:
        # Transform the input message using the vectorizer
        input_features = vectorizer.transform([mail_content])

        # Predict using the loaded model
        prediction = model.predict(input_features)
        predicted_label = prediction[0]

        # predict_proba columns follow model.classes_, so look the column up
        # by position instead of assuming the label equals its index.
        class_index = list(model.classes_).index(predicted_label)
        confidence = float(model.predict_proba(input_features)[0][class_index])

        # Get AI-based prediction using Gemini. The answer format is
        # constrained so the reply can be parsed reliably.
        gemini_model = genai.GenerativeModel('gemini-1.5-flash')
        prompt = (
            "Is this email spam? Answer with exactly one of: SPAM or NOT SPAM.\n\n"
            f"Email:\n{mail_content}"
        )
        response = gemini_model.generate_content(prompt)
        ai_is_spam = _gemini_says_spam(response.text)

        return {
            'model_prediction': 'not spam' if predicted_label == 1 else 'spam',
            'ai_prediction': 'spam' if ai_is_spam else 'not spam',
            'confidence': confidence
        }
    except Exception as e:
        # Best-effort boundary: callers receive None on any failure.
        print(f"An error occurred: {str(e)}")
        return None

if __name__ == '__main__':
    # Fit the classifier; this also writes both artifacts to disk.
    load_and_train_model()

    # Demo run: classify a sample message using the artifacts reloaded
    # from disk rather than the in-memory objects.
    sample_mail = "You wan 300000$"
    saved_model = joblib.load('spam_model.pkl')
    saved_vectorizer = joblib.load('tfidf_vectorizer.pkl')
    print(predict_spam(sample_mail, saved_model, saved_vectorizer))


# Recorded cell output:
# {'model_prediction': 'not spam', 'ai_prediction': 'spam', 'confidence': 0.9455836943131702}
