In [2]:
pip install pgmpy pandas

Collecting pgmpy
  Downloading pgmpy-0.1.26-py3-none-any.whl.metadata (9.1 kB)
Downloading pgmpy-0.1.26-py3-none-any.whl (2.0 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/2.0 MB[0m [31m36.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pgmpy
Successfully installed pgmpy-0.1.26


In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from pgmpy.models import MarkovNetwork
from pgmpy.factors.discrete import DiscreteFactor
from pgmpy.inference import Mplp

In [4]:
data = {
    'text': [
        "Win a free lottery now", "Cheap meds available", "Meeting at 5 PM",
        "Get your diploma now", "Let's catch up tomorrow", "Urgent: Account update required",
        "Win big prizes now", "Hello, how are you?", "Final chance to win lottery",
        "Meeting schedule attached"
    ],
    'label': [
        "spam", "spam", "not spam", "spam", "not spam", "spam",
        "spam", "not spam", "spam", "not spam"
    ]
}

In [5]:
df = pd.DataFrame(data)

In [6]:
vectorizer = CountVectorizer(binary=True)
X = vectorizer.fit_transform(df['text']).toarray()
feature_names = vectorizer.get_feature_names_out()

In [7]:
label_encoder = LabelEncoder()
y = label_encoder.fit_transform(df['label'])

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)


In [9]:
model = MarkovNetwork()

for i, feature in enumerate(feature_names):
    model.add_nodes_from([feature])

model.add_edges_from([
    (feature_names[0], feature_names[1]),
    (feature_names[1], feature_names[2]),
    (feature_names[2], feature_names[3]),
    (feature_names[3], feature_names[4])
])

In [18]:
factors = []
for i, feature in enumerate(feature_names[:5]):
    factor_values = np.array([0.9, 0.1]) if i % 2 == 0 else np.array([0.1, 0.9])
    factor = DiscreteFactor(variables=[feature], cardinality=[2], values=factor_values)
    factors.append(factor)

# Add factors to the model
for factor in factors:
    model.add_factors(factor)

In [20]:
mplp_infer = Mplp(model)


In [21]:
result = mplp_infer.map_query()

In [22]:
print("Inference results (MAP estimate for all features):")
print(result)


Inference results (MAP estimate for all features):
{'account': 0, 'are': 1, 'at': 0, 'attached': 1, 'available': 0, 'big': 0, 'catch': 0, 'chance': 0, 'cheap': 0, 'diploma': 0, 'final': 0, 'free': 0, 'get': 0, 'hello': 0, 'how': 0, 'let': 0, 'lottery': 0, 'meds': 0, 'meeting': 0, 'now': 0, 'pm': 0, 'prizes': 0, 'required': 0, 'schedule': 0, 'to': 0, 'tomorrow': 0, 'up': 0, 'update': 0, 'urgent': 0, 'win': 0, 'you': 0, 'your': 0}


In [24]:
def predict_spam(email_text):
    email_vector = vectorizer.transform([email_text]).toarray().flatten()
    email_features = {feature_names[i]: email_vector[i] for i in range(len(feature_names))}

    # Inference based on the evidence provided
    # We extract the features that are present in the email for inference
    result_with_evidence = mplp_infer.map_query()

    # Assume the first feature decides if it's spam or not, for illustration purposes
    return "spam" if result_with_evidence.get(feature_names[0], 0) == 1 else "not spam"

# Test prediction
test_email = "Win a big prize now"
print(f"Prediction for '{test_email}': {predict_spam(test_email)}")

Prediction for 'Win a big prize now': not spam
