In [1]:
#practical implementation of MultinomialNB 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import make_pipeline
from sklearn.metrics import classification_report

In [2]:
#taking a demo sample data for simple understanding 
data = {
    'text': [
        "free crypto money click link",
        "subscribe to my new crypto channel",
        "hello, meeting at 10 AM?",
        "what a great video",
        "win free iphone now",
        "good morning, see you later"
    ],
    'label': [
        'spam',  # 1
        'spam',  # 1
        'ham',   # 0
        'ham',   # 0
        'spam',  # 1
        'ham'    # 0
    ]
}

In [3]:
df = pd.DataFrame(data)
df

Unnamed: 0,text,label
0,free crypto money click link,spam
1,subscribe to my new crypto channel,spam
2,"hello, meeting at 10 AM?",ham
3,what a great video,ham
4,win free iphone now,spam
5,"good morning, see you later",ham


In [4]:
df['label_num'] = df['label'].map({'spam': 1, 'ham': 0})
df

Unnamed: 0,text,label,label_num
0,free crypto money click link,spam,1
1,subscribe to my new crypto channel,spam,1
2,"hello, meeting at 10 AM?",ham,0
3,what a great video,ham,0
4,win free iphone now,spam,1
5,"good morning, see you later",ham,0


In [11]:
#splitting the data 
X = df['text']  #independent feature
y = df['label_num']  #target(dependent) variable
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [12]:
pipeline = make_pipeline(
    TfidfVectorizer(),
    MultinomialNB()  # Note: The `alpha` parameter in MultinomialNB
                     # is the Laplace Smoothing (default is 1.0)
)

In [13]:
pipeline.fit(X_train, y_train)

In [14]:
y_pred = pipeline.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.00      0.00      0.00       0.0
           1       0.00      0.00      0.00       2.0

    accuracy                           0.00       2.0
   macro avg       0.00      0.00      0.00       2.0
weighted avg       0.00      0.00      0.00       2.0



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# The pipeline.predict() method does the following:
# 1. Turns "click to win free crypto" into TF-IDF numbers.
# 2. Calculates Score(Spam) and Score(Ham) using the formulas.
# 3. Returns the class with the higher score.

In [15]:
new_comments = [
    "this is a good comment",
    "click to win free crypto"
]
predictions = pipeline.predict(new_comments)
for comment, pred in zip(new_comments, predictions):
    label = 'spam' if pred == 1 else 'ham'
    print(f"'{comment}'  -->  [{label}]")

'this is a good comment'  -->  [ham]
'click to win free crypto'  -->  [ham]
