In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.datasets import load_iris
from sklearn.preprocessing import StandardScaler

In [2]:
# 1. Load the Iris Dataset
iris = load_iris()
X = pd.DataFrame(iris.data, columns=iris.feature_names)
y = pd.Series(iris.target)

print("Iris Dataset (first 5 rows of features):")
print(X.head())
print("\nTarget Variable (first 5 rows):")
print(y.head())
print(f"\nTarget Names: {iris.target_names}")

Iris Dataset (first 5 rows of features):
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2
1                4.9               3.0                1.4               0.2
2                4.7               3.2                1.3               0.2
3                4.6               3.1                1.5               0.2
4                5.0               3.6                1.4               0.2

Target Variable (first 5 rows):
0    0
1    0
2    0
3    0
4    0
dtype: int64

Target Names: ['setosa' 'versicolor' 'virginica']


In [3]:
# 2. Prepare the Data (Optional Scaling)
# While Naive Bayes doesn't strictly require scaling, it's generally harmless and can sometimes help with numerical stability, especially if features have vastly different scales.
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
X_scaled_df = pd.DataFrame(X_scaled, columns=X.columns)

print("Scaled Iris Data (first 5 rows):")
print(X_scaled_df.head())

Scaled Iris Data (first 5 rows):
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0          -0.900681          1.019004          -1.340227         -1.315444
1          -1.143017         -0.131979          -1.340227         -1.315444
2          -1.385353          0.328414          -1.397064         -1.315444
3          -1.506521          0.098217          -1.283389         -1.315444
4          -1.021849          1.249201          -1.340227         -1.315444


In [4]:
# 3. Split the Data
# Using a common split size, e.g., 70% train, 30% test
X_train_scaled, X_test_scaled, y_train, y_test = train_test_split(
    X_scaled, y, test_size=0.3, random_state=42, stratify=y
)

print(f"Training data shape: {X_train_scaled.shape}, {y_train.shape}")
print(f"Testing data shape: {X_test_scaled.shape}, {y_test.shape}")

Training data shape: (105, 4), (105,)
Testing data shape: (45, 4), (45,)


In [5]:
# 4. Train the Naïve Bayes Model
# GaussianNB is used for continuous data, assuming features follow a Gaussian distribution.
gnb_model = GaussianNB()
gnb_model.fit(X_train_scaled, y_train)

print("Naïve Bayes (GaussianNB) Model Trained Successfully!")

Naïve Bayes (GaussianNB) Model Trained Successfully!


In [6]:
# 5. Make Predictions
y_pred = gnb_model.predict(X_test_scaled)
y_pred_proba = gnb_model.predict_proba(X_test_scaled) # Probabilities for each class

print("Predictions on Test Set (first 10):")
predictions_df = pd.DataFrame({
    'Actual_Label': y_test.reset_index(drop=True),
    'Actual_Species': [iris.target_names[label] for label in y_test.reset_index(drop=True)],
    'Predicted_Label': y_pred,
    'Predicted_Species': [iris.target_names[label] for label in y_pred]
})
print(predictions_df.head(10))

Predictions on Test Set (first 10):
   Actual_Label Actual_Species  Predicted_Label Predicted_Species
0             2      virginica                2         virginica
1             1     versicolor                1        versicolor
2             2      virginica                1        versicolor
3             1     versicolor                1        versicolor
4             2      virginica                2         virginica
5             2      virginica                2         virginica
6             1     versicolor                1        versicolor
7             1     versicolor                1        versicolor
8             0         setosa                0            setosa
9             2      virginica                2         virginica


In [7]:
# 6. Evaluate the Model
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=iris.target_names)

print(f"Model Accuracy: {accuracy:.2f}")
print("\nClassification Report:")
print(report)

Model Accuracy: 0.91

Classification Report:
              precision    recall  f1-score   support

      setosa       1.00      1.00      1.00        15
  versicolor       0.82      0.93      0.88        15
   virginica       0.92      0.80      0.86        15

    accuracy                           0.91        45
   macro avg       0.92      0.91      0.91        45
weighted avg       0.92      0.91      0.91        45



In [8]:
# 7. Classify a New Sample
print("Classifying a New Sample with Naïve Bayes:")

# Example new sample: sepal length=5.1, sepal width=3.5, petal length=1.4, petal width=0.2 (Looks like a Setosa)
new_sample = pd.DataFrame([[5.1, 3.5, 1.4, 0.2]], columns=iris.feature_names)
print(f"\nNew Sample to classify:\n{new_sample}")

# Crucially, scale the new sample using the *same* fitted scaler
new_sample_scaled = scaler.transform(new_sample)
new_sample_scaled_df = pd.DataFrame(new_sample_scaled, columns=iris.feature_names)

print(f"\nNew Sample Scaled:\n{new_sample_scaled_df}")

# Predict the class and probabilities for the new sample
new_sample_prediction_label = gnb_model.predict(new_sample_scaled)[0]
new_sample_prediction_species = iris.target_names[new_sample_prediction_label]
new_sample_prediction_proba = gnb_model.predict_proba(new_sample_scaled)[0]

print(f"\nThe new sample is predicted to be: '{new_sample_prediction_species}'")
print("\nProbabilities for each species:")
for i, proba in enumerate(new_sample_prediction_proba):
    print(f"  {iris.target_names[i]}: {proba:.4f}")


Classifying a New Sample with Naïve Bayes:

New Sample to classify:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0                5.1               3.5                1.4               0.2

New Sample Scaled:
   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)
0          -0.900681          1.019004          -1.340227         -1.315444

The new sample is predicted to be: 'setosa'

Probabilities for each species:
  setosa: 1.0000
  versicolor: 0.0000
  virginica: 0.0000


In [9]:
pip install pandas scikit-learn nltk




In [10]:
import pandas as pd
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import nltk
from nltk.corpus import stopwords

# Download stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# 1. Load Dataset
df = pd.read_csv("SMSSpamCollection", sep='\t', names=["label", "message"])

# 2. Preprocessing Function
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    # Remove punctuation
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Remove stopwords
    text = " ".join([word for word in text.split() if word not in stop_words])
    return text

# Apply preprocessing
df['message'] = df['message'].apply(preprocess_text)

# 3. Convert labels to binary (ham=0, spam=1)
df['label'] = df['label'].map({'ham': 0, 'spam': 1})

# 4. Train-Test Split
X_train, X_test, y_train, y_test = train_test_split(df['message'], df['label'], test_size=0.3, random_state=42)

# 5. Vectorization
vectorizer = CountVectorizer()
X_train_vect = vectorizer.fit_transform(X_train)
X_test_vect = vectorizer.transform(X_test)

# 6. Train Naïve Bayes Classifier
nb = MultinomialNB()
nb.fit(X_train_vect, y_train)

# 7. Predict and Evaluate
y_pred = nb.predict(X_test_vect)
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


FileNotFoundError: [Errno 2] No such file or directory: 'SMSSpamCollection'

In [None]:
# Import necessary libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
# Download NLTK data if not already downloaded
try:
    stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
try:
    WordNetLemmatizer().lemmatize('test')
except LookupError:
    nltk.download('wordnet')
try:
    nltk.data.find('corpora/omw-1.4')
except LookupError:
    nltk.download('omw-1.4')


# --- 1. Load the Dataset ---
# The dataset is typically a CSV file with two columns: 'v1' (label) and 'v2' (text)
# Make sure to place 'SMSSpamCollection' in the same directory as your script,
# or provide the full path to the file.
try:
    df = pd.read_csv('SMSSpamCollection', sep='\t', header=None, names=['label', 'message'])
    print("Dataset loaded successfully.")
    print(f"Number of samples: {len(df)}")
    print("First 5 rows of the dataset:")
    print(df.head())
    print("\nLabel distribution:")
    print(df['label'].value_counts())
except FileNotFoundError:
    print("Error: 'SMSSpamCollection' not found.")
    print("Please download the dataset from:")
    print("https://archive.ics.uci.edu/ml/datasets/SMS+Spam+Collection")
    print("And place it in the same directory as this script, or provide the full path.")
    exit()

# --- 2. Preprocessing the Text Data ---

# Initialize stemmer and lemmatizer
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """
    Cleans and preprocesses the text:
    1. Removes non-alphabetic characters and multiple spaces.
    2. Converts text to lowercase.
    3. Tokenizes the text (splits into words).
    4. Removes stop words.
    5. Applies lemmatization (can also use stemming).
    6. Joins the processed words back into a single string.
    """
    # Remove non-alphabetic characters and extra spaces
    text = re.sub('[^a-zA-Z]', ' ', text)
    text = re.sub(r'\s+', ' ', text).strip() # Replace multiple spaces with single space
    # Convert to lowercase
    text = text.lower()
    # Tokenize
    words = text.split()
    # Remove stop words and apply lemmatization
    processed_words = [lemmatizer.lemmatize(word) for word in words if word not in stop_words]
    # Alternatively, use stemming:
    # processed_words = [stemmer.stem(word) for word in words if word not in stop_words]
    return ' '.join(processed_words)

# Apply preprocessing to the 'message' column
print("\nPreprocessing messages...")
df['processed_message'] = df['message'].apply(preprocess_text)
print("Preprocessing complete. First 5 processed messages:")
print(df[['message', 'processed_message']].head())

# --- 3. Feature Extraction (Vectorization) ---
# Convert text messages into numerical feature vectors.
# We'll use two common methods: CountVectorizer and TfidfVectorizer.

# Option 1: CountVectorizer (Bag-of-Words)
# This converts a collection of text documents to a matrix of token counts.
print("\nApplying CountVectorizer...")
count_vectorizer = CountVectorizer()
X_counts = count_vectorizer.fit_transform(df['processed_message'])
print(f"Shape of CountVectorizer features: {X_counts.shape}")

# Option 2: TfidfVectorizer (Term Frequency-Inverse Document Frequency)
# This transforms text to feature vectors that reflect the importance of a word in a document
# relative to the entire corpus.
print("Applying TfidfVectorizer...")
tfidf_vectorizer = TfidfVectorizer()
X_tfidf = tfidf_vectorizer.fit_transform(df['processed_message'])
print(f"Shape of TfidfVectorizer features: {X_tfidf.shape}")

# Choose one for the model. TF-IDF often performs better for text classification.
X = X_tfidf
# X = X_counts

# Convert labels to numerical format (ham: 0, spam: 1)
y = df['label'].map({'ham': 0, 'spam': 1})

# --- 4. Split Data into Training and Testing Sets ---
print("\nSplitting data into training and testing sets (80/20 split)...")
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print(f"Training set size: {X_train.shape[0]} samples")
print(f"Testing set size: {X_test.shape[0]} samples")

# --- 5. Train the Naïve Bayes Classifier ---
# We use Multinomial Naïve Bayes, which is well-suited for count-based features.
print("\nTraining Multinomial Naïve Bayes classifier...")
model = MultinomialNB()
model.fit(X_train, y_train)
print("Model training complete.")

# --- 6. Make Predictions ---
print("Making predictions on the test set...")
y_pred = model.predict(X_test)

# --- 7. Evaluate the Model ---
print("\n--- Model Evaluation ---")
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print(f"Accuracy: {accuracy:.4f}")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

# --- 8. Test with Custom Messages ---
print("\n--- Testing with Custom Messages ---")
custom_messages = [
    "Congratulations! You've won a free iPhone. Click here to claim.", # Spam
    "Hey, how are you doing today? Let's catch up soon.",              # Ham
    "URGENT! Your account has been suspended. Verify your details now.", # Spam
    "Hi, just confirming our meeting for tomorrow at 10 AM.",          # Ham
    "Free entry to a contest! Text WIN to 12345.",                     # Spam
    "Call me back please, it's urgent."                                # Ham (can be tricky)
]

# Preprocess and vectorize custom messages using the *trained* vectorizer
# (Do not fit_transform again, only transform)
processed_custom_messages = [preprocess_text(msg) for msg in custom_messages]
X_custom = tfidf_vectorizer.transform(processed_custom_messages) # Use the same vectorizer as for training

# Predict labels for custom messages
predictions_custom = model.predict(X_custom)

label_map = {0: 'ham', 1: 'spam'}
for i, msg in enumerate(custom_messages):
    predicted_label = label_map[predictions_custom[i]]
    print(f"Message: '{msg}'\nPredicted: {predicted_label}\n")

