**Naive Bayes**

In [1]:
import os
import io
import numpy
import pandas as pd
from pandas import DataFrame


def readFiles(path):
    """Yield ``(file_path, message_body)`` for every file found under ``path``.

    The directory tree is walked recursively with ``os.walk``. In each file,
    everything up to and including the first line that is exactly a newline
    is treated as the header block and discarded; the lines after it form
    the message body.

    Args:
        path: Root directory to search.

    Yields:
        tuple[str, str]: The file's path (``root`` joined with the file name)
        and the body text of that file.
    """
    for root, _dirnames, filenames in os.walk(path):
        for filename in filenames:
            # Use a separate name so the ``path`` argument is not overwritten.
            file_path = os.path.join(root, filename)

            in_body = False
            body_lines = []
            # ``with`` guarantees the handle is closed even if reading raises.
            with io.open(file_path, 'r', encoding='latin1') as f:
                for line in f:
                    if in_body:
                        body_lines.append(line)
                    elif line == '\n':
                        # First blank line: headers end here, body starts next.
                        in_body = True

            # NOTE: each collected line still ends with its own newline, so
            # joining on '\n' doubles the line breaks. Kept as-is so the
            # returned text matches what the rest of the notebook was run on.
            yield file_path, '\n'.join(body_lines)


def dataFrameFromDirectory(path, classification):
    """Load every message under ``path`` into a DataFrame with one label.

    Args:
        path: Directory handed to ``readFiles``.
        classification: Value stored in the ``class`` column of every row.

    Returns:
        DataFrame with ``message`` and ``class`` columns, indexed by the
        file path that ``readFiles`` reported for each message.
    """
    labelled = [
        (file_path, {'message': body, 'class': classification})
        for file_path, body in readFiles(path)
    ]
    file_paths = [file_path for file_path, _ in labelled]
    records = [record for _, record in labelled]
    return DataFrame(records, index=file_paths)

# Root folder that holds the `spam/` and `ham/` sample directories.
# TODO: machine-specific absolute path; point it at your local copy of the data.
EMAILS_DIR = "C:/Users/ahmad/OneDrive/Documents/Machine Learning Course/MLCourse/emails"

# Load each labelled folder once and combine them in a single concat call.
# Seeding `data` with an empty placeholder frame is unnecessary, and recent
# pandas versions warn when an empty frame takes part in a concat.
data = pd.concat([
    dataFrameFromDirectory(EMAILS_DIR + "/spam", "spam"),
    dataFrameFromDirectory(EMAILS_DIR + "/ham", "ham"),
])


In [2]:
# Preview the first rows of the combined frame; the index is each message's source file path.
data.head()

Unnamed: 0,message,class
C:/Users/ahmad/OneDrive/Documents/Machine Learning Course/MLCourse/emails/spam\00001.7848dde101aa985090474a91ec93fcf0,"<!DOCTYPE HTML PUBLIC ""-//W3C//DTD HTML 4.0 Tr...",spam
C:/Users/ahmad/OneDrive/Documents/Machine Learning Course/MLCourse/emails/spam\00002.d94f1b97e48ed3b553b3508d116e6a09,1) Fight The Risk of Cancer!\n\nhttp://www.adc...,spam
C:/Users/ahmad/OneDrive/Documents/Machine Learning Course/MLCourse/emails/spam\00003.2ee33bc6eacdb11f38d052c44819ba6c,1) Fight The Risk of Cancer!\n\nhttp://www.adc...,spam
C:/Users/ahmad/OneDrive/Documents/Machine Learning Course/MLCourse/emails/spam\00004.eac8de8d759b7e74154f142194282724,##############################################...,spam
C:/Users/ahmad/OneDrive/Documents/Machine Learning Course/MLCourse/emails/spam\00005.57696a39d7d84318ce497886896bf90d,I thought you might like these:\n\n1) Slim Dow...,spam


We call <b><u>vectorization</u></b> the general process of turning a collection of text documents into numerical feature vectors. This specific strategy (tokenization, counting and normalization) is called the “Bag of Words” or “Bag of n-grams” representation. Documents are described by word occurrences while completely ignoring the relative position information of the words in the document.

In [3]:
from sklearn.feature_extraction.text import CountVectorizer

# Build the bag-of-words representation: fit_transform learns the vocabulary
# from all messages and returns one row of token counts per message.
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(data['message'].values)

Naive Bayes classifier for multinomial models.

The multinomial Naive Bayes classifier is suitable for classification with discrete features (e.g., word counts for text classification). The multinomial distribution normally requires integer feature counts. However, in practice, fractional counts such as tf-idf may also work.

In [4]:
from sklearn.naive_bayes import MultinomialNB

# Train multinomial Naive Bayes on the token counts. `targets` holds the
# 'spam'/'ham' label of each message, in the same row order as `counts`
# because both are derived from `data`.
classifier = MultinomialNB()
targets = data['class'].values
classifier.fit(counts, targets)

MultinomialNB()

In [5]:
# Classify two unseen messages. They go through the already-fitted vectorizer
# (transform, not fit_transform) so their columns line up with the vocabulary
# the classifier was trained on.
examples = ['Free Viagra now!!!', "Hi Bob, how about a game of golf tomorrow?"]
example_counts = vectorizer.transform(examples)
predictions = classifier.predict(example_counts)
# Bare last expression so the notebook displays the predicted labels.
predictions

array(['spam', 'ham'], dtype='<U4')

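**The same pipeline, rewritten with ChatGPT's help as more beginner-friendly code:**

Note that this version differs slightly from the original: it reads each file whole (headers included), it does not descend into subdirectories, and it labels non-spam mail `normal` instead of `ham`.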

In [6]:
import os
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB

# Function to read emails from a directory
def read_emails(directory_path, label):
    """Read every regular file directly inside ``directory_path``.

    The whole file text is kept, including any header lines before the first
    blank line (the earlier ``readFiles`` cell keeps only the body).

    Args:
        directory_path: Directory whose files are read (not recursive).
        label: Value stored under the ``'class'`` key of every record.

    Returns:
        list[dict]: One ``{'message', 'class', 'filename'}`` record per file,
        ordered by file name.
    """
    emails = []
    # os.listdir returns names in arbitrary order; sort them so the row order
    # is the same on every run.
    for filename in sorted(os.listdir(directory_path)):
        file_path = os.path.join(directory_path, filename)
        if not os.path.isfile(file_path):
            # Subdirectories (and other non-files) cannot be opened as text.
            continue
        with open(file_path, mode='r', encoding='latin1') as file:
            content = file.read()
        emails.append({'message': content, 'class': label, 'filename': filename})
    return emails

# Function to create a DataFrame from a list of emails
def create_dataframe(directory_path, label):
    """Return the records from ``read_emails`` as a pandas DataFrame.

    Args:
        directory_path: Directory passed through to ``read_emails``.
        label: Class label passed through to ``read_emails``.

    Returns:
        pd.DataFrame built from the list of record dicts, one row per record.
    """
    return pd.DataFrame(read_emails(directory_path, label))

# NOTE: this cell rebinds `data`, `vectorizer`, `counts`, `classifier`,
# `targets`, `examples`, `example_counts` and `predictions` from the cells above.

# Paths to spam and ham (normal) email directories
# TODO: machine-specific absolute paths; point them at your local copy of the data.
spam_path = "C:/Users/ahmad/OneDrive/Documents/Machine Learning Course/MLCourse/emails/spam"
normal_path = "C:/Users/ahmad/OneDrive/Documents/Machine Learning Course/MLCourse/emails/ham"

# Create DataFrames for spam and normal emails
spam_df = create_dataframe(spam_path, 'spam')
normal_df = create_dataframe(normal_path, 'normal')

# Combine DataFrames into one. Each frame arrives with its own 0..n-1 index,
# so renumber the rows to avoid duplicate index labels in the combined frame.
data = pd.concat([spam_df, normal_df], ignore_index=True)

# Vectorize the email messages (learn the vocabulary and count tokens per message)
vectorizer = CountVectorizer()
counts = vectorizer.fit_transform(data['message'].values)

# Train a Naive Bayes classifier on the counts and their 'spam'/'normal' labels
classifier = MultinomialNB()
targets = data['class'].values
classifier.fit(counts, targets)

# Examples to classify
examples = ['Free Viagra now!!!', "Hi Bob, how about a game of golf tomorrow?"]

# Vectorize the examples with the fitted vectorizer so the columns match training
example_counts = vectorizer.transform(examples)

# Predict using the trained classifier
predictions = classifier.predict(example_counts)

# Print the predictions
for example, prediction in zip(examples, predictions):
    print(f"Example: '{example}' is predicted as '{prediction}'")

Example: 'Free Viagra now!!!' is predicted as 'spam'
Example: 'Hi Bob, how about a game of golf tomorrow?' is predicted as 'normal'
