In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# SMS spam classification: load the labelled messages, turn the text into
# TF-IDF features, then fit and compare three classifiers on the same split.

# Column names used throughout this cell (pandas labels are case-sensitive).
LABEL_COLUMN = 'Target'
TEXT_COLUMN = 'Email Text'


def fit_and_report(name, classifier, train_features, train_labels,
                   test_features, test_labels, leading_blank_line=False):
    """Fit ``classifier``, score it on the test split and print the results.

    Parameters
    ----------
    name : str
        Human-readable model name used in the printed headings.
    classifier : estimator exposing ``fit`` and ``predict``
        Fitted in place on the training split.
    train_features, train_labels
        Training split passed to ``classifier.fit``.
    test_features, test_labels
        Held-out split used for prediction and scoring.
    leading_blank_line : bool, default False
        When True, the accuracy line is preceded by an empty line so that
        consecutive reports are visually separated.

    Returns
    -------
    tuple
        ``(predictions, accuracy)`` for the test split.
    """
    classifier.fit(train_features, train_labels)
    predictions = classifier.predict(test_features)
    accuracy = accuracy_score(test_labels, predictions)

    prefix = "\n" if leading_blank_line else ""
    print(f"{prefix}{name} Accuracy:", accuracy)
    print(f"\n{name} Classification Report:")
    print(classification_report(test_labels, predictions))
    return predictions, accuracy


# Load the dataset
data = pd.read_csv('SMSSpamCollection.csv', encoding='latin-1')

# Rename the raw headers to the exact names used below. Mapping keys that are
# not present are ignored by rename, so a file that already carries the final
# headers passes through unchanged.
data = data.rename(columns={"v1": LABEL_COLUMN, "v2": TEXT_COLUMN})

# Fail early with a readable message instead of a bare KeyError further down.
missing_columns = [
    column for column in (LABEL_COLUMN, TEXT_COLUMN) if column not in data.columns
]
if missing_columns:
    raise KeyError(
        f"Expected column(s) {missing_columns} not found; "
        f"available columns: {list(data.columns)}"
    )

# Convert labels to binary (0 for 'ham' and 1 for 'spam')
data[LABEL_COLUMN] = data[LABEL_COLUMN].map({'ham': 0, 'spam': 1})

# map() yields NaN for any value outside the mapping; surface that here rather
# than letting it reach the estimators.
if data[LABEL_COLUMN].isna().any():
    raise ValueError(
        f"Column {LABEL_COLUMN!r} contains labels other than 'ham'/'spam'"
    )

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(
    data[TEXT_COLUMN], data[LABEL_COLUMN], test_size=0.2, random_state=42
)

# Vectorize the text data using TF-IDF. The vocabulary and IDF weights are
# learned from the training split only; the test split is merely transformed.
tfidf_vectorizer = TfidfVectorizer()
X_train_vectorized = tfidf_vectorizer.fit_transform(X_train)
X_test_vectorized = tfidf_vectorizer.transform(X_test)

# Train, predict with and evaluate Naive Bayes
nb_classifier = MultinomialNB()
nb_pred, nb_accuracy = fit_and_report(
    "Naive Bayes", nb_classifier,
    X_train_vectorized, y_train, X_test_vectorized, y_test,
)

# Train, predict with and evaluate SVM
svm_classifier = SVC(kernel='linear')
svm_pred, svm_accuracy = fit_and_report(
    "SVM", svm_classifier,
    X_train_vectorized, y_train, X_test_vectorized, y_test,
    leading_blank_line=True,
)

# Train, predict with and evaluate Logistic Regression
lr_classifier = LogisticRegression()
lr_pred, lr_accuracy = fit_and_report(
    "Logistic Regression", lr_classifier,
    X_train_vectorized, y_train, X_test_vectorized, y_test,
    leading_blank_line=True,
)

Naive Bayes Accuracy: 0.9596412556053812

Naive Bayes Classification Report:
              precision    recall  f1-score   support

           0       0.95      1.00      0.98       954
           1       1.00      0.72      0.84       161

    accuracy                           0.96      1115
   macro avg       0.98      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115


SVM Accuracy: 0.9865470852017937

SVM Classification Report:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       954
           1       0.99      0.91      0.95       161

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115


Logistic Regression Accuracy: 0.9721973094170404

Logistic Regression Classification Report:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       954
    

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus used to illustrate what TfidfVectorizer produces.
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Learn the vocabulary / IDF weights and vectorize the corpus in one step.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Emit the three heading/value pairs in a single call; sep="\n" puts every
# item on its own line, and the headings that start with "\n" add the blank
# separator line between sections.
print(
    "TF-IDF Matrix:",
    tfidf_matrix.toarray(),          # dense view of the sparse result
    "\nFeature Names (Terms):",
    tfidf_vectorizer.get_feature_names_out(),
    "\nShape of TF-IDF Matrix:",
    tfidf_matrix.shape,
    sep="\n",
)

TF-IDF Matrix:
[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]

Feature Names (Terms):
['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']

Shape of TF-IDF Matrix:
(4, 9)


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# NOTE(review): this cell appears to repeat the TF-IDF demonstration from the
# preceding cell line for line - consider keeping only one of them.

# Small example corpus for the TF-IDF walkthrough.
documents = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Build the vectorizer, then fit it on the corpus and vectorize in one call.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix = tfidf_vectorizer.fit_transform(documents)

# Each entry pairs a heading with the value printed beneath it.
report_sections = [
    ("TF-IDF Matrix:", tfidf_matrix.toarray()),
    ("\nFeature Names (Terms):", tfidf_vectorizer.get_feature_names_out()),
    ("\nShape of TF-IDF Matrix:", tfidf_matrix.shape),
]
for section_title, section_value in report_sections:
    print(section_title)
    print(section_value)

TF-IDF Matrix:
[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]
 [0.         0.6876236  0.         0.28108867 0.         0.53864762
  0.28108867 0.         0.28108867]
 [0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]
 [0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]

Feature Names (Terms):
['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']

Shape of TF-IDF Matrix:
(4, 9)
