In [None]:
import pandas as pd


UCI IRVINE

In [None]:
import urllib.request
import zipfile

import pandas as pd

# Remote location of the SMS spam archive and the local filename it is saved as.
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/00228/smsspamcollection.zip'
dataset_zip = 'smsspamcollection.zip'

# Download the archive, then unpack every member into the working directory.
urllib.request.urlretrieve(url, filename=dataset_zip)
with zipfile.ZipFile(dataset_zip, mode='r') as zip_ref:
    zip_ref.extractall()

# The extracted file is read as tab-separated with no header row; the two
# columns are named explicitly.
data = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    header=None,
    names=['label', 'message'],
)
data.head()


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
import re

# Lower-case every message and replace each non-word character with a space.
data['message_cleaned'] = data['message'].map(
    lambda text: re.sub(r'\W', ' ', text.lower())
)
data.head()


Unnamed: 0,label,message,message_cleaned
0,ham,"Go until jurong point, crazy.. Available only ...",go until jurong point crazy available only ...
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry in 2 a wkly comp to win fa cup fina...
3,ham,U dun say so early hor... U c already then say...,u dun say so early hor u c already then say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah i don t think he goes to usf he lives aro...


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Numeric targets for the classifiers: ham -> 0, spam -> 1.
y = data['label'].map({'ham': 0, 'spam': 1}).values

# TF-IDF features over the cleaned messages (vocabulary capped at 5000 terms),
# converted to a dense array.
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(data['message_cleaned']).toarray()

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import pickle

# Baseline pipeline on the SMS spam CSV: load, clean, vectorize, then train and
# evaluate three classifiers on a held-out 20% split. The fitted vectorizer is
# pickled at the end.

# Load dataset and preprocess
data = pd.read_csv('SMSSpamCollection.csv', encoding='latin-1', header=None, names=['label', 'message'])

# Remove rows with a missing label/message and keep only 'ham' and 'spam' rows.
# The filtered frame is materialised with .copy() so that the column assignment
# below writes to an independent DataFrame instead of a slice of the loaded one
# (which triggers SettingWithCopyWarning).
data = data.dropna(subset=['label', 'message'])
data = data[data['label'].isin(['ham', 'spam'])].copy()

# Convert labels to numerical values (ham = 0, spam = 1)
y = data['label'].map({'ham': 0, 'spam': 1}).values

# Clean text: lower-case, then delete every character that is neither a word
# character nor whitespace.
data['message'] = data['message'].str.lower().str.replace(r'[^\w\s]', '', regex=True)

# Vectorization
# NOTE(review): the vectorizer is fitted on all rows before the split below, so
# its vocabulary/IDF weights also reflect the test messages -- confirm this is
# acceptable for the reported scores.
tfidf = TfidfVectorizer(max_features=5000)
X = tfidf.fit_transform(data['message']).toarray()

# Train-test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Naïve Bayes
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
y_pred_nb = nb_model.predict(X_test)
print("Naïve Bayes Performance:")
print(classification_report(y_test, y_pred_nb))
print(f"Accuracy: {accuracy_score(y_test, y_pred_nb):.4f}\n")

# SVM
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
print("SVM Performance:")
print(classification_report(y_test, y_pred_svm))
print(f"Accuracy: {accuracy_score(y_test, y_pred_svm):.4f}\n")

# Random Forest
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)
print("Random Forest Performance:")
print(classification_report(y_test, y_pred_rf))
print(f"Accuracy: {accuracy_score(y_test, y_pred_rf):.4f}\n")

# Save vectorizer
with open('tfidf_vectorizer.pkl', 'wb') as file:
    pickle.dump(tfidf, file)

print("TfidfVectorizer saved as 'tfidf_vectorizer.pkl'")

Naïve Bayes Performance:
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       966
           1       1.00      0.79      0.88       149

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.93      1115
weighted avg       0.97      0.97      0.97      1115

Accuracy: 0.9722

SVM Performance:
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       966
           1       0.99      0.92      0.95       149

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115

Accuracy: 0.9883

Random Forest Performance:
              precision    recall  f1-score   support

           0       0.97      1.00      0.99       966
           1       1.00      0.83      0.90       149

    accuracy                           0.98      1115
   macro avg       0.99

In [None]:
!pip install catboost


Collecting catboost
  Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl.metadata (1.2 kB)
Downloading catboost-1.2.8-cp311-cp311-manylinux2014_x86_64.whl (99.2 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m99.2/99.2 MB[0m [31m6.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: catboost
Successfully installed catboost-1.2.8


In [None]:
import re

import joblib

# Persist both trained classifiers, then reload the Naive Bayes one from disk
# to check that the saved file can be used for prediction.
joblib.dump(nb_model, 'spam_detector_nb.pkl')
joblib.dump(svm_model, 'spam_detector_svm.pkl')
model = joblib.load('spam_detector_nb.pkl')

# Test with a sample message
sample_message = ["Congratulations! We've created the spam detection model."]

# Clean the sample exactly like the training messages: lower-case, then delete
# (not space out) every character that is neither a word character nor
# whitespace. A different rule would tokenise "We've" differently from what the
# vectorizer saw during fitting.
sample_message_clean = [re.sub(r'[^\w\s]', '', msg.lower()) for msg in sample_message]

# Convert to TF-IDF/BOW format
sample_vector = tfidf.transform(sample_message_clean).toarray()

# Predict
prediction = model.predict(sample_vector)[0]
print("Spam" if prediction == 1 else "Not Spam")


Not Spam


In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

# Load and clean the dataset
data = pd.read_csv('SMSSpamCollection.csv', encoding='latin-1', header=None, names=['label', 'message'])

# Keep only the two relevant columns and only the rows the classifiers were
# trained on (no missing values, label 'ham' or 'spam'). Fitting on a different
# set of rows could give this vectorizer a different vocabulary from the one the
# saved models expect.
data = data[['label', 'message']].dropna(subset=['label', 'message'])
data = data[data['label'].isin(['ham', 'spam'])].copy()

# Lower-case, then delete every character that is neither a word character nor
# whitespace.
data['message'] = data['message'].str.lower()
data['message'] = data['message'].str.replace(r'[^\w\s]', '', regex=True)

# Initialize and fit the TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(max_features=5000)  # Adjust max_features if needed
X = tfidf_vectorizer.fit_transform(data['message'])

# Save the vectorizer as a .pkl file (overwrites any existing file of that name)
with open('tfidf_vectorizer.pkl', 'wb') as file:
    pickle.dump(tfidf_vectorizer, file)

print("TfidfVectorizer saved as 'tfidf_vectorizer.pkl'")


TfidfVectorizer saved as 'tfidf_vectorizer.pkl'


In [None]:
import pandas as pd

# Read the dataset as tab-delimited text without a header row, naming the two
# columns explicitly.
# NOTE(review): other cells read this same file with the default comma
# separator -- confirm which delimiter the file actually uses.
df = pd.read_csv(
    "SMSSpamCollection.csv",
    sep="\t",
    header=None,
    names=["label", "message"],
)


In [None]:
# Show the column labels as a plain list to confirm the expected names.
print(list(df.columns))


['label', 'message']


In [None]:

import joblib

# Write the classifier and the vectorizer to disk next to each other.
# NOTE(review): `model` was trained on features produced by `tfidf`, whereas
# `tfidf_vectorizer` was fitted in a separate cell -- confirm both share the
# same vocabulary before shipping these two files together.
joblib.dump(model, "spam_detector_nb.pkl")
joblib.dump(tfidf_vectorizer, "tfidf_vectorizer.pkl")

# Hand each saved file to the browser for download (google.colab only).
from google.colab import files

for artifact_name in ("spam_detector_nb.pkl", "tfidf_vectorizer.pkl"):
    files.download(artifact_name)


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Hand the two saved artifacts to the browser for download (google.colab only).
from google.colab import files

for artifact_name in ("spam_detector_nb.pkl", "tfidf_vectorizer.pkl"):
    files.download(artifact_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

MENDELEY

In [None]:
# Required libraries
import pandas as pd
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix


def preprocess_text(text):
    """Return ``text`` lower-cased with every ``string.punctuation`` character removed."""
    return text.lower().translate(str.maketrans('', '', string.punctuation))


# Load the raw messages; the CSV must be present in the working directory.
df = pd.read_csv("Dataset_5971.csv")

# Binary target: 0 when the label is 'ham' (case-insensitive), 1 for anything else.
df['label'] = df['LABEL'].map(lambda raw_label: int(raw_label.lower() != 'ham'))

# Normalised message text that feeds the vectorizer.
df['clean_text'] = df['TEXT'].map(preprocess_text)

# TF-IDF features: English stop words dropped, vocabulary capped at 3000 terms.
vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)
X = vectorizer.fit_transform(df['clean_text'])
y = df['label']

# Hold out 20% of the rows for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit Multinomial Naive Bayes and score it on the held-out rows.
model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print(
    "\nClassification Report:\n",
    classification_report(y_test, y_pred, target_names=["Ham", "Spam"]),
)
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.9648535564853556

Classification Report:
               precision    recall  f1-score   support

         Ham       0.96      0.99      0.98       974
        Spam       0.97      0.83      0.90       221

    accuracy                           0.96      1195
   macro avg       0.97      0.91      0.94      1195
weighted avg       0.97      0.96      0.96      1195


Confusion Matrix:
 [[969   5]
 [ 37 184]]


In [None]:
# Required libraries
import pandas as pd
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score, confusion_matrix


def preprocess_text(text):
    """Return ``text`` lower-cased with every ``string.punctuation`` character removed."""
    lowered = text.lower()
    return lowered.translate(str.maketrans('', '', string.punctuation))


# Load the raw messages; the CSV must be present in the working directory.
df = pd.read_csv("Dataset_5971.csv")

# Binary target: 0 when the label is 'ham' (case-insensitive), 1 for anything else.
df['label'] = df['LABEL'].map(lambda raw_label: int(raw_label.lower() != 'ham'))

# Normalised message text that feeds the vectorizer.
df['clean_text'] = df['TEXT'].map(preprocess_text)

# TF-IDF features: English stop words dropped, vocabulary capped at 3000 terms.
vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)
X = vectorizer.fit_transform(df['clean_text'])
y = df['label']

# Hold out 20% of the rows for evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit Multinomial Naive Bayes and predict the held-out rows.
model = MultinomialNB()
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

# Report accuracy to five decimal places, followed by the per-class report and
# the confusion matrix.
accuracy = accuracy_score(y_test, y_pred)
print(f"Accuracy: {accuracy:.5f}")
print(
    "\nClassification Report:\n",
    classification_report(y_test, y_pred, target_names=["Ham", "Spam"]),
)
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.96485

Classification Report:
               precision    recall  f1-score   support

         Ham       0.96      0.99      0.98       974
        Spam       0.97      0.83      0.90       221

    accuracy                           0.96      1195
   macro avg       0.97      0.91      0.94      1195
weighted avg       0.97      0.96      0.96      1195


Confusion Matrix:
 [[969   5]
 [ 37 184]]


In [None]:
# Required Libraries
import pandas as pd
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, classification_report

# Train and evaluate a CatBoost classifier on TF-IDF features of Dataset_5971.csv.

# Load the dataset
df = pd.read_csv('Dataset_5971.csv')

# Label Encoding: 0 when the label is 'ham' (case-insensitive), 1 for anything else
df['label'] = df['LABEL'].apply(lambda x: 0 if x.lower() == 'ham' else 1)

# Text Cleaning Function
def preprocess_text(text):
    """Return ``text`` lower-cased with every ``string.punctuation`` character removed."""
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

# Apply text cleaning
df['clean_text'] = df['TEXT'].apply(preprocess_text)

# TF-IDF Vectorization (English stop words dropped, vocabulary capped at 3000 terms)
vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)
X = vectorizer.fit_transform(df['clean_text'])
y = df['label']

# Train/Test Split (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# CatBoost Classifier (verbose=0 silences the per-iteration training log)
cat_model = CatBoostClassifier(verbose=0)
cat_model.fit(X_train, y_train)

# Prediction and Evaluation
y_pred = cat_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"CatBoost Accuracy: {accuracy:.5f}")
print("\nClassification Report:\n", classification_report(y_test, y_pred, target_names=["Ham", "Spam"]))


CatBoost Accuracy: 0.95314

Classification Report:
               precision    recall  f1-score   support

         Ham       0.95      0.99      0.97       974
        Spam       0.96      0.78      0.86       221

    accuracy                           0.95      1195
   macro avg       0.96      0.89      0.92      1195
weighted avg       0.95      0.95      0.95      1195



In [None]:
import pandas as pd
import string
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
import joblib

# Train Naive Bayes and linear SVM spam classifiers on Dataset_5971.csv, save
# them together with the fitted vectorizer, then reload and try one message.

# Load the dataset
df = pd.read_csv("Dataset_5971.csv")

# Label encoding: 0 when the label is 'ham' (case-insensitive), 1 for anything else
df['label'] = df['LABEL'].apply(lambda x: 0 if x.lower() == 'ham' else 1)

# Clean the text
def clean_text(text):
    """Return ``text`` lower-cased with every ``string.punctuation`` character removed."""
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    return text

df['clean_text'] = df['TEXT'].apply(clean_text)

# TF-IDF vectorization (English stop words dropped, vocabulary capped at 3000 terms)
tfidf = TfidfVectorizer(stop_words='english', max_features=3000)
X = tfidf.fit_transform(df['clean_text'])
y = df['label']

# Train/test split (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train models
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

svm_model = LinearSVC()
svm_model.fit(X_train, y_train)

# Save models
joblib.dump(nb_model, 'spam_detector_nb.pkl')
joblib.dump(svm_model, 'spam_detector_svm.pkl')
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')  # Save the vectorizer as well

# Load model and vectorizer
model = joblib.load('spam_detector_nb.pkl')
vectorizer = joblib.load('tfidf_vectorizer.pkl')

# Test with a sample message
sample_message = ["Congratulations! We've created the spam detection model."]

# Clean the sample with the same clean_text used on the training messages, so
# punctuation is handled identically (deleted, not replaced by spaces) and the
# tokens match what the vectorizer was fitted on.
sample_message_clean = [clean_text(msg) for msg in sample_message]
sample_vector = vectorizer.transform(sample_message_clean)

# Predict
prediction = model.predict(sample_vector)[0]
print("Spam" if prediction == 1 else "Not Spam")


Not Spam


KAGGLE

In [None]:
import pandas as pd

# Load the Kaggle CSV and print its raw column labels before any renaming.
df = pd.read_csv("kaggle dataset.csv")
print(df.columns)


Index(['  v1', 'v2', 'Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], dtype='object')


In [None]:
import pandas as pd
import re

# Read the raw Kaggle export.
df = pd.read_csv("kaggle dataset.csv")

# Strip stray whitespace from the headers, then narrow the frame to the
# label/text pair under descriptive names.
df.columns = [column.strip() for column in df.columns]
df = df[['v1', 'v2']]
df.columns = ['label', 'message']

# Numeric target: ham -> 0, spam -> 1.
df['label_num'] = df['label'].map({'ham': 0, 'spam': 1})

# Define a basic cleaning function
def basic_clean(text):
    """Normalise a message for vectorization.

    The text is lower-cased, every character other than ``a``-``z`` or
    whitespace is deleted (punctuation, digits, anything else), runs of
    whitespace are collapsed to one space, and the ends are trimmed.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    return re.sub(r'\s+', ' ', letters_only).strip()

# Apply cleaning
df['cleaned_message'] = df['message'].map(basic_clean)

# Show the label next to the cleaned text for the first few rows.
print(df[['label', 'cleaned_message']].head())


  label                                    cleaned_message
0   ham  go until jurong point crazy available only in ...
1   ham                            ok lar joking wif u oni
2  spam  free entry in a wkly comp to win fa cup final ...
3   ham        u dun say so early hor u c already then say
4   ham  nah i dont think he goes to usf he lives aroun...


In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.metrics import accuracy_score


def clean_text(text):
    """Lower-case ``text``, keep only a-z and whitespace, and collapse whitespace runs."""
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    return re.sub(r'\s+', ' ', letters_only).strip()


# Load the Kaggle CSV, tidy the headers and keep the label/text columns.
df = pd.read_csv("kaggle dataset.csv")
df.columns = [column.strip() for column in df.columns]
df = df[['v1', 'v2']]
df.columns = ['label', 'message']

# Numeric target (ham -> 0, spam -> 1) and cleaned message text.
df['label_num'] = df['label'].map({'ham': 0, 'spam': 1})
df['cleaned_message'] = df['message'].map(clean_text)

# Split the raw text first so the vectorizer only ever sees training messages.
X = df['cleaned_message']
y = df['label_num']
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit TF-IDF on the training split and reuse it for the test split.
vectorizer = TfidfVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# Candidate classifiers, keyed by the name printed in the results.
models = {
    'Naive Bayes': MultinomialNB(),
    'Random Forest': RandomForestClassifier(n_estimators=100, random_state=42),
    'SVM': LinearSVC(),
}

# Fit each classifier and print its held-out accuracy.
for name, model in models.items():
    predictions = model.fit(X_train_vec, y_train).predict(X_test_vec)
    accuracy = accuracy_score(y_test, predictions)
    print(f"{name} Accuracy: {accuracy:.5f}")


Naive Bayes Accuracy: 0.95247
Random Forest Accuracy: 0.97220
SVM Accuracy: 0.98027


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Learn the TF-IDF vocabulary from all cleaned messages, then encode them.
tfidf_vectorizer = TfidfVectorizer().fit(df['cleaned_message'])
X_tfidf = tfidf_vectorizer.transform(df['cleaned_message'])

# Report the dimensions of the resulting matrix.
print("TF-IDF matrix shape:", X_tfidf.shape)


TF-IDF matrix shape: (5572, 8512)


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Learn the Bag-of-Words vocabulary from all cleaned messages, then count terms.
count_vectorizer = CountVectorizer().fit(df['cleaned_message'])
X_bow = count_vectorizer.transform(df['cleaned_message'])

# Report the dimensions of the resulting matrix.
print("Bag of Words matrix shape:", X_bow.shape)


Bag of Words matrix shape: (5572, 8512)


In [None]:
import pandas as pd
import string
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
import joblib


def clean_text(text):
    """Normalise a message before vectorization.

    Lower-cases the text, deletes every ``string.punctuation`` character,
    deletes digit runs, collapses whitespace runs to one space and trims the ends.
    """
    no_punctuation = text.lower().translate(str.maketrans('', '', string.punctuation))
    no_digits = re.sub(r'\d+', '', no_punctuation)
    return re.sub(r'\s+', ' ', no_digits).strip()


# Load the dataset (update filename if needed) and tidy the headers.
df = pd.read_csv("kaggle dataset.csv")
df.columns = [column.strip() for column in df.columns]

# Keep the label/text pair under descriptive names.
df = df[['v1', 'v2']]
df.columns = ['label_text', 'message']

# Numeric target (ham -> 0, spam -> 1) and cleaned message text.
df['label'] = df['label_text'].map({'ham': 0, 'spam': 1})
df['clean_text'] = df['message'].map(clean_text)

# TF-IDF features: English stop words dropped, vocabulary capped at 3000 terms.
tfidf = TfidfVectorizer(stop_words='english', max_features=3000)
X = tfidf.fit_transform(df['clean_text'])
y = df['label']

# Hold out 20% of the rows (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit both classifiers on the training split.
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)

svm_model = LinearSVC()
svm_model.fit(X_train, y_train)

# Write the two models and the vectorizer to disk.
joblib.dump(nb_model, 'spam_detector_nb.pkl')
joblib.dump(svm_model, 'spam_detector_svm.pkl')
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')

# Read the Naive Bayes model and the vectorizer back from disk.
model = joblib.load('spam_detector_nb.pkl')
vectorizer = joblib.load('tfidf_vectorizer.pkl')

# Run one hand-written message through the same cleaning and vectorization.
sample_message = ["Congratulations! You've been selected for a $1000 gift card."]
sample_message_clean = list(map(clean_text, sample_message))
sample_vector = vectorizer.transform(sample_message_clean)

# Class 1 is reported as spam, anything else as not spam.
prediction = model.predict(sample_vector)[0]
print("Spam" if prediction == 1 else "Not Spam")


Spam


In [None]:
# Required Libraries
import pandas as pd
import string
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, classification_report


def preprocess_text(text):
    """Return ``text`` lower-cased with every ``string.punctuation`` character removed."""
    return text.lower().translate(str.maketrans('', '', string.punctuation))


# Load the dataset
df = pd.read_csv('Dataset_5971.csv')

# Binary target: 0 when the label is 'ham' (case-insensitive), 1 for anything else.
df['label'] = df['LABEL'].map(lambda raw_label: int(raw_label.lower() != 'ham'))

# Normalised message text that feeds the vectorizer.
df['clean_text'] = df['TEXT'].map(preprocess_text)

# TF-IDF features: English stop words dropped, vocabulary capped at 3000 terms.
vectorizer = TfidfVectorizer(stop_words='english', max_features=3000)
X = vectorizer.fit_transform(df['clean_text'])
y = df['label']

# Hold out 20% of the rows (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit CatBoost with its training log silenced.
cat_model = CatBoostClassifier(verbose=0)
cat_model.fit(X_train, y_train)

# Score the held-out rows.
y_pred = cat_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f"CatBoost Accuracy: {accuracy:.5f}")
print(
    "\nClassification Report:\n",
    classification_report(y_test, y_pred, target_names=["Ham", "Spam"]),
)


CatBoost Accuracy: 0.95314

Classification Report:
               precision    recall  f1-score   support

         Ham       0.95      0.99      0.97       974
        Spam       0.96      0.78      0.86       221

    accuracy                           0.95      1195
   macro avg       0.96      0.89      0.92      1195
weighted avg       0.95      0.95      0.95      1195



In [None]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report
from catboost import CatBoostClassifier


def clean_text(text):
    """Lower-case ``text``, keep only a-z and whitespace, and collapse whitespace runs."""
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    return re.sub(r'\s+', ' ', letters_only).strip()


# Load the Kaggle CSV, tidy the headers and keep the label/text columns.
df = pd.read_csv("kaggle dataset.csv")
df.columns = [column.strip() for column in df.columns]
df = df[['v1', 'v2']]
df.columns = ['label', 'message']

# Numeric target (ham -> 0, spam -> 1) and cleaned message text.
df['label_num'] = df['label'].map({'ham': 0, 'spam': 1})
df['cleaned_message'] = df['message'].map(clean_text)

# TF-IDF features with the vocabulary capped at 3000 terms.
vectorizer = TfidfVectorizer(max_features=3000)
X = vectorizer.fit_transform(df['cleaned_message'])
y = df['label_num']

# Hold out 20% of the rows (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Fit a seeded CatBoost model with its training log silenced.
cat_model = CatBoostClassifier(verbose=0, random_state=42)
cat_model.fit(X_train, y_train)

# Score the held-out rows.
y_pred = cat_model.predict(X_test)
print(f"Accuracy: {accuracy_score(y_test, y_pred):.5f}")
print(
    "\nClassification Report:\n",
    classification_report(y_test, y_pred, target_names=['Ham', 'Spam']),
)


Accuracy: 0.97668

Classification Report:
               precision    recall  f1-score   support

         Ham       0.97      1.00      0.99       965
        Spam       0.99      0.83      0.91       150

    accuracy                           0.98      1115
   macro avg       0.98      0.92      0.95      1115
weighted avg       0.98      0.98      0.98      1115



DATASETS

In [None]:
!pip install datasets


Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m17.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

In [None]:
from datasets import load_dataset
import pandas as pd
import re


def clean_text(text):
    """Normalise a message.

    Lower-cases the text, deletes every character other than ``a``-``z`` or
    whitespace (so digits and special characters go), collapses whitespace
    runs to a single space and trims the ends.
    """
    letters_only = re.sub(r'[^a-z\s]', '', text.lower())
    return re.sub(r'\s+', ' ', letters_only).strip()


# Load the dataset from the Hugging Face Hub.
ds = load_dataset("autoevaluate/autoeval-staging-eval-project-sms_spam-216c1ded-12215630")

# The rows are taken from the 'train' split and wrapped in a DataFrame.
data = ds['train']
df = pd.DataFrame(data)

# Add a cleaned copy of the 'text' column.
df['cleaned_text'] = df['text'].map(clean_text)

# Show raw and cleaned text side by side for the first few rows.
print(df[['text', 'cleaned_text']].head())


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md:   0%|          | 0.00/896 [00:00<?, ?B/s]

predictions.parquet:   0%|          | 0.00/332k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/5574 [00:00<?, ? examples/s]

                                                text  \
0  Got it. Seventeen pounds for seven hundred ml ...   
1  Oic cos me n my sis got no lunch today my dad ...   
2                    Yup. Thk of u oso boring wat.\n   
3  Good afternoon my boytoy. How goes that walkin...   
4  PRIVATE! Your 2003 Account Statement for 07973...   

                                        cleaned_text  
0  got it seventeen pounds for seven hundred ml h...  
1  oic cos me n my sis got no lunch today my dad ...  
2                        yup thk of u oso boring wat  
3  good afternoon my boytoy how goes that walking...  
4  private your account statement for shows unred...  


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from datasets import load_dataset
import pandas as pd

# Load the dataset and wrap its 'train' split in a DataFrame.
ds = load_dataset("autoevaluate/autoeval-staging-eval-project-sms_spam-216c1ded-12215630")
df = pd.DataFrame(ds['train'])

# The raw message text lives in the 'text' column.
messages = df['text']

# --- Bag of Words: term counts with English stop words ignored ---
vectorizer_bow = CountVectorizer(stop_words='english')
X_bow = vectorizer_bow.fit_transform(messages)

# Dense copy labelled with the learned vocabulary, for inspection.
bow_array = X_bow.toarray()
bow_df = pd.DataFrame(data=bow_array, columns=vectorizer_bow.get_feature_names_out())

# --- TF-IDF: weighted terms with English stop words ignored ---
vectorizer_tfidf = TfidfVectorizer(stop_words='english')
X_tfidf = vectorizer_tfidf.fit_transform(messages)

# Dense copy labelled with the learned vocabulary, for inspection.
tfidf_array = X_tfidf.toarray()
tfidf_df = pd.DataFrame(data=tfidf_array, columns=vectorizer_tfidf.get_feature_names_out())

# Matrix dimensions first, then the leading rows of each frame.
print("Bag of Words Shape:", bow_df.shape)
print("TF-IDF Shape:", tfidf_df.shape)

for heading, frame in (
    ("Bag of Words (first 5 rows):", bow_df),
    ("TF-IDF (first 5 rows):", tfidf_df),
):
    print(heading)
    print(frame.head())


Bag of Words Shape: (5574, 8444)
TF-IDF Shape: (5574, 8444)
Bag of Words (first 5 rows):
   00  000  000pes  008704050406  0089  0121  01223585236  01223585334  \
0   0    0       0             0     0     0            0            0   
1   0    0       0             0     0     0            0            0   
2   0    0       0             0     0     0            0            0   
3   0    0       0             0     0     0            0            0   
4   0    0       0             0     0     0            0            0   

   0125698789  02  ...  zhong  zindgi  zoe  zogtorius  zoom  zouk  zyada  èn  \
0           0   0  ...      0       0    0          0     0     0      0   0   
1           0   0  ...      0       0    0          0     0     0      0   0   
2           0   0  ...      0       0    0          0     0     0      0   0   
3           0   0  ...      0       0    0          0     0     0      0   0   
4           0   0  ...      0       0    0          0     0     0 

In [None]:
# Print the column labels of the DataFrame to see which fields are available
print(df.columns)

# Print the first rows (DataFrame.head default) to inspect the structure
print(df.head())


Index(['text', 'target', 'evaluation_predictions'], dtype='object')
                                                text  target  \
0  Got it. Seventeen pounds for seven hundred ml ...       0   
1  Oic cos me n my sis got no lunch today my dad ...       0   
2                    Yup. Thk of u oso boring wat.\n       0   
3  Good afternoon my boytoy. How goes that walkin...       0   
4  PRIVATE! Your 2003 Account Statement for 07973...       1   

    evaluation_predictions  
0   [4.4140625, -3.984375]  
1   [4.4140625, -3.984375]  
2   [4.4140625, -3.984375]  
3   [4.4140625, -3.984375]  
4  [-3.9765625, 3.5859375]  


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from datasets import load_dataset
import pandas as pd

# Compare four classifiers on TF-IDF features of the Hugging Face SMS spam set,
# using one shared 80/20 split.

# Load the dataset
ds = load_dataset("autoevaluate/autoeval-staging-eval-project-sms_spam-216c1ded-12215630")

# Convert to pandas DataFrame
df = pd.DataFrame(ds['train'])

# Inputs are the raw 'text' column; targets are the 'target' column
X = df['text']
y = df['target']

# Convert text data into numerical format using TF-IDF (English stop words ignored)
# NOTE(review): the vectorizer is fitted on all rows before the split below, so
# its vocabulary/IDF weights also reflect the test messages -- confirm this is
# acceptable for the reported scores.
vectorizer = TfidfVectorizer(stop_words='english')
X_tfidf = vectorizer.fit_transform(X)

# Split the data into training and testing sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# --- Naive Bayes ---
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
nb_pred = nb_model.predict(X_test)
nb_accuracy = accuracy_score(y_test, nb_pred)

# --- SVM ---
svm_model = SVC()
svm_model.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_pred)

# --- Random Forest ---
# Seeded so the forest, and therefore the reported accuracy, is the same on
# every run; the seed matches the one used for the split.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)

# --- CatBoost ---
catboost_model = CatBoostClassifier(learning_rate=0.1, iterations=1000, depth=6, verbose=0)
catboost_model.fit(X_train, y_train)
catboost_pred = catboost_model.predict(X_test)
catboost_accuracy = accuracy_score(y_test, catboost_pred)

# Print the accuracy for all models up to five decimal places
print(f"Naive Bayes Accuracy: {nb_accuracy:.5f}")
print(f"SVM Accuracy: {svm_accuracy:.5f}")
print(f"Random Forest Accuracy: {rf_accuracy:.5f}")
print(f"CatBoost Accuracy: {catboost_accuracy:.5f}")


Naive Bayes Accuracy: 0.97848
SVM Accuracy: 0.97937
Random Forest Accuracy: 0.98027
CatBoost Accuracy: 0.97848


In [None]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from datasets import load_dataset
import pandas as pd

# Compare four classifiers on TF-IDF features of the Hugging Face SMS spam set,
# then classify one hand-written message with each of them.

# Load the dataset
ds = load_dataset("autoevaluate/autoeval-staging-eval-project-sms_spam-216c1ded-12215630")

# Convert to pandas DataFrame
df = pd.DataFrame(ds['train'])

# Inputs are the raw 'text' column; targets are the 'target' column
X = df['text']
y = df['target']

# Convert text data into numerical format using TF-IDF (English stop words ignored)
# NOTE(review): the vectorizer is fitted on all rows before the split below, so
# its vocabulary/IDF weights also reflect the test messages -- confirm this is
# acceptable for the reported scores.
vectorizer = TfidfVectorizer(stop_words='english')
X_tfidf = vectorizer.fit_transform(X)

# Split the data into training and testing sets (20% held out, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# --- Naive Bayes ---
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
nb_pred = nb_model.predict(X_test)
nb_accuracy = accuracy_score(y_test, nb_pred)

# --- SVM ---
svm_model = SVC()
svm_model.fit(X_train, y_train)
svm_pred = svm_model.predict(X_test)
svm_accuracy = accuracy_score(y_test, svm_pred)

# --- Random Forest ---
# Seeded so the forest, and therefore the reported accuracy and sample
# prediction, are the same on every run; the seed matches the split's.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)
rf_accuracy = accuracy_score(y_test, rf_pred)

# --- CatBoost ---
catboost_model = CatBoostClassifier(learning_rate=0.1, iterations=1000, depth=6, verbose=0)
catboost_model.fit(X_train, y_train)
catboost_pred = catboost_model.predict(X_test)
catboost_accuracy = accuracy_score(y_test, catboost_pred)

# Print the accuracy for all models up to five decimal places
print(f"Naive Bayes Accuracy: {nb_accuracy:.5f}")
print(f"SVM Accuracy: {svm_accuracy:.5f}")
print(f"Random Forest Accuracy: {rf_accuracy:.5f}")
print(f"CatBoost Accuracy: {catboost_accuracy:.5f}")

# --- Testing with a Sample Message ---
sample_message = ["Free lottery win! Claim your prize now!"]

# Transform the sample message using the same fitted vectorizer (the training
# text was vectorized raw, so no extra cleaning is applied here either)
sample_message_tfidf = vectorizer.transform(sample_message)

# Predict with each model
nb_sample_pred = nb_model.predict(sample_message_tfidf)
svm_sample_pred = svm_model.predict(sample_message_tfidf)
rf_sample_pred = rf_model.predict(sample_message_tfidf)
catboost_sample_pred = catboost_model.predict(sample_message_tfidf)

# Print predictions for the sample message (class 1 is reported as spam)
print("\nSample Message Prediction:")
print(f"Naive Bayes Prediction: {'Spam' if nb_sample_pred[0] == 1 else 'Not Spam'}")
print(f"SVM Prediction: {'Spam' if svm_sample_pred[0] == 1 else 'Not Spam'}")
print(f"Random Forest Prediction: {'Spam' if rf_sample_pred[0] == 1 else 'Not Spam'}")
print(f"CatBoost Prediction: {'Spam' if catboost_sample_pred[0] == 1 else 'Not Spam'}")


Naive Bayes Accuracy: 0.97848
SVM Accuracy: 0.97937
Random Forest Accuracy: 0.98027
CatBoost Accuracy: 0.97848

Sample Message Prediction:
Naive Bayes Prediction: Spam
SVM Prediction: Spam
Random Forest Prediction: Spam
CatBoost Prediction: Spam
