In [1]:
import string
import nltk
import pandas as pd 
from nltk.corpus import stopwords
from sklearn import metrics
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

In [2]:
messages = pd.read_csv(r'dataset\spam.csv', encoding='latin-1')
messages.head() 

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
messages = messages.rename(columns={'v1': 'label', 'v2': 'message'})

# Keep only relevant columns
messages_cleaned = messages[['label', 'message']]


print(messages_cleaned.info())
print(messages_cleaned.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
None
  label                                            message
0   ham  Go until jurong point, crazy.. Available only ...
1   ham                      Ok lar... Joking wif u oni...
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...
3   ham  U dun say so early hor... U c already then say...
4   ham  Nah I don't think he goes to usf, he lives aro...


In [4]:
def text_preprocess(message):
    no_punch = [char for char in message if char not in string.punctuation]
    
    no_punch = "".join(no_punch)
    no_punch = no_punch.lower()
    
    no_stop = [ word 
               for word in no_punch.split() 
               if word.lower() not in stopwords.words("english") and word.isalpha()
    ]
    return no_stop

In [5]:
spam_messages = messages[messages["label"] == "spam"]["message"]
ham_messages = messages[messages["label"] == "ham"]["message"]
print(f"Number of spam messages: {len(spam_messages)}")
print(f"Number of ham messages: {len(ham_messages)}")

Number of spam messages: 747
Number of ham messages: 4825


In [6]:
nltk.download('stopwords')
spam_words = []
for each_message in spam_messages:
    spam_words += text_preprocess(each_message)

print(f"Top 10 Spam Words are:\n {pd.Series(spam_words).value_counts().head(10)}")

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\rajas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Top 10 Spam Words are:
 call      347
free      216
txt       150
u         147
ur        144
mobile    123
text      120
claim     113
stop      113
reply     101
Name: count, dtype: int64


In [7]:
ham_words = []
for each_message in ham_messages:
    ham_words += text_preprocess(each_message)
    
print(f"Top 10 ham words are: \n {pd.Series(ham_words).value_counts().head(10)}")

Top 10 ham words are: 
 u       972
im      449
get     303
ltgt    276
ok      272
dont    257
go      247
ur      240
ill     236
know    232
Name: count, dtype: int64


In [8]:
messages["message"]=messages["message"].apply(text_preprocess)
messages.head()

Unnamed: 0,label,message,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"[go, jurong, point, crazy, available, bugis, n...",,,
1,ham,"[ok, lar, joking, wif, u, oni]",,,
2,spam,"[free, entry, wkly, comp, win, fa, cup, final,...",,,
3,ham,"[u, dun, say, early, hor, u, c, already, say]",,,
4,ham,"[nah, dont, think, goes, usf, lives, around, t...",,,


In [9]:
# Drop all columns with "Unnamed" in their names
messages = messages.loc[:, ~messages.columns.str.contains('^Unnamed')]

# Check the cleaned dataset
print(messages.info())
print(messages.head())


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   label    5572 non-null   object
 1   message  5572 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB
None
  label                                            message
0   ham  [go, jurong, point, crazy, available, bugis, n...
1   ham                     [ok, lar, joking, wif, u, oni]
2  spam  [free, entry, wkly, comp, win, fa, cup, final,...
3   ham      [u, dun, say, early, hor, u, c, already, say]
4   ham  [nah, dont, think, goes, usf, lives, around, t...


In [10]:
messages.head()

Unnamed: 0,label,message
0,ham,"[go, jurong, point, crazy, available, bugis, n..."
1,ham,"[ok, lar, joking, wif, u, oni]"
2,spam,"[free, entry, wkly, comp, win, fa, cup, final,..."
3,ham,"[u, dun, say, early, hor, u, c, already, say]"
4,ham,"[nah, dont, think, goes, usf, lives, around, t..."


In [11]:
# Convert messages (as lists of string tokens) to strings
messages["message"] = messages["message"].agg(lambda x: " ".join(map(str, x)))
messages.head()

  messages["message"] = messages["message"].agg(lambda x: " ".join(map(str, x)))


Unnamed: 0,label,message
0,ham,go jurong point crazy available bugis n great ...
1,ham,ok lar joking wif u oni
2,spam,free entry wkly comp win fa cup final tkts may...
3,ham,u dun say early hor u c already say
4,ham,nah dont think goes usf lives around though


In [12]:
# Initialize count vectorizer
vectorizer = CountVectorizer()
bow_transformer = vectorizer.fit(messages["message"])

# Fetch the vocabulary set
print(f"20 BOW Features: {vectorizer.get_feature_names_out()[20:40]}")
print(f"Total number of vocab words: {len(vectorizer.vocabulary_)}")

20 BOW Features: ['absence' 'absolutely' 'abstract' 'abt' 'abta' 'aburo' 'abuse' 'abusers'
 'ac' 'academic' 'acc' 'accent' 'accenture' 'accept' 'access' 'accessible'
 'accidant' 'accident' 'accidentally' 'accommodation']
Total number of vocab words: 8084


In [13]:
# Convert strings to vectors using BoW
messages_bow = bow_transformer.transform(messages["message"])

# Print the shape of the sparse matrix and count the number of non-zero occurrences
print(f"Shape of sparse matrix: {messages_bow.shape}")
print(f"Amount of non-zero occurrences: {messages_bow.nnz}")

Shape of sparse matrix: (5572, 8084)
Amount of non-zero occurrences: 44211


In [14]:
# TF-IDF
from sklearn.feature_extraction.text import TfidfTransformer

tfidf_transformer = TfidfTransformer().fit(messages_bow)

# Transform entire BoW into tf-idf corpus
messages_tfidf = tfidf_transformer.transform(messages_bow)
print(messages_tfidf.shape)

(5572, 8084)


In [15]:
# Convert spam and ham labels to 0 and 1 (or, vice-versa)
FactorResult = pd.factorize(messages["label"])
messages["label"] = FactorResult[0]
messages.head()

Unnamed: 0,label,message
0,0,go jurong point crazy available bugis n great ...
1,0,ok lar joking wif u oni
2,1,free entry wkly comp win fa cup final tkts may...
3,0,u dun say early hor u c already say
4,0,nah dont think goes usf lives around though


In [16]:
# Split the dataset to train and test sets
msg_train, msg_test, label_train, label_test = train_test_split(
    messages_tfidf, messages["label"], test_size=0.2
)

print(f"train dataset features size: {msg_train.shape}")
print(f"train dataset label size: {label_train.shape}")

print(f"test dataset features size: {msg_test.shape}")
print(f"test dataset label size: {label_test.shape}")

train dataset features size: (4457, 8084)
train dataset label size: (4457,)
test dataset features size: (1115, 8084)
test dataset label size: (1115,)


In [17]:

from catboost import CatBoostClassifier

# Instantiate our model
clf = CatBoostClassifier(iterations=100,learning_rate=0.1,depth=6,random_state=42,verbose=0)

# Fit the model to the training data
clf.fit(msg_train, label_train)
predictions = clf.predict(msg_test)

from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(label_test,predictions))
print(confusion_matrix(label_test,predictions))

              precision    recall  f1-score   support

           0       0.96      1.00      0.98       971
           1       0.98      0.72      0.83       144

    accuracy                           0.96      1115
   macro avg       0.97      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115

[[969   2]
 [ 40 104]]


In [18]:
# an example prediction
print(
    "predicted:",
    clf.predict(
        tfidf_transformer.transform(bow_transformer.transform([messages["message"][9]]))
    )[0],
)
print("expected:", messages["label"][9])



predicted: 1
expected: 1


In [19]:
# print the overall accuracy of the model
label_predictions = clf.predict(msg_test)
print(f"Accuracy of the model: {metrics.accuracy_score(label_test, label_predictions):0.3f}")

Accuracy of the model: 0.962


In [21]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report

acc = accuracy_score(label_test, predictions)
print(f"Accuracy: {acc}")

# Precision, Recall, F1-Score
print(f"Precision: {precision_score(label_test, predictions)}")
print(f"Recall: {recall_score(label_test, predictions)}")
print(f"F1-Score: {f1_score(label_test, predictions)}")

# Confusion Matrix
print("Confusion Matrix:")
print(confusion_matrix(label_test, predictions))

# Classification Report
print("Classification Report:")
print(classification_report(label_test, predictions))

Accuracy: 0.9623318385650225
Precision: 0.9811320754716981
Recall: 0.7222222222222222
F1-Score: 0.832
Confusion Matrix:
[[969   2]
 [ 40 104]]
Classification Report:
              precision    recall  f1-score   support

           0       0.96      1.00      0.98       971
           1       0.98      0.72      0.83       144

    accuracy                           0.96      1115
   macro avg       0.97      0.86      0.91      1115
weighted avg       0.96      0.96      0.96      1115



In [22]:
# Preprocess the sample messages
sample_messages = ["Free money offer!", "Let's meet tomorrow for lunch."]
sample_features = vectorizer.transform(sample_messages)  # Use the same vectorizer used during training

# Predict using the preprocessed data
sample_predictions = clf.predict(sample_features)
print(sample_predictions)  # Output will be labels like 'spam' or 'ham'


CatBoostError: Bad value for num_feature[non_default_doc_idx=0,feature_idx=0]="Free money offer!": Cannot convert 'b'Free money offer!'' to float