Import necessary things

In [20]:
import pandas
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.preprocessing import LabelEncoder

Import dataset from kaggle

In [3]:
import kagglehub
from kagglehub import KaggleDatasetAdapter

# Set the path to the file you'd like to load
file_path = "spam.csv"

# Load the latest version
df = kagglehub.dataset_load(
  KaggleDatasetAdapter.PANDAS,
  "uciml/sms-spam-collection-dataset",
  file_path, pandas_kwargs={"encoding": "latin-1"} 
)

print("First 5 records:", df.head())

  from .autonotebook import tqdm as notebook_tqdm


First 5 records:      v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4  
0        NaN        NaN  
1        NaN        NaN  
2        NaN        NaN  
3        NaN        NaN  
4        NaN        NaN  


Data Preprocessing

In [4]:
df.isnull().sum()

v1               0
v2               0
Unnamed: 2    5522
Unnamed: 3    5560
Unnamed: 4    5566
dtype: int64

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5572 entries, 0 to 5571
Data columns (total 5 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   v1          5572 non-null   object
 1   v2          5572 non-null   object
 2   Unnamed: 2  50 non-null     object
 3   Unnamed: 3  12 non-null     object
 4   Unnamed: 4  6 non-null      object
dtypes: object(5)
memory usage: 217.8+ KB


In [6]:
df = df[["v1", "v2"]].copy() #remove unnamed columns
df.columns = ["label", "message"]

In [7]:
le = LabelEncoder() #it encode the target column
df["label"] = le.fit_transform(df["label"])

Text cleaning

In [8]:
#text cleaning (message)
# Convert to lowercase
# Remove punctuation
# Remove numbers
# Remove stopwords (common words like is, the, a, an)
# Optionally do stemming or lemmatization

import re #remove numbers
import string #remove punctuations
#nltk natural languagle toolkit used for working with human language
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer


ps = PorterStemmer()
stop_words = set(stopwords.words('english'))

def clean_text(text):
    text = text.lower()
    text = re.sub(r'\d+', '', text)              # remove numbers
    text = text.translate(str.maketrans('', '', string.punctuation))  # remove all punctuation marks from the text using a translation table
    text = ' '.join([ps.stem(word) for word in text.split() if word not in stop_words])
    return text

df['clean_message'] = df['message'].apply(clean_text)


In [9]:
df

Unnamed: 0,label,message,clean_message
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri wkli comp win fa cup final tkt st m...
3,0,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah dont think goe usf live around though
...,...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...,nd time tri contact u u å£ pound prize claim e...
5568,0,Will Ì_ b going to esplanade fr home?,ì b go esplanad fr home
5569,0,"Pity, * was in mood for that. So...any other s...",piti mood soani suggest
5570,0,The guy did some bitching but I acted like i'd...,guy bitch act like id interest buy someth els ...


Text Vectorization

In [11]:
#text vectorization convert the text data in numerical form cause ML models can't read text
#we use TF-IDF to convert text → numbers while keeping the meaning (which words are important).

from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(max_features=3000)
X = tfidf.fit_transform(df['clean_message']).toarray() #By default, TF-IDF gives you a sparse matrix (an efficient storage format). .toarray() converts it into a normal NumPy array so it’s easier to work with
y = df['label']


Data split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.3, random_state=42)

(Optional) Feature Scaling

For text data, usually not needed because TF-IDF already normalizes values.

Train model

In [15]:
model = LogisticRegression()
model.fit(X_train, y_train)

0,1,2
,penalty,'l2'
,dual,False
,tol,0.0001
,C,1.0
,fit_intercept,True
,intercept_scaling,1
,class_weight,
,random_state,
,solver,'lbfgs'
,max_iter,100


Test Model

In [18]:
y_pred = model.predict(X_test)

Evaluate Model

In [21]:

print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))



Accuracy: 0.9491626794258373

Confusion Matrix:
 [[1448    5]
 [  80  139]]

Classification Report:
               precision    recall  f1-score   support

           0       0.95      1.00      0.97      1453
           1       0.97      0.63      0.77       219

    accuracy                           0.95      1672
   macro avg       0.96      0.82      0.87      1672
weighted avg       0.95      0.95      0.94      1672



In [22]:
sample = ["You have won $1000! Click here to claim now!"]
sample_vec = tfidf.transform(sample)
print(model.predict(sample_vec))  # 1 → spam, 0 → ham


[0]
