In [1]:
#importing libraries
from pathlib import Path

import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import train_test_split

In [2]:
# Fetch the SMS Spam Collection dataset through kagglehub
import kagglehub

# Kaggle handle of the dataset, in "<owner>/<dataset>" form
dataset_handle = "uciml/sms-spam-collection-dataset"

# Resolve the latest version of the dataset; the call returns the local
# directory that holds the dataset files.
path = kagglehub.dataset_download(dataset_handle)

# Show where the files live so the CSV inside can be located
print("Path to dataset files:", path)

Using Colab cache for faster access to the 'sms-spam-collection-dataset' dataset.
Path to dataset files: /kaggle/input/sms-spam-collection-dataset


In [3]:
# Load the SMS spam data, train a TF-IDF + logistic-regression classifier,
# and report how well it separates spam from ham on a held-out test set.

TEST_SIZE = 0.2     # fraction of the messages held out for evaluation
RANDOM_STATE = 42   # fixed seed so the shuffle/split is identical on every run

# Build the CSV location from the directory kagglehub returned instead of a
# hard-coded /kaggle/input/... path, so the cell also works when the dataset
# is stored somewhere else.
csv_path = Path(path) / "spam.csv"

# Keep only the two columns that are used: v1 (ham/spam tag) and v2 (message text)
df = pd.read_csv(csv_path, encoding="latin-1")[["v1", "v2"]]
df.columns = ["label", "message"]  # column 1 = spam/ham identifier, column 2 = message

# Encode the text labels as integers: ham -> 0, spam -> 1
df["label"] = df["label"].map({"ham": 0, "spam": 1})
# .map() silently yields NaN for any value missing from the dict; fail early
# with a clear message rather than deep inside model fitting.
assert df["label"].notna().all(), "spam.csv contains a label other than 'ham' or 'spam'"

# Randomly split into train and test subsets (80% train / 20% test).
# NOTE(review): the split is not stratified and the classes are imbalanced;
# consider stratify=df["label"] and/or class_weight="balanced" if spam recall
# matters - both would change the reported numbers.
X_train, X_test, y_train, y_test = train_test_split(
    df["message"],
    df["label"],
    test_size=TEST_SIZE,
    random_state=RANDOM_STATE,
)

# Convert the message text to a matrix of TF-IDF features.
# stop_words="english" drops common English stop words; max_df=0.7 ignores
# terms that appear in more than 70% of the documents.
vectorizer = TfidfVectorizer(stop_words="english", max_df=0.7)
X_train_tfidf = vectorizer.fit_transform(X_train)  # learn vocabulary and idf from the training data only
X_test_tfidf = vectorizer.transform(X_test)        # reuse that vocabulary/idf to build test TF-IDF features

# Train a logistic regression classifier on the TF-IDF features
model = LogisticRegression()
model.fit(X_train_tfidf, y_train)

# Predictions on the held-out messages
y_pred = model.predict(X_test_tfidf)

# Print per-class precision/recall/F1 and the confusion matrix
# (rows = true class, columns = predicted class; 0 = ham, 1 = spam)
print(classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97       965
           1       0.97      0.67      0.79       150

    accuracy                           0.95      1115
   macro avg       0.96      0.83      0.88      1115
weighted avg       0.95      0.95      0.95      1115

Confusion Matrix:
 [[962   3]
 [ 50 100]]
