In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import pandas as pd
import re
from nltk.tokenize import word_tokenize
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV

In [2]:
# Load the dataset
data = pd.read_csv('spam.csv', encoding='latin-1')

In [3]:
labels = data['v1']

In [4]:
# Step 1: Text Cleaning
def clean_text(text):
    # Remove special characters and numbers
    cleaned_text = re.sub(r'[^a-zA-Z\s]', '', text)
    return cleaned_text

In [5]:
data['cleaned_text'] = data['v2'].apply(clean_text)

# Display the first few rows of the cleaned dataset
print(data.head())

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4                                       cleaned_text  
0        NaN        NaN  Go until jurong point crazy Available only in ...  
1        NaN        NaN                            Ok lar Joking wif u oni  
2        NaN        NaN  Free entry in  a wkly comp to win FA Cup final...  
3        NaN        NaN        U dun say so early hor U c already then say  
4        NaN        NaN  Nah I dont think he goes to usf he lives aroun...  


In [6]:
# Step 1: Text Clearning (Continued)
def lowercase_text(text):
    # Convert text to lowercase
    lowercase_text = text.lower()
    return lowercase_text

In [7]:
data['lowercase_text'] = data['cleaned_text'].apply(lowercase_text)

# Display the first few rows of the dataset with lowercase text
print(data.head())

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4                                       cleaned_text  \
0        NaN        NaN  Go until jurong point crazy Available only in ...   
1        NaN        NaN                            Ok lar Joking wif u oni   
2        NaN        NaN  Free entry in  a wkly comp to win FA Cup final...   
3        NaN        NaN        U dun say so early hor U c already then say   
4        NaN        NaN  Nah I dont think he goes to usf he lives aroun...   

                                      lowercase_text  
0  go until jurong point crazy availa

In [8]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Azlaan\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Azlaan\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Azlaan\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [9]:
# Step 1: Text Cleaning (Continued)
def tokenize_text(text):
    # Tokenize the text
    tokens = word_tokenize(text)
    return tokens

In [10]:
data['tokenized_text'] = data['lowercase_text'].apply(tokenize_text)

# Display the first few rows of the dataset with tokenized text
print(data.head())

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4                                       cleaned_text  \
0        NaN        NaN  Go until jurong point crazy Available only in ...   
1        NaN        NaN                            Ok lar Joking wif u oni   
2        NaN        NaN  Free entry in  a wkly comp to win FA Cup final...   
3        NaN        NaN        U dun say so early hor U c already then say   
4        NaN        NaN  Nah I dont think he goes to usf he lives aroun...   

                                      lowercase_text  \
0  go until jurong point crazy avail

In [11]:
# Step 1: Text Cleaning (Continued)
def remove_stopwords(tokens):
    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    filtered_tokens = [token for token in tokens if token not in stop_words]
    return filtered_tokens

In [12]:
data['filtered_text'] = data['tokenized_text'].apply(remove_stopwords)

# Display the first few rows of the dataset with filtered text
print(data.head())

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4                                       cleaned_text  \
0        NaN        NaN  Go until jurong point crazy Available only in ...   
1        NaN        NaN                            Ok lar Joking wif u oni   
2        NaN        NaN  Free entry in  a wkly comp to win FA Cup final...   
3        NaN        NaN        U dun say so early hor U c already then say   
4        NaN        NaN  Nah I dont think he goes to usf he lives aroun...   

                                      lowercase_text  \
0  go until jurong point crazy avail

In [13]:
# Step 1: Text Cleaning (Continued)
def apply_stemming(tokens):
    # Apply stemming
    stemmer = PorterStemmer()
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    return stemmed_tokens

def apply_lemmatization(tokens):
    # Apply lemmatization
    lemmatizer = WordNetLemmatizer()
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
    return lemmatized_tokens


In [14]:
# Choose either stemming or lemmatization
data['processed_text'] = data['filtered_text'].apply(apply_stemming)
# or
# data['processed_text'] = data['filtered_text'].apply(apply_lemmatization)

In [15]:
# Display the first few rows of the dataset with processed text
print(data.head())

     v1                                                 v2 Unnamed: 2  \
0   ham  Go until jurong point, crazy.. Available only ...        NaN   
1   ham                      Ok lar... Joking wif u oni...        NaN   
2  spam  Free entry in 2 a wkly comp to win FA Cup fina...        NaN   
3   ham  U dun say so early hor... U c already then say...        NaN   
4   ham  Nah I don't think he goes to usf, he lives aro...        NaN   

  Unnamed: 3 Unnamed: 4                                       cleaned_text  \
0        NaN        NaN  Go until jurong point crazy Available only in ...   
1        NaN        NaN                            Ok lar Joking wif u oni   
2        NaN        NaN  Free entry in  a wkly comp to win FA Cup final...   
3        NaN        NaN        U dun say so early hor U c already then say   
4        NaN        NaN  Nah I dont think he goes to usf he lives aroun...   

                                      lowercase_text  \
0  go until jurong point crazy avail

In [16]:
# Step 2: Feature Extraction (TF-IDF)
corpus = [' '.join(tokens) for tokens in data['processed_text']]
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(corpus)

In [17]:
# Display the shape of the TF-IDF matrix
print("TF-IDF Matrix Shape:", X.shape)

TF-IDF Matrix Shape: (5572, 7050)


In [18]:
# Assuming you have 'X' as your TF-IDF matrix and 'labels' as corresponding labels
X_train, X_test, y_train, y_test = train_test_split(X, labels, test_size=0.2, random_state=42)

In [19]:
# Step 3: Model Training (Naive Bayes)
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train, y_train)

In [20]:
# Step 4: Model Evaluation
y_pred = nb_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9659192825112107


In [21]:
# Hyperparameter tuning for Naive Bayes
param_grid_nb = {'alpha': [0.1, 1.0, 10.0]}
grid_search_nb = GridSearchCV(MultinomialNB(), param_grid_nb, cv=3)
grid_search_nb.fit(X_train, y_train)
best_nb_classifier = grid_search_nb.best_estimator_

# Deploy the best Naive Bayes model
new_sms_nb = ["Get a free gift now!", "Hi, how are you?"]
new_sms_corpus_nb = [' '.join(tokenize_text(clean_text(sms.lower()))) for sms in new_sms_nb]
new_sms_tfidf_nb = tfidf_vectorizer.transform(new_sms_corpus_nb)
predicted_labels_nb = best_nb_classifier.predict(new_sms_tfidf_nb)

for sms, label in zip(new_sms_nb, predicted_labels_nb):
    print(f"SMS: {sms} - Predicted Label: {label}")

SMS: Get a free gift now! - Predicted Label: spam
SMS: Hi, how are you? - Predicted Label: ham


In [22]:
# Display classification report and confusion matrix
print("\nClassification Report:\n", classification_report(y_test, y_pred))
print("\nConfusion Matrix:\n", confusion_matrix(y_test, y_pred))


Classification Report:
               precision    recall  f1-score   support

         ham       0.96      1.00      0.98       965
        spam       1.00      0.75      0.85       150

    accuracy                           0.97      1115
   macro avg       0.98      0.87      0.92      1115
weighted avg       0.97      0.97      0.96      1115


Confusion Matrix:
 [[965   0]
 [ 38 112]]


In [23]:
# Step 3: Model Training (Logistic Regression)
logreg_classifier = LogisticRegression(max_iter=1000)  # Increase max_iter if necessary
logreg_classifier.fit(X_train, y_train)

In [24]:
# Step 4: Model Evaluation
y_pred_logreg = logreg_classifier.predict(X_test)
accuracy_logreg = accuracy_score(y_test, y_pred_logreg)
print("Logistic Regression Accuracy:", accuracy_logreg)

Logistic Regression Accuracy: 0.9488789237668162


In [25]:
# Hyperparameter tuning for Logistic Regression
param_grid_logreg = {'C': [0.1, 1.0, 10.0]}
grid_search_logreg = GridSearchCV(LogisticRegression(max_iter=1000), param_grid_logreg, cv=3)
grid_search_logreg.fit(X_train, y_train)
best_logreg_classifier = grid_search_logreg.best_estimator_

# Deploy the best Logistic Regression model
new_sms_logreg = ["Get a free gift now!", "Hi, how are you?"]
new_sms_corpus_logreg = [' '.join(tokenize_text(clean_text(sms.lower()))) for sms in new_sms_logreg]
new_sms_tfidf_logreg = tfidf_vectorizer.transform(new_sms_corpus_logreg)
predicted_labels_logreg = best_logreg_classifier.predict(new_sms_tfidf_logreg)

for sms, label in zip(new_sms_logreg, predicted_labels_logreg):
    print(f"SMS: {sms} - Predicted Label: {label}")

SMS: Get a free gift now! - Predicted Label: spam
SMS: Hi, how are you? - Predicted Label: ham


In [26]:
# Display classification report and confusion matrix for Logistic Regression
print("\nLogistic Regression Classification Report:\n", classification_report(y_test, y_pred_logreg))
print("\nLogistic Regression Confusion Matrix:\n", confusion_matrix(y_test, y_pred_logreg))


Logistic Regression Classification Report:
               precision    recall  f1-score   support

         ham       0.95      1.00      0.97       965
        spam       0.96      0.65      0.77       150

    accuracy                           0.95      1115
   macro avg       0.95      0.82      0.87      1115
weighted avg       0.95      0.95      0.94      1115


Logistic Regression Confusion Matrix:
 [[961   4]
 [ 53  97]]


In [27]:
# Step 3: Model Training (Support Vector Machines - SVM)
svm_classifier = SVC()
svm_classifier.fit(X_train, y_train)

In [28]:
# Step 4: Model Evaluation
y_pred_svm = svm_classifier.predict(X_test)
accuracy_svm = accuracy_score(y_test, y_pred_svm)
print("SVM Accuracy:", accuracy_svm)

SVM Accuracy: 0.9721973094170404


In [29]:
# Hyperparameter tuning for SVM
param_grid = {'C': [0.1, 1, 10],
              'kernel': ['linear', 'rbf', 'sigmoid']}
grid_search = GridSearchCV(SVC(), param_grid, cv=3)
grid_search.fit(X_train, y_train)
best_svm_classifier = grid_search.best_estimator_

# Deploy the best SVM model
new_sms = ["Get a free gift now!", "Hi, how are you?"]
new_sms_corpus = [' '.join(tokenize_text(clean_text(sms.lower()))) for sms in new_sms]
new_sms_tfidf = tfidf_vectorizer.transform(new_sms_corpus)
predicted_labels = best_svm_classifier.predict(new_sms_tfidf)

for sms, label in zip(new_sms, predicted_labels):
    print(f"SMS: {sms} - Predicted Label: {label}")

SMS: Get a free gift now! - Predicted Label: ham
SMS: Hi, how are you? - Predicted Label: ham


In [30]:
# Display classification report and confusion matrix for SVM
print("\nSVM Classification Report:\n", classification_report(y_test, y_pred_svm))
print("\nSVM Confusion Matrix:\n", confusion_matrix(y_test, y_pred_svm))


SVM Classification Report:
               precision    recall  f1-score   support

         ham       0.97      1.00      0.98       965
        spam       0.98      0.81      0.89       150

    accuracy                           0.97      1115
   macro avg       0.98      0.90      0.94      1115
weighted avg       0.97      0.97      0.97      1115


SVM Confusion Matrix:
 [[963   2]
 [ 29 121]]
