# Natural Language Processing (NLP) training to identify scam messages

## Initial setup: importing Python libraries and defining helper functions

In [None]:
import pandas as pd
import numpy as np
import string
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from collections import Counter
from sklearn.metrics import classification_report,confusion_matrix
from sklearn.model_selection import GridSearchCV
%matplotlib inline

# Lazily filled caches so the per-message analyzer does not rebuild these
# lookups (or re-read the NLTK corpus) on every call.
_PUNCTUATION_CHARS = frozenset(string.punctuation)
_STOP_WORDS_CACHE = {}


def _english_stop_words():
    """Return the English NLTK stop words as a frozenset, loading them on first use."""
    if 'english' not in _STOP_WORDS_CACHE:
        _STOP_WORDS_CACHE['english'] = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE['english']


def transform_message(message):
    """Turn a raw message into a list of lowercase, stop-word-free tokens.

    Args:
        message: The message text to clean.

    Returns:
        list[str]: The whitespace-separated tokens of ``message`` after every
        ``string.punctuation`` character has been removed, lowercased, with
        English NLTK stop words dropped. Token order is preserved.
    """
    # Remove punctuation
    message_not_punc = ''.join(
        char for char in message if char not in _PUNCTUATION_CHARS
    )

    # NOTE(review): punctuation is stripped before the stop-word check, so any
    # stop word that contains an apostrophe in the NLTK list would no longer
    # match its stripped form here -- confirm whether that is intended.
    stop_words = _english_stop_words()

    # Tokenize on whitespace, lowercase once, and drop stop words
    lowered_tokens = (word.lower() for word in message_not_punc.split())
    message_clean = [word for word in lowered_tokens if word not in stop_words]

    return message_clean

## Analysis of the UCI Dataset of Spam Messages

In [None]:
# Load the tab-separated corpus; the file has no header row, so the two
# column names are supplied explicitly.
data = pd.read_csv(
    'Datasets/UCI_SPAM',
    sep='\t',
    header=None,
    names=["label", "message"],
)

# Trim leading/trailing whitespace from every message
data = data.assign(message=data['message'].str.strip())

# Preview the first five rows, then show summary statistics as the cell output
print(data.head(5))
data.describe()

In [None]:
# Store len() of each message in a new 'length' column
data["length"] = data["message"].map(len)

# Ten longest messages (cell output)
data.sort_values('length', ascending=False).head(10)

In [None]:
# Ten shortest messages: the last rows of the descending-length ordering
data.sort_values('length', ascending=False).iloc[-10:]

### Message Length Distribution by Label

In [None]:
def _tick_increment(span):
    """Return the x-axis tick spacing to use for an axis covering ``span`` units."""
    # Test the widest range first: checking "> 500" before "> 1000" would make
    # the 200-unit spacing unreachable.
    if span > 1000:
        return 200
    if span > 500:
        return 100
    return 50


# One histogram of message length per label value
data.hist(column='length', by='label', figsize=(12, 4), bins=10)

plt.suptitle("Message Length Distribution by Label")
plt.subplots_adjust(hspace=0.4)

# Give every subplot evenly spaced, rotated x ticks sized to its own range
for ax in plt.gcf().axes:
    x_min, x_max = ax.get_xlim()
    x_ticks = np.arange(0, x_max + 1, _tick_increment(x_max - x_min))
    ax.set_xticks(x_ticks)
    ax.tick_params(axis='x', rotation=45)

plt.show()

## Vectorization of the spam messages

In [None]:
# Bag-of-words vectorizer that delegates tokenization to transform_message
vectorization = CountVectorizer(analyzer=transform_message)

# Learn the vocabulary and build the count matrix in one pass
X = vectorization.fit_transform(data.loc[:, 'message'])

# Report the dimensions of the count matrix
print(X.shape)

In [None]:
# Fit the TF-IDF weighting on the count matrix 'X', then re-weight that same matrix
tfidf_transformer = TfidfTransformer().fit(X)
X_tfidf = tfidf_transformer.transform(X)

# Report the dimensions of the TF-IDF matrix
print("TF-IDF Matrix Shape:", X_tfidf.shape)

## Using the Support Vector Classification (SVC) algorithm to train the NLP model

In [None]:
# Hold out 30% of the rows for testing; the fixed seed makes the split repeatable.
# Features are the TF-IDF matrix, the target is the 'label' column (ham or spam).
# NOTE(review): X_tfidf was fitted on all rows before this split, so the test
# rows contributed to the vocabulary and weighting -- confirm this is acceptable.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, data['label'], test_size=0.30, random_state=50
)

# Fit a linear-kernel Support Vector Classifier on the training portion
clf = SVC(kernel='linear').fit(X_train, y_train)

# Predict labels for the held-out rows
predictions = clf.predict(X_test)

# Show per-class metrics followed by the confusion matrix
print("Classification Report:\n", classification_report(y_test, predictions))
print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))