# Data Loading and Exploration/Visualization

In [None]:
import pandas as pd

# Load the dataset from a hard-coded absolute local path, decoding it as latin-1.
# The 'v1' and 'v2' columns of this frame are read by the cells below.
data = pd.read_csv("/Users/mac/Desktop/Spam_Detection/spam.csv", encoding="latin-1")  

In [None]:
print(data.head(n=5))  # Preview the first five rows (5 is head()'s default)


In [None]:
# info() writes its report (dtypes, non-null counts) to stdout itself and
# returns None, so wrapping it in print() only appended a stray "None" line.
data.info()

In [None]:
print(data.columns.tolist())  # Column labels as a plain Python list

In [None]:
# Plotting is done through matplotlib's pyplot interface
import matplotlib.pyplot as plt

# Bar colors, in the same order as the plotted series: spam first, then ham
colors = ["red", "green"]

# Numeric code per label; any label missing from this map raises KeyError
label_map = {"spam": 0, "ham": 1}

# Collect each message's character count under the series for its label
spam_lengths = []
ham_lengths = []
for label, message in zip(data['v1'], data['v2']):
  size = len(message)
  bucket = spam_lengths if label_map[label] == 0 else ham_lengths
  bucket.append(size)

# Stacked histogram of both series over 20 shared bins
plt.hist(
    [spam_lengths, ham_lengths],
    bins=20,
    stacked=True,
    label=['Spam', 'Ham'],
    color=colors,
)
plt.xlabel('Message Length')
plt.ylabel('Number of Messages')
plt.title('Message Length Distribution (Spam vs. Ham)')
plt.legend()  # The legend ties each color to its class
plt.show()


In [None]:
from collections import Counter

# Bring the NLTK stop-word corpus into scope in this cell: `stopwords` is
# otherwise only imported by a later cell, so running the notebook top to
# bottom failed here with a NameError.
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords

# English stop words, held in a set for fast membership checks
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Lowercase a message and return its non-stop-word tokens.

    Args:
        text (str): Raw message text.

    Returns:
        list: Whitespace-separated tokens of the lowercased text, in their
        original order, with every token found in ``stop_words`` removed.
    """
    return [word for word in text.lower().split() if word not in stop_words]

# Route each preprocessed message into the spam list when its label is
# exactly "spam"; every other label goes to the ham list
spam_messages = []
ham_messages = []
for label, message in zip(data['v1'], data['v2']):
  target = spam_messages if label == "spam" else ham_messages
  target.append(preprocess_text(message))

# Get most frequent words (without repetition)
def get_top_words(messages, n_words):
    """Return the most frequent words across all messages.

    Args:
        messages: Iterable of word lists, one list per message.
        n_words (int): Maximum number of words to return.

    Returns:
        list: Up to ``n_words`` distinct words, most frequent first.
    """
    # Count message by message. Flattening with sum(messages, []) copies the
    # growing list on every addition, which is quadratic in the total number
    # of words.
    word_counts = Counter()
    for words in messages:
        word_counts.update(words)
    return [word for word, _ in word_counts.most_common(n_words)]

# Ten most frequent words for each class
spam_top_words = get_top_words(spam_messages, 10)
ham_top_words = get_top_words(ham_messages, 10)

# Show each list under its heading, one word per line
print("Top 10 Words in Spam Messages:")
print("\n".join(spam_top_words))

print("\nTop 10 Words in Ham Messages:")
print("\n".join(ham_top_words))


In [None]:
from wordcloud import WordCloud
import nltk
nltk.download('stopwords')

from nltk.corpus import stopwords

In [None]:


# English stop words from NLTK, held in a set for fast membership checks
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Lowercase text, drop stop words, and return the rest as one string.

    Tokens are produced by whitespace splitting and re-joined with single
    spaces, in their original order.
    """
    kept = (token for token in text.lower().split() if token not in stop_words)
    return " ".join(kept)

# Pair every label with its message once, then build the two lists:
# label "spam" goes to spam_messages, any other label to ham_messages
labelled = list(zip(data['v1'], data['v2']))
spam_messages = [preprocess_text(msg) for tag, msg in labelled if tag == "spam"]
ham_messages = [preprocess_text(msg) for tag, msg in labelled if not (tag == "spam")]

# Merge each class's preprocessed messages into one space-separated string
spam_text = " ".join(spam_messages)
ham_text = " ".join(ham_messages)

# One 400x300 word cloud per class (spam is generated first, then ham)
spam_wordcloud = WordCloud(width=400, height=300).generate(spam_text)
ham_wordcloud = WordCloud(width=400, height=300).generate(ham_text)

# Draw both clouds side by side: spam in the left panel, ham in the right
plt.figure(figsize=(8, 6))
panels = (
    (121, spam_wordcloud, "Spam Messages"),
    (122, ham_wordcloud, "Ham Messages"),
)
for position, cloud, heading in panels:
  plt.subplot(position)
  plt.imshow(cloud, interpolation='bilinear')
  plt.axis('off')  # Axes carry no meaning for an image
  plt.title(heading)

plt.suptitle("Word Clouds for Spam vs. Ham Messages")
plt.tight_layout()
plt.show()


# Feature Engineering

In [None]:
import re

# Compiled once instead of on every call. Case-insensitively matches an
# http://, https://, ftp:// or www. prefix followed by at least two
# non-space characters, the last of which is not one of . , > ) ]
_URL_PATTERN = re.compile(r"(?i)\b((https?://|ftp://|www\.)\S+[^\s.,>)\]])")


def has_valid_url(text):
    """Report whether text contains something shaped like a URL.

    This is a pattern check only; the URL is not resolved or validated
    beyond its shape.

    Args:
        text: Message text. Non-string values (for example NaN from a
            missing cell) are treated as containing no URL.

    Returns:
        bool: True if the URL pattern matches anywhere in ``text``.
    """
    if not isinstance(text, str):
        return False
    # search() stops at the first match; findall() collected every match
    # only for the result to be reduced to a boolean.
    return _URL_PATTERN.search(text) is not None

# Flag every message in 'v2': 1 when has_valid_url matches, otherwise 0
data['has_url'] = data['v2'].apply(has_valid_url).astype(int)

print(data.head(5))  # Show the first rows, now including the new column


In [None]:
# Boolean Series: True for rows whose has_url flag equals 1
print(data['has_url'].eq(1))

# Choosing the model/Training/Evaluation

In [None]:
import pandas as pd
def drop_columns_by_index(df, indexes_to_drop):
    """Drop columns from a DataFrame by their integer positions.

    The DataFrame is modified in place and also returned, so both
    ``drop_columns_by_index(df, ...)`` and ``df = drop_columns_by_index(df, ...)``
    leave ``df`` without the dropped columns.

    Args:
        df (pandas.DataFrame): The DataFrame to modify.
        indexes_to_drop (list): Zero-based integer positions of the columns
            to drop. Negative positions are not accepted.

    Returns:
        pandas.DataFrame: The same ``df`` object, with the columns removed.

    Raises:
        ValueError: If ``indexes_to_drop`` is not a list, contains a
            non-integer element, or contains a position outside
            ``0 .. len(df.columns) - 1``.
    """
    import numbers  # Local import so this cell needs nothing beyond pandas

    if not isinstance(indexes_to_drop, list):
        raise ValueError("indexes_to_drop must be a list of integers")

    # Check the element types as well, so that bad input surfaces as the
    # ValueError that callers catch rather than a TypeError or IndexError
    # from the comparisons and lookups below. bool is a subclass of int and
    # is rejected explicitly.
    if not all(
        isinstance(i, numbers.Integral) and not isinstance(i, bool)
        for i in indexes_to_drop
    ):
        raise ValueError("indexes_to_drop must be a list of integers")

    n_columns = len(df.columns)
    if any(i < 0 or i >= n_columns for i in indexes_to_drop):
        raise ValueError("Indexes to drop are out of range")

    # Resolve every position to its label before dropping anything, so the
    # positions all refer to the original column layout.
    labels = [df.columns[i] for i in indexes_to_drop]
    df.drop(columns=labels, inplace=True)

    return df



In [None]:

df = pd.DataFrame(data)
# Column positions 2, 3 and 4 (the original author notes these are the
# "Unnamed" columns)
indexes_to_drop = list(range(2, 5))

# Report a validation failure from the helper instead of stopping the cell
try:
  df = drop_columns_by_index(df, indexes_to_drop)
  print(df.head())
except ValueError as err:
  print(err)

In [None]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Features are the message texts ('v2'); targets are the spam/ham labels ('v1')
X, y = data['v2'], data['v1']
# Hold out 20% of the rows for testing; the fixed seed makes the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:


# Sanity-check the training split.
# X_train is a slice of the data['v2'] Series whose index labels were shuffled
# by the split, so X_train[0] is a *label* lookup and raises KeyError whenever
# row 0 ended up in the test set. Use positional access instead.
print(type(X_train))  # Container type returned by train_test_split
print(type(X_train.iloc[0]))  # Should be str (string)
print(X_train.iloc[:2])  # Check the content of the first two elements





In [None]:

# Basic text preprocessing used before vectorization
def preprocess_text(text):
  """
  Lowercase the text, split it on whitespace, and drop every token that is
  exactly one punctuation character. The remaining tokens are returned
  joined by single spaces.

  Punctuation attached to a word ("hi!") and multi-character punctuation
  tokens ("...") are left as they are.
  """
  import string

  # A set of the individual punctuation characters: membership means the
  # token IS one of those characters, not that it merely contains one
  punctuation_marks = frozenset(string.punctuation)

  kept = []
  for token in text.lower().split():
    if token in punctuation_marks:
      continue
    kept.append(token)

  return " ".join(kept)


# Run preprocess_text over every message of both splits, keeping their order
X_train_preprocessed = list(map(preprocess_text, X_train))
X_test_preprocessed = list(map(preprocess_text, X_test))






In [None]:
# Container type on the first line, the first two preprocessed messages on the second
print(type(X_train_preprocessed), X_train_preprocessed[:2], sep="\n")


In [None]:
from sklearn.preprocessing import LabelEncoder

# Turn the string labels into integers; the encoder is fitted on the
# training labels only and then reused for the test labels
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train)
y_test_encoded = le.transform(y_test)

# TF-IDF features capped at 1000 terms; the vocabulary is learned from the
# training messages and then applied unchanged to the test messages
vectorizer = TfidfVectorizer(max_features=1000)
X_train_features = vectorizer.fit_transform(X_train_preprocessed)
X_test_features = vectorizer.transform(X_test_preprocessed)

# Fit a multinomial Naive Bayes classifier on the encoded training labels
model = MultinomialNB().fit(X_train_features, y_train_encoded)

# Predict encoded labels for the held-out messages
y_pred = model.predict(X_test_features)


In [None]:
# Score the predictions against the encoded test labels
accuracy = accuracy_score(y_true=y_test_encoded, y_pred=y_pred)
precision = precision_score(y_true=y_test_encoded, y_pred=y_pred)
recall = recall_score(y_true=y_test_encoded, y_pred=y_pred)
f1 = f1_score(y_true=y_test_encoded, y_pred=y_pred)

# Report every metric on its own line
for metric_name, metric_value in (
    ("Accuracy", accuracy),
    ("Precision", precision),
    ("Recall", recall),
    ("F1-score", f1),
):
  print(f"{metric_name}:", metric_value)



# Thanks for watching!