# Spam Mail Prediction

### Importing Libraries for Text Data Preprocessing and Model Building

This code snippet imports necessary libraries for both text data preprocessing and model building tasks.

In [None]:
import numpy as np
import pandas as pd

# Code for text data preprocessing
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer

# Code for model building
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

###  Downloading NLTK Stopwords

This code snippet downloads the NLTK stopwords corpus necessary for text preprocessing tasks.

In [None]:
import nltk

# Fetch the 'stopwords' corpus used by the stopwords.words(...) calls
# elsewhere in this notebook.
nltk.download("stopwords")

In [None]:
# printing the stopwords in English
print(stopwords.words('english'))

### Loading Spam SMS Dataset

This code snippet loads the Spam SMS dataset from the specified file path using pd.read_csv().

In [None]:
# Load the raw SMS spam CSV. The encoding is passed explicitly instead of
# relying on the pandas default.
spam_data = pd.read_csv(
    '/kaggle/input/sms-spam-collection-dataset/spam.csv',
    encoding='ISO-8859-1',
)

In [None]:
spam_data

### Data Cleaning and Column Renaming

This code snippet performs data cleaning and column renaming on the loaded Spam SMS dataset.

In [None]:
# Discard the three unnamed columns and keep everything else from the raw frame.
unused_columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
df = spam_data.drop(unused_columns, axis=1)

In [None]:
df

In [None]:
# Rename the remaining columns by position: first the label, then the text.
column_names = ['Category', 'Message']
df.columns = column_names

In [None]:
df

### Exploring DataFrame df: Size, Information, Features, Missing Values, and Summary Statistics

This section prints the size of the cleaned Spam SMS dataset and its column details, lists the numerical and categorical features, reports missing values, and shows summary statistics for the DataFrame.

In [None]:
# Overview of the frame: its shape, then the per-column report from df.info(),
# with a horizontal rule after each part.
separator = '-' * 100

print('The size of Dataframe is: ', df.shape)
print(separator)
print('The Column Name, Record Count and Data Types are as follows: ')
df.info()
print(separator)

In [None]:
# Split the columns by dtype in a single pass: object ('O') columns are
# treated as categorical, every other dtype as numerical.
numeric_features = []
categorical_features = []
for feature, dtype in df.dtypes.items():
    if dtype == 'O':
        categorical_features.append(feature)
    else:
        numeric_features.append(feature)

# Report how many columns fell into each group, and which ones.
print('We have {} numerical features : {}'.format(len(numeric_features), numeric_features))
print('\nWe have {} categorical features : {}'.format(len(categorical_features), categorical_features))

In [None]:
print('Missing Value Presence in different columns of DataFrame are as follows : ')
print('-'*100)

# Boolean frame marking the missing cells; computed once and reused below.
null_mask = df.isnull()

# Per-column count of missing cells and the same figure as a percentage of
# the number of rows, both ordered from most to least missing.
total = null_mask.sum().sort_values(ascending=False)
percent = (null_mask.sum() / null_mask.count() * 100).sort_values(ascending=False)

# Side-by-side table; as the last expression of the cell it is displayed.
pd.concat([total, percent], axis=1, keys=['Total', 'Percent'])

In [None]:
# describe(include='object') summarises only the object-dtype (text) columns,
# so the heading names categorical features rather than numerical ones.
print('Summary Statistics of categorical features for DataFrame are as follows:')
print('-'*100)
df.describe(include='object')

### Data Cleaning and Label Encoding

This code snippet cleans the DataFrame df by replacing missing values with empty strings, and then label-encodes the Category column (spam → 0, ham → 1).

In [None]:
# Keep every value that is present and put an empty string wherever one is missing.
present = pd.notnull(df)
df = df.where(present, '')

In [None]:
df

In [None]:
# Encode the labels as integers in a single pass: spam -> 0, ham -> 1
# (so ham is the class labelled 1). Mapping the whole column at once avoids
# writing integers one label at a time into a column of strings, which leaves
# the column in a mixed-type state between the two assignments.
# Any value other than 'spam' / 'ham' becomes NaN.
label_codes = {'spam': 0, 'ham': 1}
df['Category'] = df['Category'].map(label_codes)

In [None]:
df

In [None]:
df['Category'].value_counts()

### Stemming Function for Text Preprocessing

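This function prepares a message for modelling: it replaces every non-letter character with a space, lowercases the text, removes English stopwords, and reduces each remaining word to its Porter stem.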

In [None]:
# Single stemmer instance, used by stemming() for every word it processes.
porter_stemmer = PorterStemmer()

In [None]:
# Built once instead of calling stopwords.words('english') for every single
# word; a set also makes each membership test a hash lookup rather than a
# scan through a list.
_ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))

# Matches any character that is not an ASCII letter.
_NON_LETTERS = re.compile('[^a-zA-Z]')


def stemming(content):
    """Return *content* cleaned and stemmed as one space-separated string.

    Steps, in order:
      1. replace every non-letter character with a space,
      2. lowercase the text and split it on whitespace,
      3. drop English stopwords,
      4. stem each remaining word with the shared ``porter_stemmer``.

    Args:
        content: The raw message text (a ``str``).

    Returns:
        The stemmed words joined by single spaces; an empty string when no
        word survives the filtering.
    """
    words = _NON_LETTERS.sub(' ', content).lower().split()
    return ' '.join(
        porter_stemmer.stem(word)
        for word in words
        if word not in _ENGLISH_STOPWORDS
    )

In [None]:
# Replace each raw message with its cleaned, stemmed form.
df['Message'] = df['Message'].map(stemming)

In [None]:
df['Message']

In [None]:
# Separate the model input (message text) from the target (encoded category).
X, y = df['Message'], df['Category']

In [None]:
X

In [None]:
y

In [None]:
# Cast the target labels in y to an integer dtype.
y = y.astype(int)

### TF-IDF Vectorization

This code snippet performs TF-IDF vectorization on the feature matrix X using TfidfVectorizer from sklearn.feature_extraction.text.

In [None]:
# TF-IDF vectorizer created with scikit-learn's default settings (no arguments).
vectorizer = TfidfVectorizer()

In [None]:
# Learn the vocabulary and IDF weights from the messages and vectorize them in
# one step; X now holds the TF-IDF matrix instead of the text.
# NOTE(review): this runs on all messages, before the train/test split, so the
# test messages also contribute to the fitted vocabulary and IDF weights.
X = vectorizer.fit_transform(X)

In [None]:
X

In [None]:
print(X)

### Splitting Data into Training and Testing Sets

This code snippet splits the data into training and testing sets using train_test_split from sklearn.model_selection.

In [None]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing. The split is stratified on y and uses
# a fixed seed so it is the same on every run.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=45,
)

In [None]:
print(X.shape, X_train.shape, X_test.shape)

In [None]:
print(y.shape, y_train.shape, y_test.shape)

### Evaluating Multiple Machine Learning Models

This code snippet evaluates various machine learning models on the training and testing data, storing their performance metrics.

In [None]:
# Classifier classes to compare. Each one is instantiated with library
# defaults apart from a fixed seed.
models = [LogisticRegression, SVC, DecisionTreeClassifier, RandomForestClassifier]

# One score per model, appended in the same order as `models`.
accuracy_scores = []
precision_scores = []
recall_scores = []
f1_scores = []

# Same seed as the train/test split. Without it the randomised estimators
# (decision tree, random forest) can produce different scores on every run,
# which makes the comparison between models unrepeatable.
RANDOM_STATE = 45

for model in models:
    # Train on the training part, then predict the held-out part.
    classifier = model(random_state=RANDOM_STATE).fit(X_train, y_train)
    y_pred = classifier.predict(X_test)

    accuracy_scores.append(accuracy_score(y_test, y_pred))
    precision_scores.append(precision_score(y_test, y_pred))
    recall_scores.append(recall_score(y_test, y_pred))
    f1_scores.append(f1_score(y_test, y_pred))

In [None]:
# Collect the per-model scores into one table indexed by model name.
# The names are listed in the same order the models were evaluated.
model_names = ["Logistic Regression", "SVM", "Decision Tree", "Random Forest"]

classification_metrics_df = pd.DataFrame(
    {
        "Model": model_names,
        "Accuracy": accuracy_scores,
        "Precision": precision_scores,
        "Recall": recall_scores,
        "F1 Score": f1_scores,
    }
).set_index('Model')

# Last expression of the cell, so the table is displayed.
classification_metrics_df

## Inference

**Best Performing Model:** Random Forest stands out as the top-performing model with perfect recall and the highest F1 score, indicating its robustness in identifying positive instances (here the ham class, which is encoded as 1) without false negatives.

**Consistency and Effectiveness:** SVM also demonstrates strong performance across all metrics, making it a reliable alternative to Random Forest, especially if interpretability is a consideration.

**Decision Tree and Logistic Regression:** While both models perform well, they exhibit slightly lower metrics compared to SVM and Random Forest, particularly in recall and overall F1 score.