# SMS Spam Detection

In [None]:
import numpy as np
import pandas as pd

In [None]:
df = pd.read_csv('spam.csv',encoding='ISO-8859-1')

In [None]:
df.sample(5)

In [None]:
df.shape

# Data Cleaning

In [None]:
df.info()

In [None]:
# drop last 3 columns
df.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'],inplace=True)

In [None]:
# renaming column names
df.rename(columns={'v1':'target','v2':'text'},inplace=True)
df.sample(5)

In [None]:
# encoding target column
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

In [None]:
df['target'] = encoder.fit_transform(df['target'])

In [None]:
# check missing values
df.isnull().sum()

In [None]:
# count the duplicate rows (they are dropped in the next cell)
df.duplicated().sum()

In [None]:
df = df.drop_duplicates(keep='first')

In [None]:
df.duplicated().sum()

In [None]:
df.shape

# EDA

In [None]:
df.head()

In [None]:
# imbalanced dataset
df['target'].value_counts()

In [None]:
import matplotlib.pyplot as plt
# Class balance as a pie chart.
# Each wedge is labelled from the encoded class it actually represents rather
# than from a hard-coded list that only matches while value_counts() happens
# to return ham first. encoder.classes_ maps code -> original label.
target_counts = df['target'].value_counts()
wedge_labels = [encoder.classes_[code] for code in target_counts.index]
plt.pie(target_counts, labels=wedge_labels, autopct='%0.2f')
plt.show()

In [None]:
import nltk

In [None]:
# !pip install nltk
nltk.download('punkt')

In [None]:
# Adding columns for number of characters, words and sentences
# number of characters in text
df['num_characters'] = df['text'].apply(len)

In [None]:
# number of words in text
df['num_words'] = df['text'].apply(lambda x:len(nltk.word_tokenize(x)))

In [None]:
# number of sentences in text
df['num_sentences'] = df['text'].apply(lambda x:len(nltk.sent_tokenize(x)))

In [None]:
df.head()

In [None]:
# Analysing the number of characters, words and sentences (all messages)
df[['num_characters','num_words','num_sentences']].describe()

In [None]:
# Analysing the number of characters, words and sentences
## for HAM (rows where target == 0)
df[df['target'] == 0][['num_characters','num_words','num_sentences']].describe()

In [None]:
# Analysing the number of characters, words and sentences
## for SPAM (rows where target == 1)
df[df['target'] == 1][['num_characters','num_words','num_sentences']].describe()

In [None]:
# Analysed that the number of characters for spam is greater than ham
import seaborn as sns

In [None]:
sns.histplot(df[df['target']==0]['num_characters'])
sns.histplot(df[df['target']==1]['num_characters'],color='red')

In [None]:
# to analyse the relationship b/w characters, words, sentences 
# can notice the presence of outliers
sns.pairplot(df,hue='target')

In [None]:
# Plot a heatmap of the pairwise correlations.
# It shows multicollinearity between num_characters, num_words and num_sentences;
# as num_characters is the most correlated with target (0.38), the other two
# are not used as features.
# Only numeric columns are passed to corr(): df still contains the raw 'text'
# strings, which cannot be converted to float (pandas >= 2.0 raises on them
# instead of silently skipping them).
sns.heatmap(df.select_dtypes(include='number').corr(), annot=True)

# Data Preprocessing
1. Lower case
2. Tokenization
3. Removing special characters
4. Removing stop words and punctuation
5. stemming : Stripping words to their core or root meaning to improve search and analysis.

In [None]:
# nltk.download('stopwords')
from nltk.corpus import stopwords
stopwords.words('english')

In [None]:
import string

In [None]:
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
ps.stem('loving')

In [None]:
_STOP_WORDS_CACHE = None


def _english_stop_words():
    """Return NLTK's English stop words as a set, reading the corpus only once."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = frozenset(stopwords.words('english'))
    return _STOP_WORDS_CACHE


def transform_text(text):
    """Normalise a raw message into a space-separated string of stemmed tokens.

    Steps: lower-case the text, tokenise it with nltk.word_tokenize, keep only
    alphanumeric tokens that are not English stop words, stem each remaining
    token with the Porter stemmer and join the stems with single spaces.

    Parameters
    ----------
    text : str
        The raw message.

    Returns
    -------
    str
        The cleaned message; empty if no token survives the filters.
    """
    # The stop-word list is loop-invariant: fetch it once as a set instead of
    # re-reading the corpus and scanning a list for every single token.
    stop_words = _english_stop_words()

    # steps 1 & 2: lower case, then tokenise
    tokens = nltk.word_tokenize(text.lower())

    # steps 3 & 4: keep alphanumeric tokens that are not stop words.
    # Punctuation tokens never pass isalnum(), so no separate punctuation
    # check is needed.
    kept = [word for word in tokens if word.isalnum() and word not in stop_words]

    # step 5: stem the surviving tokens
    return " ".join(ps.stem(word) for word in kept)



# transform_text("hi i liked the shows enjoying how are you?")

In [None]:
df['transformed_text'] = df['text'].apply(transform_text)
df.head()

In [None]:
# !pip install wordcloud
from wordcloud import WordCloud
wc = WordCloud(width = 500 , height=500, min_font_size=10,background_color='white')

In [None]:
# To visually highlight the most frequent and important words in text.
# text - spam text
spam_wc = wc.generate(df[df['target']==1]['transformed_text'].str.cat(sep=" "))
plt.figure(figsize = (15,6))
plt.imshow(spam_wc)

In [None]:
ham_wc = wc.generate(df[df['target']==0]['transformed_text'].str.cat(sep=" "))
plt.figure(figsize = (15,6))
plt.imshow(ham_wc)

In [None]:
# create spam corpus to append all the words in transformed text into a list where the target is spam
spam_corpus = [word for msg in df[df['target'] == 1]['transformed_text'].tolist() for word in msg.split()]

# create ham corpus to append all the words in transformed text into a list where the target is ham
ham_corpus = [word for msg in df[df['target'] == 0]['transformed_text'].tolist() for word in msg.split()]

In [None]:
len(ham_corpus)

In [None]:
len(spam_corpus)

In [None]:
from collections import Counter

# create a dataframe with 30 most common words in spam and ham
spam_30_df = pd.DataFrame(Counter(spam_corpus).most_common(30))

ham_30_df = pd.DataFrame(Counter(ham_corpus).most_common(30))

In [None]:
spam_30_df

In [None]:
ham_30_df

In [None]:
# plotting the most commonly used 30 words

sns.barplot(x = spam_30_df[0],y = spam_30_df[1])
plt.xticks(rotation='vertical')
plt.show()

In [None]:
sns.barplot(x = ham_30_df[0],y = ham_30_df[1])
plt.xticks(rotation='vertical')
plt.show()

# Model Building

In [None]:
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
cv = CountVectorizer()

 **Here's an illustrative example:**

**Imagine a corpus of three documents:**

**Document 1:** "The dog ran after the ball."

**Document 2:** "The cat chased the mouse."

**Document 3:** "The dog barked at the mailman."


**Count Vectorizer representation:**

| Word  | Doc 1 | Doc 2 | Doc 3 |
|---|---|---|---|
| the  | 2    | 2    | 2    |
| dog  | 1    | 0    | 1    |
| ran  | 1    | 0    | 0    |
| after | 1    | 0    | 0    |
| ball | 1    | 0    | 0    |
| cat  | 0    | 1    | 0    |
| chased | 0    | 1    | 0    |
| mouse | 0    | 1    | 0    |
| barked | 0    | 0    | 1    |
| mailman | 0    | 0    | 1    |

**TF-IDF representation (normalized for clarity):**

| Word  | Doc 1 | Doc 2 | Doc 3 |
|---|---|---|---|
| the  | 0.15  | 0.15  | 0.15  |
| dog  | 0.31  | 0     | 0.22  |
| ran  | 0.15  | 0     | 0     |
| after | 0.15  | 0     | 0     |
| ball | 0.15  | 0     | 0     |
| cat  | 0     | 0.31  | 0     |
| chased | 0     | 0.31  | 0     |
| mouse | 0     | 0.31  | 0     |
| barked | 0     | 0     | 0.22  |
| mailman | 0     | 0     | 0.22  |

formula: 

TF(t, d) = (Number of times term t appears in document d) / (Total number of terms in document d)

IDF(t) = log(Total number of documents / Number of documents with term t)

(or) smoothed, to avoid division by zero (the default form used by scikit-learn's TfidfVectorizer): IDF(t) = log((1 + Total number of documents) / (1 + Number of documents with term t)) + 1

TF-IDF(t, d) = TF(t, d) * IDF(t)

**Key observations:**

- **Count Vectorizer** gives equal weight to all words, even common ones like "the".
- Represents text as raw word counts.
- Creates a matrix where each row represents a document and each column represents a unique word in the corpus.
- The value in each cell is the count of that word in the corresponding document.
- Simpler approach, but doesn't account for word importance.
- **TF-IDF** highlights more distinctive words like "dog", "cat", "ran", "chased", etc., which are more informative for understanding the content of each document.
- Combines word frequencies with their relative importance in the corpus.
- In this example, TF-IDF would likely be more effective for tasks like text classification or information retrieval, as it better captures the unique characteristics of each document.


In [None]:
X_cv = cv.fit_transform(df['transformed_text']).toarray()

# 6708 words
X_cv.shape

In [None]:
y_cv = df['target'].values
y_cv

In [None]:
# train test split
from sklearn.model_selection import train_test_split

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X_cv,y_cv,test_size=0.2,random_state=2)

In [None]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score

In [None]:
gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()

In [None]:
gnb.fit(X_train,y_train)
y_pred1 = gnb.predict(X_test)
print(accuracy_score(y_test,y_pred1))
print(confusion_matrix(y_test,y_pred1))
print(precision_score(y_test,y_pred1))

In [None]:
mnb.fit(X_train,y_train)
y_pred2 = mnb.predict(X_test)
print(accuracy_score(y_test,y_pred2))
print(confusion_matrix(y_test,y_pred2))
print(precision_score(y_test,y_pred2))

In [None]:
bnb.fit(X_train,y_train)
y_pred3 = bnb.predict(X_test)
print(accuracy_score(y_test,y_pred3))
print(confusion_matrix(y_test,y_pred3))
print(precision_score(y_test,y_pred3))

In [None]:
tfidf = TfidfVectorizer()

In [None]:
# Vectorise the cleaned messages with TF-IDF and pull out the labels.
y_tfidf = df['target'].values
X_tfidf = tfidf.fit_transform(df['transformed_text']).toarray()

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X_tfidf, y_tfidf, test_size=0.2, random_state=2
)


def _fit_and_report(model):
    """Fit model on the training split, print its test-split accuracy,
    confusion matrix and precision (in that order) and return the predictions."""
    model.fit(X_train, y_train)
    predictions = model.predict(X_test)
    for metric in (accuracy_score, confusion_matrix, precision_score):
        print(metric(y_test, predictions))
    return predictions


# Fresh Naive Bayes variants, evaluated in the order Gaussian, Multinomial, Bernoulli.
gnb, mnb, bnb = GaussianNB(), MultinomialNB(), BernoulliNB()
y_pred1 = _fit_and_report(gnb)
y_pred2 = _fit_and_report(mnb)
y_pred3 = _fit_and_report(bnb)

**In spam detection the focus is on false positives, i.e. a message that is not spam (0) but is predicted as spam (1). We can see that the precision is 1 for MNB, with 0 false positives.**

## Comparing other models with MNB

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
# !pip install xgboost
from xgboost import XGBClassifier


**BaggingClassifier**

Base model: Any type of classifier can be used as the base model.
Data sampling: Creates multiple bootstrap samples (random samples with replacement) from the training data.
Feature sampling: Uses all features for each model.
Combines predictions: Uses averaging (for regression) or majority voting (for classification).
**RandomForestClassifier**

Base model: Uses decision trees as the base model.
Data sampling: Same as BaggingClassifier, uses bootstrap sampling.
Feature sampling: Randomly selects a subset of features at each split in each tree, further increasing diversity.
Combines predictions: Same as BaggingClassifier, uses averaging or voting.

**Extra Trees Classifier** (Extremely Randomized Trees) is an ensemble machine learning method that trains multiple randomized decision trees and combines their predictions to make a final decision. It's similar to Random Forest, but with two key differences that make it even more randomized and often faster:

1. No Bootstrap Sampling (by default):

In Random Forest, each tree is trained on a bootstrap sample, i.e. a random sample of the training data drawn with replacement.
Extra Trees, by default (`bootstrap=False` in scikit-learn), trains every tree on the whole training set, so the diversity among trees comes from the randomized splits rather than from resampling the data.


2. Random Split Selection:

Random Forest finds the best feature and split point for each node in a tree.
Extra Trees selects a random split point for each feature and chooses the best one among those random splits. This further increases randomness and reduces training time.


In [None]:
svc = SVC(kernel='sigmoid', gamma=1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth=5)
lrc = LogisticRegression(solver='liblinear', penalty='l1')
rfc = RandomForestClassifier(n_estimators=50, random_state=2)
abc = AdaBoostClassifier(n_estimators=50, random_state=2)
bc = BaggingClassifier(n_estimators=50, random_state=2)
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)
gbdt = GradientBoostingClassifier(n_estimators=50,random_state=2)
xgb = XGBClassifier(n_estimators=50,random_state=2)

In [None]:
clfs = {
    'SVC' : svc,
    'KN' : knc, 
    'NB': mnb, 
    'DT': dtc, 
    'LR': lrc, 
    'RF': rfc, 
    'AdaBoost': abc, 
    'BgC': bc, 
    'ETC': etc,
    'GBDT':gbdt,
    'xgb':xgb
}

In [None]:
def train_classifier(clf,X_train,y_train,X_test,y_test):
    """Fit clf on the training split and score it on the test split.

    Returns a tuple (accuracy, precision) computed from clf's predictions
    for X_test against y_test.
    """
    clf.fit(X_train, y_train)
    predicted = clf.predict(X_test)
    return accuracy_score(y_test, predicted), precision_score(y_test, predicted)

train_classifier(svc,X_train,y_train,X_test,y_test)

In [None]:
# Train and score every candidate classifier on the current split,
# echoing each pair of scores as soon as it is available.
results = []

for name, clf in clfs.items():
    current_accuracy, current_precision = train_classifier(
        clf, X_train, y_train, X_test, y_test
    )

    print("For ", name)
    print("Accuracy - ", current_accuracy)
    print("Precision - ", current_precision)

    results.append((current_accuracy, current_precision))

accuracy_scores = [accuracy for accuracy, _ in results]
precision_scores = [precision for _, precision in results]

# Summary table of accuracy and precision per algorithm, best precision first
performance_df = pd.DataFrame(
    {
        'Algorithm': clfs.keys(),
        'Accuracy': accuracy_scores,
        'Precision': precision_scores,
    }
)
performance_df = performance_df.sort_values('Precision', ascending=False)
performance_df

In [None]:
# to plot the performance based on accuracy and precision of each algorithm
performance_df1 = pd.melt(performance_df, id_vars = "Algorithm")
performance_df1

sns.catplot(x = 'Algorithm', y='value', 
               hue = 'variable',data=performance_df1, kind='bar',height=5)
plt.ylim(0.5,1.0)
plt.xticks(rotation='vertical')
plt.show()

# Model Improvement

* **Improvement 1 :** set max_features=3000 in the TF-IDF vectorizer, so that only the 3000 most frequent terms in the corpus are kept as features.


In [None]:
tfidf = TfidfVectorizer(max_features=3000)

In [None]:
X_tfidf = tfidf.fit_transform(df['transformed_text']).toarray()
y_tfidf = df['target'].values

# train test split
X_train,X_test,y_train,y_test = train_test_split(X_tfidf,y_tfidf,test_size=0.2,random_state=2)

In [None]:
accuracy_scores = []
precision_scores = []

for name,clf in clfs.items():
    
    current_accuracy,current_precision = train_classifier(clf, X_train,y_train,X_test,y_test)
    
    print("For ",name)
    print("Accuracy - ",current_accuracy)
    print("Precision - ",current_precision)
    
    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)

# Create a dataframe to plot accuracy and precision of each algorithm
performance_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy':accuracy_scores,'Precision':precision_scores}).sort_values('Precision',ascending=False)
performance_df

In [None]:
# to plot the performance based on accuracy and precision of each algorithm
performance_df1 = pd.melt(performance_df, id_vars = "Algorithm")
performance_df1

sns.catplot(x = 'Algorithm', y='value', 
               hue = 'variable',data=performance_df1, kind='bar',height=5)
plt.ylim(0.5,1.0)
plt.xticks(rotation='vertical')
plt.show()

**The precision of Random Forest has decreased, while MNB still seems to be performing well. However, RF was previously better than NB in terms of accuracy, so this improvement is not considered.**

# Improvement 2 

In [None]:
X_tfidf = tfidf.fit_transform(df['transformed_text']).toarray()
y_tfidf = df['target'].values

In [None]:
# Rescale every TF-IDF feature with MinMaxScaler.
# StandardScaler is not used because it produces negative values, which
# cannot be passed into the NB algorithm.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
# NOTE(review): the scaler is fitted on the full matrix, before train_test_split
# runs in the next cell, so the test rows influence the scaling. Consider
# fitting on the training split only and transforming the test split.
X_tfidf = scaler.fit_transform(X_tfidf)

In [None]:
# train test split
X_train,X_test,y_train,y_test = train_test_split(X_tfidf,y_tfidf,test_size=0.2,random_state=2)

In [None]:
accuracy_scores = []
precision_scores = []

for name,clf in clfs.items():
    
    current_accuracy,current_precision = train_classifier(clf, X_train,y_train,X_test,y_test)
    
    print("For ",name)
    print("Accuracy - ",current_accuracy)
    print("Precision - ",current_precision)
    
    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)

# Create a dataframe to plot accuracy and precision of each algorithm
performance_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy':accuracy_scores,'Precision':precision_scores}).sort_values('Precision',ascending=False)
print(performance_df)

# to plot the performance based on accuracy and precision of each algorithm
performance_df1 = pd.melt(performance_df, id_vars = "Algorithm")
print(performance_df1)

sns.catplot(x = 'Algorithm', y='value', 
               hue = 'variable',data=performance_df1, kind='bar',height=5)
plt.ylim(0.5,1.0)
plt.xticks(rotation='vertical')
plt.show()


# Improvement 3

In [None]:
# appending the num_character col to X_tfidf
X_tfidf = tfidf.fit_transform(df['transformed_text']).toarray()
X_tfidf = np.hstack((X_tfidf,df['num_characters'].values.reshape(-1,1)))
y_tfidf = df['target'].values


# train test split
X_train,X_test,y_train,y_test = train_test_split(X_tfidf,y_tfidf,test_size=0.2,random_state=2)

accuracy_scores = []
precision_scores = []

for name,clf in clfs.items():
    
    current_accuracy,current_precision = train_classifier(clf, X_train,y_train,X_test,y_test)
    
    print("For ",name)
    print("Accuracy - ",current_accuracy)
    print("Precision - ",current_precision)
    
    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)

# Create a dataframe to plot accuracy and precision of each algorithm
performance_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy':accuracy_scores,'Precision':precision_scores}).sort_values('Precision',ascending=False)
print(performance_df)

# to plot the performance based on accuracy and precision of each algorithm
performance_df1 = pd.melt(performance_df, id_vars = "Algorithm")
print(performance_df1)

sns.catplot(x = 'Algorithm', y='value', 
               hue = 'variable',data=performance_df1, kind='bar',height=5)
plt.ylim(0.5,1.0)
plt.xticks(rotation='vertical')
plt.show()


**From the accuracy and precision scores, we can infer that neither scaling nor appending num_characters to the input seems to improve the model.**

# Improvement 4:

**A voting classifier is an ensemble machine learning model that combines the predictions of multiple individual models to make a final, more robust prediction. It works by aggregating the votes or predicted probabilities from each model and then selecting the class that receives the most support.**

**Types of voting strategies:**

**Hard voting**: Each base model votes for a single class, and the class with the most votes wins.

**Soft voting**: Each base model predicts a probability for each class, and the probabilities are averaged across models. The class with the highest average probability wins.

In [None]:
# Voting Classifier

rfc = RandomForestClassifier(n_estimators=50, random_state=2)
mnb = MultinomialNB()
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)

from sklearn.ensemble import VotingClassifier

voting = VotingClassifier(estimators=[('rf', rfc), ('nb', mnb), ('et', etc)],voting='soft')
voting.fit(X_train,y_train)

In [None]:
y_pred = voting.predict(X_test)
print("Accuracy",accuracy_score(y_test,y_pred))
print("Precision",precision_score(y_test,y_pred))

# Improvement 5:
simple example
Voting: You listen to everyone's opinions and go with the most popular choice. It's like a quick poll.
Stacking: You ask a wise mentor (meta-model) to consider everyone's suggestions and make the final decision based on their insights. It's like getting expert guidance.

Train multiple "expert" models: Each model learns from the data in its own way.
Collect their predictions: Each model makes predictions for the same data points.
Create a new dataset: Use these predictions as features for a new dataset.
Train a "master model": This model learns how to combine the expert predictions effectively.
Make final predictions: When given new data, each expert model makes predictions, and the master model uses those predictions to make the ultimate decision.

In [None]:
estimators=[('rf', rfc), ('nb', mnb), ('et', etc)]
final_estimator=RandomForestClassifier()

from sklearn.ensemble import StackingClassifier
clf = StackingClassifier(estimators=estimators, final_estimator=final_estimator)
clf.fit(X_train,y_train)

y_pred = clf.predict(X_test)
print("Accuracy",accuracy_score(y_test,y_pred))
print("Precision",precision_score(y_test,y_pred))

In [None]:
import pickle

# The rfc instance created for the voting/stacking cells was only ever handed
# to VotingClassifier / StackingClassifier, which fit clones and leave the
# original untouched, so it has to be fitted here before it is saved.
# It is trained on the vectorizer's own output (text features only) so that the
# saved model expects exactly what the saved vectorizer produces in predict();
# X_train from Improvement 3 carries an extra num_characters column that
# predict() never supplies. All labelled messages are used for this final fit.
X_deploy = tfidf.transform(df['transformed_text'])
rfc.fit(X_deploy, df['target'].values)

# Context managers guarantee both files are flushed and closed.
with open('application/vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(tfidf, vectorizer_file)
with open('application/model.pkl', 'wb') as model_file:
    pickle.dump(rfc, model_file)

In [None]:
# fit() needs the training features and labels; calling it with no arguments
# raises TypeError.
rfc.fit(X_train, y_train)

In [None]:
def predict(input_sms, vectorizer_path='vectorizer.pkl', model_path='model.pkl'):
    """Classify a single SMS message as "Spam" or "Not Spam".

    Parameters
    ----------
    input_sms : str
        The raw message text.
    vectorizer_path : str, optional
        Path of the pickled, fitted vectorizer. Defaults to 'vectorizer.pkl'
        in the current working directory.
    model_path : str, optional
        Path of the pickled, fitted classifier. Defaults to 'model.pkl' in the
        current working directory.

    Returns
    -------
    str
        "Spam" if the model predicts class 1, otherwise "Not Spam".

    Notes
    -----
    This notebook writes the artefacts to 'application/vectorizer.pkl' and
    'application/model.pkl'; pass those paths when calling from the notebook's
    own directory.
    """
    # NOTE: pickle.load can execute arbitrary code, so only load artefacts
    # produced by this project. The with-blocks close the files promptly.
    with open(vectorizer_path, 'rb') as vectorizer_file:
        tfidf = pickle.load(vectorizer_file)
    with open(model_path, 'rb') as model_file:
        model = pickle.load(model_file)

    # 1. preprocess
    transformed_sms = transform_text(input_sms)
    # 2. vectorize
    vector_input = tfidf.transform([transformed_sms])
    # 3. predict
    result = model.predict(vector_input)[0]
    # 4. Display
    return "Spam" if result == 1 else "Not Spam"

In [None]:
predict("sfsgdgsfbfb fgsfdgsdfg")