In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns



In [2]:
df = pd.read_csv('spam.csv',encoding='ISO-8859-1')
df.sample(5)


KeyboardInterrupt



In [None]:
df.shape

In [None]:
# 1. Data Cleaning
# 2. EDA
# 3. Text Preprocessing (stemming, vectorization,removal of stop words)
# 4. Model Building
# 5. Model Evaluation
# 6. Improvements
# 7. Converting into Website
# 8. Deployment of Website on Heroku

## 1. Data Cleaning

In [None]:
df.info()

In [None]:
# Drop last 3 columns

df.drop(columns = ['Unnamed: 2','Unnamed: 3','Unnamed: 4'],inplace = True)

In [None]:
df.sample(2)

In [None]:
# Rename column names
df.rename(columns = {'v1':'target','v2':'message'},inplace = True)

In [None]:
df.sample(2)

In [None]:
# Applying LabelEncoder on target column

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
df['target'] = le.fit_transform(df['target'])

In [None]:
df.head(5)

In [None]:
# Missing values

df.isna().sum()

In [None]:
# Check for duplicates

df.duplicated().sum()

In [None]:
# Remove duplicates

df = df.drop_duplicates(keep = 'first')

In [None]:
df.duplicated().sum()

In [None]:
df.shape

## 2. EDA

In [None]:
print(df.target.value_counts())
df.target.value_counts().plot(kind = 'bar')
plt.show()

In [None]:
# Class balance as percentages, then as a pie chart.
target_counts = df['target'].value_counts()
print(target_counts/df['target'].count()*100)

# value_counts() orders the classes by frequency, so look up each wedge's label
# from its encoded class instead of assuming ham is always the majority class.
wedge_labels = le.inverse_transform(target_counts.index)

plt.figure(figsize = (15,10))
plt.pie(target_counts,labels = wedge_labels,autopct = '%0.2f')
plt.show()

<b>It shows the data is imbalanced.</b>

### Creation of Basic Features from 'message' column

In [None]:
len('I am')

In [None]:
# Length of message - Number of Characters in message
df['num_characters'] =  df['message'].apply(len) # OR df['message'].str.len()

In [None]:
df.head(2)

In [None]:
!pip install nltk

In [None]:
import nltk

In [None]:
# Resources needed further down:
#   punkt / punkt_tab - tokenizer models behind word_tokenize and sent_tokenize
#                       (newer NLTK releases look for 'punkt_tab')
#   stopwords         - corpus read by nltk.corpus.stopwords in the preprocessing section
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

In [None]:
# Number of Words 

In [None]:
df['num_words'] = df['message'].apply(lambda x: len(nltk.word_tokenize(x)))  # OR df['message'].apply(lambda row: len(row.split(" ")))

In [None]:
# Number of Sentences

df['num_sentences'] = df['message'].apply(lambda x:len(nltk.sent_tokenize(x)))

In [None]:
df.head(2)

In [None]:
df[['num_characters','num_words','num_sentences']].describe()

In [None]:
# ham messages

df[df['target']==0][['num_characters','num_words','num_sentences']].describe()

In [None]:
# spam messages

df[df['target']==1][['num_characters','num_words','num_sentences']].describe()

In [None]:
sns.histplot(df[df['target']==0]['num_characters'])
sns.histplot(df[df['target']==1]['num_characters'],color = 'red')
plt.show()

<b>It shows that most ham messages have fewer characters than spam messages.</b>

In [None]:
sns.histplot(df[df['target']==0]['num_words'])
sns.histplot(df[df['target']==1]['num_words'],color = 'red')
plt.show()

<b>It shows that most ham messages have fewer words than spam messages.</b>

In [None]:
sns.histplot(df[df['target']==0]['num_sentences'])
sns.histplot(df[df['target']==1]['num_sentences'],color = 'red')
plt.show()

<b>It shows that most ham messages have fewer sentences than spam messages.</b>

In [None]:
# sns.pairplot builds its own figure, so calling plt.figure() first only emitted an
# extra, empty figure and never affected the size of the grid.
sns.pairplot(df,hue='target')
plt.show()

<b>It shows the data has outliers.</b>

In [None]:
# To show correlation in data
# Only numeric columns can be correlated: 'message' holds text, and DataFrame.corr()
# raises on non-numeric columns in pandas >= 2.0 instead of silently dropping them.
plt.figure(figsize = (15,12))
sns.heatmap(df.select_dtypes(include='number').corr(),annot = True)
plt.show()

<b>It shows that the input variables (num_characters, num_words and num_sentences) are strongly correlated with each other, so a multicollinearity problem exists. We will keep only one of them: num_characters, since it has a higher correlation with the target variable than the other columns.</b>

## 3. Data Preprocessing

i. Lower case <br>
ii. Tokenization<br>
iii. Removing Special Characters<br>
iv. Removing Stop Words and Punctuation<br>
v. Stemming


In [None]:
# List of stop words in English
from nltk.corpus import stopwords
stopwords.words('english')

In [None]:
# Punctuation marks
import string
string.punctuation

In [None]:
import re
from nltk.stem.porter import PorterStemmer
ps = PorterStemmer()
# lemma = nltk.WordNetLemmatizer() # define lemmatizer


In [None]:
_STOP_WORDS = None  # filled on first use by _english_stop_words()


def _english_stop_words():
    """Return NLTK's English stop words as a set, reading the corpus only once."""
    global _STOP_WORDS
    if _STOP_WORDS is None:
        _STOP_WORDS = frozenset(stopwords.words("english"))
    return _STOP_WORDS


def transform_text(text):
    """Normalise a raw message into a space-separated string of stemmed words.

    Steps: lowercase -> NLTK word tokenization -> replace every non-letter with a
    space -> drop English stop words -> Porter-stem each remaining word.

    Parameters
    ----------
    text : str
        The raw message.

    Returns
    -------
    str
        The cleaned words joined by single spaces ('' if nothing survives).
    """
    tokens = nltk.word_tokenize(text.lower())

    # Join the tokens with spaces rather than calling str() on the list: str() goes
    # through repr(), which escapes non-printable characters (e.g. '\x92'), and the
    # letters of those escape sequences ('x', 'u', ...) would survive as fake words.
    letters_only = re.sub("[^a-zA-Z]", " ", " ".join(tokens))

    # split() with no argument also collapses repeated spaces and trims the ends.
    # Punctuation needs no separate pass: it was already replaced by the regex above.
    stop_words = _english_stop_words()
    kept_words = [word for word in letters_only.split() if word not in stop_words]

    return ' '.join(ps.stem(word) for word in kept_words)

In [None]:
transform_text("I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today.")

In [None]:
df['message'] = df['message'].apply(transform_text)

In [None]:
df.head(5)

In [None]:
# Remove url links
def remove_url(text):
    """Return ``text`` with every http(s):// or www. link deleted.

    A link runs from its 'http://', 'https://' or 'www.' prefix up to the next
    whitespace character. Surrounding whitespace is left untouched.
    """
    return re.sub(r'https?://\S+|www\.\S+', '', text)

In [None]:
remove_url('I am 12, www.google.com  23n ? ')

In [None]:
# NOTE(review): this pass cannot match anything at this point. transform_text has
# already replaced every non-letter (including ':', '/' and '.') with a space, so the
# 'http://' / 'www.' markers that the pattern needs are gone. To actually drop
# links, apply remove_url to the raw text before transform_text.
df['message'] = df['message'].apply(remove_url)

In [None]:
df['message'][0]

In [None]:
# Forming WordCloud which will highlight important words in both ham and spam
from wordcloud import WordCloud
wc = WordCloud(width=500,height=500,min_font_size=10,background_color='white')

In [None]:
## Spam messages
spam_wc = wc.generate(df[df['target'] == 1]['message'].str.cat(sep=" "))

In [None]:
plt.figure(figsize=(15,6))
plt.imshow(spam_wc)

In [None]:
## Ham messages
ham_wc = wc.generate(df[df['target'] == 0]['message'].str.cat(sep=" "))

In [None]:
plt.figure(figsize=(15,6))
plt.imshow(ham_wc)

In [None]:
## Top words in each category
### Spam category
# Flatten all spam messages into a single list of whitespace-separated tokens.
spam_messages = df.loc[df['target']==1, 'message'].tolist()
spam_corpus = [token for message in spam_messages for token in message.split()]

In [None]:
len(spam_corpus)

In [None]:
from collections import Counter
Counter(spam_corpus) # Dictionary will be craeted showing how many times each word has appeared

In [None]:
Counter(spam_corpus).most_common(30) # Most common words in spam messages

In [None]:
pd.DataFrame(Counter(spam_corpus).most_common(30))


In [None]:
pd.DataFrame(Counter(spam_corpus).most_common(30))[0]

In [None]:
# Bar chart of the 30 most frequent tokens in spam messages.
# Build the frequency table once and give its columns meaningful names.
top_spam_words = pd.DataFrame(Counter(spam_corpus).most_common(30),columns=['word','count'])

plt.figure(figsize=(15,10))
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=top_spam_words['word'],y=top_spam_words['count'])
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# Ham category

# Flatten all ham messages into a single list of whitespace-separated tokens.
ham_messages = df.loc[df['target']==0, 'message'].tolist()
ham_corpus = [token for message in ham_messages for token in message.split()]

In [None]:
Counter(ham_corpus).most_common(30)

In [None]:
# Bar chart of the 30 most frequent tokens in ham messages.
# Build the frequency table once and give its columns meaningful names.
top_ham_words = pd.DataFrame(Counter(ham_corpus).most_common(30),columns=['word','count'])

plt.figure(figsize=(15,10))
# x and y are passed by keyword: seaborn >= 0.12 no longer accepts them positionally.
sns.barplot(x=top_ham_words['word'],y=top_ham_words['count'])
plt.xticks(rotation='vertical')
plt.show()

## Model Building

In [None]:
## Creation of Bag of Words

from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

X = cv.fit_transform(df['message']).toarray() # Use toarray() to convert sparse array into dense array
X

In [None]:
cv.fit_transform(df['message'])

In [None]:
X.shape # total 5169 sms, and 6216 words

In [None]:
y = df['target'].values
y

In [None]:
## Applying train test split
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2, random_state =2)

In [None]:
X_train.shape

In [None]:
# On textual based data,naive bayes algorithm performs better. 

In [None]:
from sklearn.naive_bayes import GaussianNB,MultinomialNB,BernoulliNB
from sklearn.metrics import accuracy_score,confusion_matrix,precision_score

In [None]:
gnb = GaussianNB()
mnb = MultinomialNB()
bnb = BernoulliNB()

In [None]:
gnb.fit(X_train,y_train)
y_pred1 = gnb.predict(X_test)
print(accuracy_score(y_test,y_pred1))
print(confusion_matrix(y_test,y_pred1))
print(precision_score(y_test,y_pred1))

In [None]:
mnb.fit(X_train,y_train)
y_pred2 = mnb.predict(X_test)
print(accuracy_score(y_test,y_pred2))
print(confusion_matrix(y_test,y_pred2))
print(precision_score(y_test,y_pred2))

In [None]:
bnb.fit(X_train,y_train)
y_pred3 = bnb.predict(X_test)
print(accuracy_score(y_test,y_pred3))
print(confusion_matrix(y_test,y_pred3))
print(precision_score(y_test,y_pred3))

<b>In the case of an imbalanced dataset, the precision score matters a lot.</b>

In [None]:
# Creation of Tfidf Vectorizer

from sklearn.feature_extraction.text import TfidfVectorizer
# Cap the vocabulary at 3000 terms.
tfidf = TfidfVectorizer(max_features = 3000)

# NOTE(review): the vectorizer is fitted on every message and the train/test split
# happens afterwards, so the vocabulary and IDF weights are learned from rows that
# end up in the test set. Consider splitting first and fitting on the training
# messages only - TODO confirm whether this leakage is acceptable here.
# .toarray() densifies the sparse TF-IDF matrix; this replaces the bag-of-words X.
X = tfidf.fit_transform(df['message']).toarray()
X

In [None]:
# Applying MinMaxScaling because standardscaling also gives negative values which naive bayes does not accept

"""from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
X = scaler.fit_transform(X)
"""

In [None]:
# appending the num_character col to X
# X = np.hstack((X,df['num_characters'].values.reshape(-1,1)))

In [None]:
X.shape

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.2, random_state =2)

In [None]:
gnb.fit(X_train,y_train)
y_pred1 = gnb.predict(X_test)
print(accuracy_score(y_test,y_pred1))
print(confusion_matrix(y_test,y_pred1))
print(precision_score(y_test,y_pred1))

In [None]:
bnb.fit(X_train,y_train)
y_pred3 = bnb.predict(X_test)
print(accuracy_score(y_test,y_pred3))
print(confusion_matrix(y_test,y_pred3))
print(precision_score(y_test,y_pred3))

In [None]:
mnb.fit(X_train,y_train)
y_pred2 = mnb.predict(X_test)
print(accuracy_score(y_test,y_pred2))
print(confusion_matrix(y_test,y_pred2))
print(precision_score(y_test,y_pred2))

<b>This performs best: maximum precision score (no false positives) and a good accuracy score.</b>

<b>So, we have chosen tfidf and mnb.</b>

In [None]:
# Comparing results with other algorithms

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier

In [None]:
svc = SVC(kernel='sigmoid', gamma=1.0)
knc = KNeighborsClassifier()
mnb = MultinomialNB()
dtc = DecisionTreeClassifier(max_depth=5)
lrc = LogisticRegression(solver='liblinear', penalty='l1')
rfc = RandomForestClassifier(n_estimators=50, random_state=2)
abc = AdaBoostClassifier(n_estimators=50, random_state=2)
bc = BaggingClassifier(n_estimators=50, random_state=2)
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)
gbdt = GradientBoostingClassifier(n_estimators=50,random_state=2)
xgb = XGBClassifier(n_estimators=50,random_state=2)

In [None]:
clfs = {
    'SVC' : svc,
    'KN' : knc, 
    'NB': mnb, 
    'DT': dtc, 
    'LR': lrc, 
    'RF': rfc, 
    'AdaBoost': abc, 
    'BgC': bc, 
    'ETC': etc,
    'GBDT':gbdt,
    'xgb':xgb
}

In [None]:
def train_classifier(clf,X_train,y_train,X_test,y_test):
    """Fit ``clf`` on the training split and score it on the test split.

    Returns
    -------
    tuple
        ``(accuracy, precision)`` of the predictions made for ``X_test``,
        measured against ``y_test``.
    """
    clf.fit(X_train, y_train)
    predictions = clf.predict(X_test)

    return accuracy_score(y_test, predictions), precision_score(y_test, predictions)

In [None]:
print('SVC: ',train_classifier(svc,X_train,y_train,X_test,y_test))
print('KNeighborsClassifier: ',train_classifier(knc,X_train,y_train,X_test,y_test))
print('MultinomialNB: ',train_classifier(mnb,X_train,y_train,X_test,y_test))
print('DecisionTreeClassifier: ',train_classifier(dtc,X_train,y_train,X_test,y_test))
print('LogisticRegression: ',train_classifier(lrc,X_train,y_train,X_test,y_test))
print('RandomForestClassifier: ',train_classifier(rfc,X_train,y_train,X_test,y_test))
print('AdaBoostClassifier: ',train_classifier(abc,X_train,y_train,X_test,y_test))
print('BaggingClassifier: ',train_classifier(bc,X_train,y_train,X_test,y_test))
print('ExtraTreesClassifier: ',train_classifier(etc,X_train,y_train,X_test,y_test))
print('GradientBoostingClassifier: ',train_classifier(gbdt,X_train,y_train,X_test,y_test))
print('XGBClassifier: ',train_classifier(xgb,X_train,y_train,X_test,y_test))

In [None]:
# OR

In [None]:
accuracy_scores = []
precision_scores = []

for name,clf in clfs.items():
    
    current_accuracy,current_precision = train_classifier(clf, X_train,y_train,X_test,y_test)
    
    print("For ",name)
    print("Accuracy - ",current_accuracy)
    print("Precision - ",current_precision)
    
    accuracy_scores.append(current_accuracy)
    precision_scores.append(current_precision)

In [None]:
performance_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy':accuracy_scores,'Precision':precision_scores}).sort_values('Precision',ascending=False)
performance_df

In [None]:
performance_df1 = pd.melt(performance_df, id_vars = "Algorithm")
performance_df1

In [None]:
sns.catplot(x = 'Algorithm', y='value', 
               hue = 'variable',data=performance_df1, kind='bar',height=5)
plt.ylim(0.5,1.0)
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# model improve
# 1. Change the max_features parameter of TfIdf

In [None]:
#temp_df = pd.DataFrame({'Algorithm':clfs.keys(),'Accuracy_max_ft_3000':accuracy_scores,'Precision_max_ft_3000':precision_scores}).sort_values('Precision_max_ft_3000',ascending=False)
#temp_df

In [None]:
#new_df = performance_df.merge(temp_df,on='Algorithm')
#new_df

In [None]:
#new_df_scaled = new_df.merge(performance_df,on='Algorithm')
#new_df_scaled 

In [None]:
#new_d = new_df_scaled.merge(performance_df,on='Algorithm')
#new_d

<b>Results show scaling is not favorable. Only tfidf with max_features = 3000 gives better results.</b>

In [None]:
# Voting Classifier
svc = SVC(kernel='sigmoid', gamma=1.0,probability=True)
mnb = MultinomialNB()
etc = ExtraTreesClassifier(n_estimators=50, random_state=2)

from sklearn.ensemble import VotingClassifier

In [None]:
voting = VotingClassifier(estimators=[('svm', svc), ('nb', mnb), ('et', etc)],voting='soft')

In [None]:
voting.fit(X_train,y_train)

In [None]:
y_pred = voting.predict(X_test)
print("Accuracy",accuracy_score(y_test,y_pred))
print("Precision",precision_score(y_test,y_pred))

In [None]:
# Applying stacking
estimators=[('svm', svc), ('nb', mnb), ('et', etc)]
final_estimator=RandomForestClassifier()

In [None]:
from sklearn.ensemble import StackingClassifier

In [None]:
clf = StackingClassifier(estimators=estimators, final_estimator=final_estimator)

In [None]:
clf.fit(X_train,y_train)
y_pred = clf.predict(X_test)
print("Accuracy",accuracy_score(y_test,y_pred))
print("Precision",precision_score(y_test,y_pred))

<b>We will go for tfidf and mnb, prioritising the precision score since this is an imbalanced dataset.</b>

In [None]:
"""
import pickle
pickle.dump(tfidf,open('vectorizer.pkl','wb'))
pickle.dump(mnb,open('model.pkl','wb'))
"""

In [None]:
# Input statement(s) to classify
test_review = ['This is a bad movie']

# `mnb` was re-created (unfitted) for the voting/stacking experiments, and those
# ensembles train clones of the estimators they are given, so the instance bound to
# `mnb` has to be fitted on the TF-IDF training split before it can predict.
mnb.fit(X_train,y_train)

# Convert to numbers. The vectorizer was fitted on transform_text() output, so new
# messages must go through the same preprocessing before they are vectorized.
test_vector = tfidf.transform([transform_text(message) for message in test_review])
test_vector = test_vector.toarray()

# Decode the predicted class back to its original label
text_predict_class = le.inverse_transform(mnb.predict(test_vector))
print(test_review[0], 'is: ',text_predict_class[0])