In [None]:
pip uninstall matplotlib

In [None]:
pip install matplotlib==3.7.3

In [None]:
import numpy as np
import pandas as pd

In [None]:
# Load the raw dataset. If the file is not valid UTF-8 (pandas' default),
# retry as latin-1, which accepts any byte sequence.
# NOTE(review): confirm the real encoding of spam.csv and pin it explicitly.
try:
    df = pd.read_csv('spam.csv')
except UnicodeDecodeError:
    df = pd.read_csv('spam.csv', encoding='latin-1')

In [None]:
df.sample(5)

In [None]:
df.shape

In [None]:
# 1. Data cleaning
# 2. EDA
# 3. Text Preprocessing
# 4. Model building
# 5. Evaluation
# 6. Improvement
# 7. Website
# 8. Deploy

1. Data Cleaning

In [None]:
df.info()

In [None]:
# drop last 3 cols
# Rebinding instead of inplace=True, plus errors='ignore', lets this cell be
# re-run after the columns are already gone without raising KeyError.
df = df.drop(columns=['Unnamed: 2','Unnamed: 3','Unnamed: 4'], errors='ignore')

In [None]:
df.sample(5)

In [None]:
# Give v1/v2 descriptive names, then preview five random rows.
column_names = {'v1': 'target', 'v2': 'text'}
df = df.rename(columns=column_names)
df.sample(5)

In [None]:
from sklearn.preprocessing import LabelEncoder
encoder = LabelEncoder()

In [None]:
# Replace the label column with integer codes produced by the LabelEncoder.
# Later cells treat 0 as ham and 1 as spam (see the #ham / #spam filters).
df['target'] = encoder.fit_transform(df['target'])

In [None]:
df.head()

In [None]:
# missing values
df.isnull().sum()

In [None]:
# check for duplicate values
df.duplicated().sum()

In [None]:
# remove duplicates
# .copy() makes the de-duplicated frame an independent object, so the column
# assignments in later cells cannot be flagged as writes to a slice of the
# original frame (SettingWithCopyWarning on pandas versions without copy-on-write).
df = df.drop_duplicates(keep='first').copy()

In [None]:
df.duplicated().sum()

In [None]:
df.shape

2. EDA

In [None]:
df.head()

In [None]:
df['target'].value_counts()

In [None]:
import matplotlib.pyplot as plt
# Take the slice labels from the counted classes themselves rather than
# assuming value_counts() happens to list ham first (0 = ham, 1 = spam).
target_counts = df['target'].value_counts()
class_names = {0: 'ham', 1: 'spam'}
slice_labels = [class_names[code] for code in target_counts.index]
plt.pie(target_counts, labels=slice_labels, autopct="%0.2f")
plt.show()

In [None]:
# Data is imbalanced

In [None]:
import nltk

In [None]:
!pip install nltk

In [None]:
# word_tokenize / sent_tokenize need the Punkt tokenizer data. Recent NLTK
# releases look it up under the 'punkt_tab' id instead of 'punkt', so fetch
# both; whichever one the installed version does not use is simply ignored.
nltk.download('punkt')
nltk.download('punkt_tab')

In [None]:
df['num_characters'] = df['text'].apply(len)

In [None]:
df.head()

In [None]:
# Number of tokens per message: tokenize each text, then count the tokens.
token_lists = df['text'].map(nltk.word_tokenize)
df['num_words'] = token_lists.map(len)

In [None]:
df.head()

In [None]:
# Number of sentences per message: split each text into sentences, then count them.
sentence_lists = df['text'].map(nltk.sent_tokenize)
df['num_sentences'] = sentence_lists.map(len)

In [None]:
df.head()

In [None]:
df[['num_characters','num_words','num_sentences']].describe()

In [None]:
# ham (target == 0): summary statistics of the three length features
length_features = ['num_characters', 'num_words', 'num_sentences']
df.loc[df['target'] == 0, length_features].describe()

In [None]:
# spam (target == 1): summary statistics of the three length features
length_features = ['num_characters', 'num_words', 'num_sentences']
df.loc[df['target'] == 1, length_features].describe()

In [None]:
import seaborn as sns

In [None]:
import warnings
warnings.filterwarnings("ignore")

In [None]:
# Overlay the character-count distributions of ham (0) and spam (1).
# Each series is labelled and a legend is drawn so the two histograms can be
# told apart without reading the code.
plt.figure(figsize=(12,6))
sns.histplot(df[df['target'] == 0]['num_characters'], label='ham')
sns.histplot(df[df['target'] == 1]['num_characters'], color='red', label='spam')
plt.legend()
plt.show()

In [None]:
# Overlay the word-count distributions of ham (0) and spam (1).
# Each series is labelled and a legend is drawn so the two histograms can be
# told apart without reading the code.
plt.figure(figsize=(12,6))
sns.histplot(df[df['target'] == 0]['num_words'], label='ham')
sns.histplot(df[df['target'] == 1]['num_words'], color='red', label='spam')
plt.legend()
plt.show()

In [None]:
sns.pairplot(df,hue='target')

In [None]:
# Pairwise correlations between target and the three length features.
# The raw text column is dropped first because it is not numeric.
df.drop(columns=['text']).corr()

In [None]:
sns.heatmap(df.drop(columns=['text']).corr())

3. Data Preprocessing
- Lower case
- Tokenization
- Removing special characters
- Removing stop words and punctuation
- Stemming

In [None]:
nltk.download('stopwords')

In [None]:
import nltk
from nltk.corpus import stopwords

In [None]:
import string 

In [None]:
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [None]:
ps = PorterStemmer()

In [None]:
def _english_stopwords():
    """Return NLTK's English stop words as a frozenset, built only once.

    ``stopwords.words('english')`` hands back a fresh list on every call, so
    the set is cached on the function object after the first use and later
    lookups are constant-time membership tests.
    """
    cached = getattr(_english_stopwords, "_cache", None)
    if cached is None:
        cached = frozenset(stopwords.words('english'))
        _english_stopwords._cache = cached
    return cached


def transform_text(text):
    """Normalise a raw message into a space-separated string of stemmed tokens.

    Steps, in order: lowercase the text, tokenize it with
    ``nltk.word_tokenize``, keep only alphanumeric tokens, drop English stop
    words, and Porter-stem what remains.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The surviving stemmed tokens joined by single spaces (an empty string
        when no token survives).
    """
    stop_words = _english_stopwords()
    tokens = nltk.word_tokenize(text.lower())

    # No separate punctuation test is needed: a token that passes isalnum()
    # is non-empty and purely alphanumeric, so it can never be found inside
    # string.punctuation.
    kept = [tok for tok in tokens if tok.isalnum() and tok not in stop_words]

    return " ".join(ps.stem(tok) for tok in kept)

In [None]:
transform_text("I'm gonna be home soon and i don't want to talk about this stuff anymore tonight, k? I've cried enough today.")

In [None]:
#df['text'][10]

In [None]:
df


In [None]:
# Label-based lookup: the text of the row whose index label is 10.
df.loc[10, 'text']

In [None]:
# stem means root form
# PorterStemmer is already imported and `ps` already created in earlier cells,
# so the demo reuses that instance instead of importing and rebuilding it.
ps.stem('eating')

In [None]:
df['transformed_text'] = df['text'].apply(transform_text)

In [None]:
df.head()

In [None]:
pip install wordcloud

In [None]:
from wordcloud import WordCloud
wc = WordCloud(width=500,height=500,min_font_size=10,background_color='white')

In [None]:
# Build the spam cloud on its own WordCloud instance. generate() returns the
# instance it was called on, so a cloud produced from the shared `wc` would be
# overwritten the next time `wc.generate(...)` is called.
spam_text = df[df['target'] == 1]['transformed_text'].str.cat(sep=" ")
spam_wc = WordCloud(width=500,height=500,min_font_size=10,background_color='white').generate(spam_text)

In [None]:
plt.figure(figsize=(15,6))
plt.imshow(spam_wc)

In [None]:
# Build the ham cloud on its own WordCloud instance. generate() returns the
# instance it was called on, so regenerating on the shared `wc` would overwrite
# any earlier cloud that was produced from that same object.
ham_text = df[df['target'] == 0]['transformed_text'].str.cat(sep=" ")
ham_wc = WordCloud(width=500,height=500,min_font_size=10,background_color='white').generate(ham_text)

In [None]:
plt.figure(figsize=(15,6))
plt.imshow(ham_wc)

In [None]:
df.head()

In [None]:
# Flatten the transformed spam messages into one list of words.
spam_messages = df[df['target'] == 1]['transformed_text'].tolist()
spam_corpus = [token for message in spam_messages for token in message.split()]
        

In [None]:
# Preview only the first few words; echoing the whole list floods the cell output.
spam_corpus[:10]

In [None]:
len(spam_corpus)

In [None]:
from collections import Counter
Counter(spam_corpus).most_common(30)

In [None]:
pd.DataFrame(Counter(spam_corpus).most_common(30))

In [None]:
# Bar chart of the 30 most frequent spam words (column 0 = word, column 1 = count).
top_spam_words = pd.DataFrame(Counter(spam_corpus).most_common(30))
sns.barplot(x=top_spam_words[0], y=top_spam_words[1], alpha=0.8)
plt.xticks(rotation='vertical')
plt.show()

In [None]:
# Flatten the transformed ham messages into one list of words.
ham_messages = df[df['target'] == 0]['transformed_text'].tolist()
ham_corpus = [token for message in ham_messages for token in message.split()]

In [None]:
len(ham_corpus)

In [None]:
from collections import Counter
# Bar chart of the 30 most frequent ham words (column 0 = word, column 1 = count).
top_ham_words = pd.DataFrame(Counter(ham_corpus).most_common(30))
sns.barplot(x=top_ham_words[0], y=top_ham_words[1])
plt.xticks(rotation='vertical')
plt.show()

4. Model Building