In [1]:
# Smoke test: confirms the kernel is alive and executing cells.
print("Kernel working fine.")

Kernel working fine.


## Importing the necessary libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

from wordcloud import WordCloud

import nltk
from nltk.corpus import stopwords

# Fetch the NLTK data packages into the local nltk_data directory.
# 'stopwords' backs nltk.corpus.stopwords, which transform_text reads.
nltk.download('stopwords')
# NOTE(review): no NLTK tokenizer call is visible in this part of the notebook
# (tokenization is done with spaCy) — confirm 'punkt' is still needed.
nltk.download('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/atharvabot7/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/atharvabot7/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [3]:
# Read the labelled spam/ham message dataset; the path is relative to the
# notebook's working directory.
df = pd.read_csv('spam.csv')
# Preview the first ten rows to inspect the raw column layout.
df.head(10)

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
5,spam,FreeMsg Hey there darling it's been 3 week's n...,,,
6,ham,Even my brother is not like to speak with me. ...,,,
7,ham,As per your request 'Melle Melle (Oru Minnamin...,,,
8,spam,WINNER!! As a valued network customer you have...,,,
9,spam,Had your mobile 11 months or more? U R entitle...,,,


In [4]:
# Discard the three unnamed trailing columns; only v1 (label) and v2 (message) remain.
unused_columns = ['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4']
df = df.drop(columns=unused_columns)
df.head(5)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [6]:
# Give the label and message columns descriptive names.
column_names = {'v1': 'target', 'v2': 'text'}
df = df.rename(columns=column_names)
df.head()

Unnamed: 0,target,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## Data Preprocessing

In [7]:
from sklearn.preprocessing import LabelEncoder

# Replace the string labels with integer codes; in the rows displayed below,
# ham is encoded as 0 and spam as 1.
encoder = LabelEncoder()
df['target'] = encoder.fit_transform(df['target'])

# Preview the frame with the encoded target column.
df.head()

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


In [8]:
# Number of rows that are exact repeats of an earlier row.
df.duplicated().sum()

np.int64(403)

In [9]:
# Row count before duplicates are removed.
len(df)

5572

In [12]:
# drop_duplicates returns a new DataFrame and leaves the original untouched, so
# the result must be assigned back; the bare call only displayed the
# de-duplicated frame while df kept all of its duplicate rows.
df = df.drop_duplicates(keep='first')
df

Unnamed: 0,target,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,1,This is the 2nd time we have tried 2 contact u...
5568,0,Will Ì_ b going to esplanade fr home?
5569,0,"Pity, * was in mood for that. So...any other s..."
5570,0,The guy did some bitching but I acted like i'd...


In [13]:
# Re-check the row count after the duplicate-removal step.
len(df)

5169

## Feature Engineering

In [15]:
from nltk.stem import PorterStemmer
import string 
import spacy

# Load spaCy's "en_core_web_sm" pipeline; transform_text uses it to tokenize text.
nlp = spacy.load("en_core_web_sm")

# Porter stemmer instance that transform_text uses to reduce tokens to stems.
ps = PorterStemmer()

In [16]:
_ENGLISH_STOPWORDS = None  # filled lazily by _english_stopwords()


def _english_stopwords():
    """Return NLTK's English stopwords as a frozenset, built once and then reused."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        # stopwords.words() rebuilds a list on every call; cache it as a set so
        # membership tests are O(1) and the corpus is only read once.
        _ENGLISH_STOPWORDS = frozenset(stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def transform_text(text):
    """Normalise a raw message into a space-separated string of stemmed tokens.

    Steps: lowercase the text, tokenize it with spaCy, keep only purely
    alphabetic tokens that are not NLTK English stopwords, and Porter-stem
    each surviving token.

    Parameters
    ----------
    text : str
        Raw message text.

    Returns
    -------
    str
        The stemmed tokens joined by single spaces (empty string if no token
        survives the filters).
    """
    stop_words = _english_stopwords()

    # Only tokenization is needed here, so use make_doc (tokenizer only)
    # instead of running the whole pipeline on every message.
    doc = nlp.make_doc(text.lower())

    # is_alpha drops numbers, symbols and punctuation; the stopword check drops
    # filler words such as 'is' and 'the'.
    stems = [
        ps.stem(token.text)
        for token in doc
        if token.is_alpha and token.text not in stop_words
    ]

    return " ".join(stems)

In [17]:
# Quick sanity check of transform_text on a sentence containing digits and punctuation.
transform_text("This is a sample text, with some numbers 123 and symbols !&(*@#.")

'sampl text number symbol'

In [22]:
# Take the first message by column name rather than by column position, so the
# lookup keeps returning the text even if columns are added or reordered.
value = df['text'].iloc[0]
value

'Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...'

In [23]:
# Run the same cleaning on the real message selected above.
transform_text(value)

'go jurong point crazi avail bugi n great world la e buffet cine got amor wat'

In [24]:
# Clean every message (one transform_text call per row) and store the result
# in a new column next to the original text.
df['transformed_text'] = df['text'].apply(transform_text)

In [25]:
df.head(10)

Unnamed: 0,target,text,transformed_text
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,0,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entri wkli comp win fa cup final tkt may ...
3,0,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah think goe usf live around though
5,1,FreeMsg Hey there darling it's been 3 week's n...,freemsg hey darl week word back like fun still...
6,0,Even my brother is not like to speak with me. ...,even brother like speak treat like aid patent
7,0,As per your request 'Melle Melle (Oru Minnamin...,per request mell mell oru minnaminungint nurun...
8,1,WINNER!! As a valued network customer you have...,winner valu network custom select receivea pri...
9,1,Had your mobile 11 months or more? U R entitle...,mobil month u r entitl updat latest colour mob...


In [27]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
# TF-IDF settings: keep at most 5000 terms, ignore terms that occur in fewer
# than 7 documents or in more than 70% of documents, and drop scikit-learn's
# built-in English stop words.
tfidf = TfidfVectorizer(
    stop_words='english',
    max_df=0.7,
    min_df=7,
    max_features=5000,
)

In [31]:
# Learn the TF-IDF vocabulary and weights from the cleaned text, then convert
# the sparse result to a dense NumPy array.
# NOTE(review): the vectorizer is fitted on every row before train_test_split
# runs later in the notebook, so test-set term statistics leak into the
# features — consider splitting first and fitting on the training portion only.
X = tfidf.fit_transform(df['transformed_text']).toarray()
# Labels as a NumPy array, taken from the same frame and in the same row order as X.
y = df['target'].values

In [32]:
# Peek at the feature matrix (NumPy prints an abbreviated view).
X

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], shape=(5169, 913))

In [30]:
# Peek at the label vector.
y

array([0, 0, 1, ..., 0, 0, 0], shape=(5169,))

## Splitting the data into train and test datasets

In [33]:
from sklearn.model_selection import train_test_split

In [34]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=2,
)

## Model Training