In [None]:
import pandas as pd
df=pd.read_csv('spam.csv',encoding='latin-1',on_bad_lines='skip',sep='\t',names=['Type','Message'])

In [14]:
df

Unnamed: 0,Type,Message
0,Type,Message
1,ham,"Go until jurong point, crazy.. Available only ..."
2,ham,Ok lar... Joking wif u oni...
3,spam,Free entry in 2 a wkly comp to win FA Cup fina...
4,ham,U dun say so early hor... U c already then say...
...,...,...
652,ham,Anything lor. Juz both of us lor.
653,ham,Get me out of this dump heap. My mom decided t...
654,ham,Ok lor... Sony ericsson salesman... I ask shuh...
655,ham,Ard 6 like dat lor.


### Now gradually we will extract the data in the message column of the dataframe and store them in the corpus variable.

### At the same time we will apply stopwords and lemmatization on the sentences

In [16]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
import re
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
lemmatizer=WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\KIIT0001\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\KIIT0001\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\KIIT0001\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [18]:
corpus = []

for msg in df["Message"]:
    msg = msg.lower()
    tokens = word_tokenize(msg)
    cleaned_tokens = [
        lemmatizer.lemmatize(token)
        for token in tokens
        if   token.isalpha() and token not in set(stopwords.words('english'))
    ]
    
    # join tokens back into sentence
    corpus.append(" ".join(cleaned_tokens))


In [19]:
corpus

['message',
 'go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 'free entry wkly comp win fa cup final tkts may text fa receive entry question std txt rate c apply',
 'u dun say early hor u c already say',
 'nah think go usf life around though',
 'freemsg hey darling week word back like fun still tb ok xxx std chgs send rcv',
 'even brother like speak treat like aid patent',
 'per request melle oru minnaminunginte nurungu vettam set callertune caller press copy friend callertune',
 'winner valued network customer selected receivea prize reward claim call claim code valid hour',
 'mobile month u r entitled update latest colour mobile camera free call mobile update co free',
 'gon na home soon want talk stuff anymore tonight k cried enough today',
 'six chance win cash pound txt send cost tsandcs apply reply hl info',
 'urgent week free membership prize jackpot txt word claim c lccltd pobox',
 'finally match heading towards 

### Apply the BOW model

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=500,binary=True)## we are taking only 500 features from the corpus (ie it contains the words) the top 500 words with maximum frequency
## binary =true means that all the value will be 0 or 1 if in a sentence a word is present more than one then also the value  will be 1
x=cv.fit_transform(corpus).toarray()

In [27]:
x,x.size

(array([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]], dtype=int64),
 328500)

### N-Grams

#### The N-Grams are the combination of N words in a sentence

In [29]:
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=5000,binary=True,ngram_range=(1,3))
x=cv.fit_transform(corpus).toarray()

In [31]:
cv.vocabulary_

{'message': 541,
 'go': 344,
 'point': 1698,
 'crazy': 193,
 'available': 43,
 'great': 365,
 'world': 4758,
 'la': 462,
 'got': 363,
 'wat': 4352,
 'point crazy': 1701,
 'world la': 4761,
 'point crazy available': 1702,
 'world la buffet': 4762,
 'ok': 1224,
 'lar': 469,
 'joking': 441,
 'wif': 4569,
 'oni': 1332,
 'ok lar': 1233,
 'wif oni': 4572,
 'ok lar joking': 1234,
 'free': 313,
 'entry': 269,
 'wkly': 4676,
 'comp': 174,
 'win': 4598,
 'final': 298,
 'tkts': 3678,
 'may': 534,
 'text': 3384,
 'receive': 2005,
 'question': 1879,
 'std': 3095,
 'txt': 3923,
 'rate': 1921,
 'apply': 25,
 'free entry': 316,
 'entry wkly': 270,
 'wkly comp': 4677,
 'win fa': 4603,
 'tkts may': 3679,
 'receive entry': 2006,
 'question std': 1882,
 'std txt': 3098,
 'txt rate': 3939,
 'rate apply': 1922,
 'free entry wkly': 317,
 'entry wkly comp': 271,
 'wkly comp win': 4679,
 'win fa cup': 4604,
 'tkts may text': 3680,
 'receive entry question': 2007,
 'question std txt': 1883,
 'std txt rate': 309