# Import Libraries

In [24]:
import pandas as pd 
import re 
import string
import nltk
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

# Reading Data

In [3]:
# Load the SMS corpus from disk and preview the first five rows.
data = pd.read_csv(filepath_or_buffer='spam_sms.csv')
data.head(n=5)

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [4]:
# Replace the two raw column headers with descriptive names (same order).
data = data.set_axis(['label', 'text'], axis=1)

In [5]:
# English stop-word list used by clean_text to drop tokens.
# NLTK raises LookupError when the corpus has not been installed yet, so
# fetch it on first use; this keeps Restart & Run All working on a machine
# that has never downloaded the NLTK data.
try:
    stopwords = nltk.corpus.stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stopwords = nltk.corpus.stopwords.words('english')

# Porter stemmer shared by every clean_text call.
ps = nltk.PorterStemmer()

# Clean Data

In [9]:
def clean_text(text):
    """Normalise one raw message into a string of stemmed tokens.

    The message is stripped of every character found in
    ``string.punctuation`` and lower-cased, split on runs of non-word
    characters, filtered against the module-level ``stopwords`` collection,
    and each surviving token is stemmed with the module-level stemmer ``ps``.

    Args:
        text: The raw message. It is iterated character by character.

    Returns:
        The stemmed tokens joined by single spaces (an empty string when no
        token survives).
    """
    # Iterating a str yields characters, so punctuation is removed per character.
    no_punct = "".join(ch.lower() for ch in text if ch not in string.punctuation)
    # Raw string: '\W' inside a plain literal is an invalid escape sequence.
    tokens = re.split(r'\W+', no_punct)
    # re.split yields '' when the text starts or ends with a separator; skip
    # those so the result carries no leading/trailing or doubled spaces.
    return " ".join(ps.stem(tok) for tok in tokens if tok and tok not in stopwords)

In [10]:
# Run every message through clean_text and store the result next to the raw text.
data['cleaned_text'] = data['text'].apply(clean_text)
data.head()

Unnamed: 0,label,text,cleaned_text
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,ham,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think goe usf live around though


# Vectorisation

In [11]:
# Learn the vocabulary from the cleaned messages and build the
# document-term count matrix in one pass.
count_vect = CountVectorizer()
X_counts = count_vect.fit_transform(raw_documents=data['cleaned_text'])

# (number of messages, vocabulary size)
print(X_counts.shape)

(5572, 8026)


In [13]:
# Show the vocabulary learned by count_vect; the printed array is abbreviated
# in the middle because it is long.
print(count_vect.get_feature_names_out())

['008704050406' '0089mi' '0121' ... 'ûïharri' 'ûò' 'ûówell']


In [14]:
# Tabulate the document-term counts for inspection, one row per message and
# default integer column labels.
# Built through pandas' sparse accessor instead of X_counts.toarray(): a dense
# copy allocates one cell per (message, term) pair (about 45 million for the
# 5572 x 8026 matrix of the recorded run) although almost every entry is zero.
X_counts_df = pd.DataFrame.sparse.from_spmatrix(X_counts)
X_counts_df

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,8016,8017,8018,8019,8020,8021,8022,8023,8024,8025
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5568,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5569,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [15]:
# Label each count column with the vocabulary term it represents.
X_counts_df = X_counts_df.set_axis(count_vect.get_feature_names_out(), axis=1)
X_counts_df

Unnamed: 0,008704050406,0089mi,0121,01223585236,01223585334,0125698789,02,020603,0207,02070836089,...,ìï,ìïll,ûthank,ûªm,ûªt,ûªve,ûï,ûïharri,ûò,ûówell
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5568,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5569,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# N-gram

In [16]:
# Same counting approach, but ngram_range=(2, 2) restricts the features to
# pairs of adjacent tokens (bigrams) only.
ngram_vect = CountVectorizer(ngram_range=(2, 2))
X_counts_ng = ngram_vect.fit_transform(raw_documents=data['cleaned_text'])

# (number of messages, number of distinct bigrams)
print(X_counts_ng.shape)

(5572, 31142)


In [20]:
# Peek at a 100-entry window (positions 1500-1599) of the bigram vocabulary.
print(ngram_vect.get_feature_names_out()[1500:1600])

['age product' 'age rang' 'age start' 'age verifi' 'age yeah'
 'age16 150perweeksub' 'age16 150ppermesssubscript' 'age16 2stop'
 'age16 stoptxtstopå' 'age23 blond' 'agenc rent' 'agent dont' 'agent mob'
 'agesr ur' 'ago alreadi' 'ago guy' 'ago liao' 'ago much' 'ago shower'
 'ago wat' 'ago wtf' 'agocusoon honi' 'agre price' 'agre stop'
 'agreen day' 'ah coz' 'ah den' 'ah dun' 'ah fail' 'ah keep' 'ah meet'
 'ah muz' 'ah oki' 'ah opp' 'ah poop' 'ah poor' 'ah said' 'ah see'
 'ah thk' 'ah tmr' 'ah waitin' 'ah wat' 'ah well' 'aha da' 'ahead amp'
 'ahead month' 'ahead smoke' 'ahead watt' 'ahhh work' 'ahhhhjust woken'
 'ahmad ador' 'ahmad al' 'ahmad cant' 'ahmad kiss' 'ahmad saeed'
 'ahnow wkg' 'ahold anybodi' 'ahsen got' 'ahth ubi' 'ahwhat machiani'
 'aid patent' 'aid usml' 'aig join' 'aight call' 'aight close'
 'aight fuck' 'aight ill' 'aight im' 'aight ive' 'aight latest'
 'aight lemm' 'aight let' 'aight pick' 'aight plan' 'aight rush'
 'aight see' 'aight sorri' 'aight sound' 'aight still' '

In [23]:
# Tabulate the bigram counts with one labelled column per bigram.
# Built through pandas' sparse accessor instead of X_counts_ng.toarray(): a
# dense copy allocates one cell per (message, bigram) pair (about 174 million
# for the 5572 x 31142 matrix of the recorded run), which can exhaust kernel
# memory although almost every entry is zero.
X_counts_ng_df = pd.DataFrame.sparse.from_spmatrix(
    X_counts_ng,
    columns=ngram_vect.get_feature_names_out(),
)
X_counts_ng_df

Unnamed: 0,008704050406 sp,0089mi last,0121 2025050,01223585236 xx,01223585334 cum,0125698789 ring,02 user,020603 2nd,0207 153,02072069400 bx,...,ûò address,ûò entertain,ûò even,ûò favour,ûò get,ûò hope,ûò inde,ûò limp,ûò sound,ûówell done
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5568,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5569,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5570,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# TF-IDF

In [25]:
# Fit a TF-IDF vectorizer on the cleaned messages and build the weighted
# document-term matrix in one pass.
tfidf_vect = TfidfVectorizer()
X_tfidf = tfidf_vect.fit_transform(raw_documents=data['cleaned_text'])

# (number of messages, vocabulary size)
print(X_tfidf.shape)

(5572, 8026)


In [27]:
# Peek at a 100-entry window (positions 1000-1099) of the TF-IDF vocabulary.
print(tfidf_vect.get_feature_names_out()[1000:1100])

['ador' 'adp' 'adress' 'adrian' 'adsens' 'adult' 'advanc' 'adventur'
 'advic' 'advis' 'advisor' 'aeronaut' 'aeroplan' 'afew' 'affair' 'affect'
 'affection' 'affectionsamp' 'affidavit' 'afford' 'afghanistan' 'afraid'
 'africa' 'african' 'aft' 'afternon' 'afternoon' 'afterward' 'aftr' 'ag'
 'againcal' 'againlov' 'agalla' 'age' 'age16' 'age16150ppermesssubscript'
 'age23' 'agenc' 'agent' 'agesr' 'agidhan' 'ago' 'agocusoon' 'agre'
 'agreen' 'ah' 'aha' 'ahead' 'ahge' 'ahhh' 'ahhhhjust' 'ahmad' 'ahnow'
 'ahold' 'ahsen' 'ahth' 'ahwhat' 'aid' 'aig' 'aight' 'aint' 'air' 'air1'
 'airport' 'airtel' 'aiya' 'aiyah' 'aiyar' 'aiyo' 'ajith' 'ak' 'aka'
 'akonlon' 'al' 'alaikkumprid' 'alaipayuth' 'albi' 'album' 'albumquit'
 'alcohol' 'aldrin' 'alert' 'alertfrom' 'alett' 'alex' 'alfi' 'algarv'
 'algebra' 'algorithm' 'ali' 'alian' 'alibi' 'aliv' 'alivebett' 'all'
 'allah' 'allahmeet' 'allahrakhesh' 'allalo' 'allday']


In [28]:
# Tabulate the TF-IDF weights with one labelled column per vocabulary term.
# Built through pandas' sparse accessor instead of X_tfidf.toarray(): a dense
# copy allocates one float per (message, term) pair (about 45 million for the
# 5572 x 8026 matrix of the recorded run) although almost every entry is zero.
X_tfidf_df = pd.DataFrame.sparse.from_spmatrix(
    X_tfidf,
    columns=tfidf_vect.get_feature_names_out(),
)
X_tfidf_df

Unnamed: 0,008704050406,0089mi,0121,01223585236,01223585334,0125698789,02,020603,0207,02070836089,...,ìï,ìïll,ûthank,ûªm,ûªt,ûªve,ûï,ûïharri,ûò,ûówell
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5568,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5570,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
