In [30]:
import os
import pandas as pd
import nltk
import re
import string
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.compose import make_column_transformer
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix
from sklearn.metrics import precision_recall_fscore_support as score
from sklearn import metrics
import numpy as np
from collections import Counter
import time
from statistics import mean
from xgboost import XGBClassifier

In [31]:
os.chdir("Spam Classification")
os.listdir(os.curdir)
# Tells you the directory where spam is.

FileNotFoundError: [Errno 2] No such file or directory: 'Spam Classification'

In [32]:
# Load data, drop na's, show columns, print data.
data = pd.read_csv("spam.csv", encoding = "latin-1")
data = data.dropna(how = "any", axis = 1)
data.columns = ['label','body_text']
data.head()

Unnamed: 0,label,body_text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [34]:
# Print number of rows and columns.
print(f"Input data has {len(data)} rows and {len(data.columns)} columns.")

Input data has 5572 rows and 2 columns.


In [33]:
# Prints those classified as ham or spam
print(f"Out of {len(data)} rows, {len(data[data.label == 'spam'])} are spam and {len(data[data.label == 'ham'])} are ham.")

Out of 5572 rows, 747 are spam and 4825 are ham.


In [35]:
# We are counting the number of null values in both columns.
print(f"Number of null in label: {data.label.isnull().sum()}")
print(f"Number of null in text: {data.body_text.isnull().sum()}")

Number of null in label: 0
Number of null in text: 0


In [36]:
# Creates a new column with no spaces in text called x.
data['body_len'] = data.body_text.apply(lambda x: len(x) - x.count(" "))

In [37]:
# Creates a function that counts the percentage of punctuation marks in text.
def count_punct(text):
    count = sum([1 for char in text if char in string.punctuation])
    return round(count/(len(text) - text.count(" ")), 3) * 100

In [38]:
# Creates a new column with the percentage calculated before.
data['punct%'] = data.body_text.apply(lambda x: count_punct(x))
data.head()

Unnamed: 0,label,body_text,body_len,punct%
0,ham,"Go until jurong point, crazy.. Available only ...",92,9.8
1,ham,Ok lar... Joking wif u oni...,24,25.0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,128,4.7
3,ham,U dun say so early hor... U c already then say...,39,15.4
4,ham,"Nah I don't think he goes to usf, he lives aro...",49,4.1


In [39]:
# Create a matrix with the metrics and transpose it.
data[['body_len', 'punct%']].describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
body_len,5572.0,65.512024,48.629795,2.0,29.0,50.0,98.0,740.0
punct%,5572.0,7.202656,6.701062,0.0,3.3,5.6,9.2,100.0


In [40]:
# 740 max length of the text and we print the first 740 characters.
list(data.loc[data.body_len == 740, 'body_text'])

["For me the love should start with attraction.i should feel that I need her every time around me.she should be the first thing which comes in my thoughts.I would start the day and end it with her.she should be there every time I dream.love will be then when my every breath has her name.my life should happen around her.my life will be named to her.I would cry for her.will give all my happiness and take all her sorrows.I will be ready to fight with anyone for her.I will be in love when I will be doing the craziest things for her.love will be when I don't have to proove anyone that my girl is the most beautiful lady on the whole planet.I will always be singing praises for her.love will be when I start up making chicken curry and end up makiing sambar.life will be the most beautiful then.will get every morning and thank god for the day because she is with me.I would like to say a lot..will tell later.."]

In [41]:
# Create two instances for word stems and word lemmers. 
ps = nltk.PorterStemmer()
wn = nltk.WordNetLemmatizer()

In [43]:
# Join text in lower case and and remove punctuation. 
def clean_text(text):
    text = "".join([word.lower() for word in text if word not in string.punctuation])
    tokens = re.findall('\S+', text) # Everything that has no spaces.
    # text = [ps.stem(word) for word in tokens if word not in stopwords.words('english')]
    text = [wn.lemmatize(word) for word in tokens if word not in stopwords.words('english')] #lemmatize the tokens
    return text



In [44]:
# We are creating a new clean_text column and showing the first 10 values.
data['cleaned_text'] = data.body_text.apply(lambda x: clean_text(x))
data[['body_text', 'cleaned_text']].head(10)

Unnamed: 0,body_text,cleaned_text
0,"Go until jurong point, crazy.. Available only ...","[go, jurong, point, crazy, available, bugis, n..."
1,Ok lar... Joking wif u oni...,"[ok, lar, joking, wif, u, oni]"
2,Free entry in 2 a wkly comp to win FA Cup fina...,"[free, entry, 2, wkly, comp, win, fa, cup, fin..."
3,U dun say so early hor... U c already then say...,"[u, dun, say, early, hor, u, c, already, say]"
4,"Nah I don't think he goes to usf, he lives aro...","[nah, dont, think, go, usf, life, around, though]"
5,FreeMsg Hey there darling it's been 3 week's n...,"[freemsg, hey, darling, 3, week, word, back, i..."
6,Even my brother is not like to speak with me. ...,"[even, brother, like, speak, treat, like, aid,..."
7,As per your request 'Melle Melle (Oru Minnamin...,"[per, request, melle, melle, oru, minnaminungi..."
8,WINNER!! As a valued network customer you have...,"[winner, valued, network, customer, selected, ..."
9,Had your mobile 11 months or more? U R entitle...,"[mobile, 11, month, u, r, entitled, update, la..."


In [45]:
ham_words = list(data.loc[data.label == 'ham', 'cleaned_text'])

# Extract all words from ham messages
ham_words = list(np.concatenate(ham_words).flat)

# Count the frequency of each word
ham_words = Counter(ham_words)

# Create a dataframe to show the 50 most common words in ham messages
pd.DataFrame(ham_words.most_common(50), columns = ['word', 'frequency'])

Unnamed: 0,word,frequency
0,u,1027
1,im,449
2,get,314
3,2,305
4,ltgt,276
5,go,273
6,ok,272
7,dont,257
8,come,242
9,know,241


In [46]:
spam_words = list(data.loc[data.label == 'spam', 'cleaned_text'])

# Extract all words from spam messages
spam_words = list(np.concatenate(spam_words).flat)

# Count the frequency of each word
spam_words = Counter(spam_words)

# Create a dataframe to show the 50 most common words in spam messages
pd.DataFrame(spam_words.most_common(50), columns = ['word', 'frequency'])

Unnamed: 0,word,frequency
0,call,359
1,free,216
2,2,173
3,u,155
4,txt,150
5,ur,144
6,text,137
7,mobile,135
8,4,119
9,claim,115


In [47]:
extra_stopwords = ['u', 'im', '2', 'ur', 'ill', '4', 'lor', 'r', 'n', 'da', 'oh']

# Remove extra stopwords from the 'cleaned_text' column in the data
data['cleaned_text'] = data['cleaned_text'].apply(lambda x: [word for word in x if word not in extra_stopwords])

# Extract all words from ham messages after removing extra stopwords
ham_words = list(data.loc[data.label == 'ham', 'cleaned_text'])
ham_words = list(np.concatenate(ham_words).flat)
ham_words = Counter(ham_words)

# Create a dataframe to show the 30 most common words in ham messages after removing extra stopwords
ham_words = pd.DataFrame(ham_words.most_common(30), columns = ['word', 'frequency'])

# Extract all words from spam messages after removing extra stopwords
spam_words = list(data.loc[data.label == 'spam', 'cleaned_text'])
spam_words = list(np.concatenate(spam_words).flat)
spam_words = Counter(spam_words)

# Create a dataframe to show the 30 most common words in spam messages after removing extra stopwords
spam_words = pd.DataFrame(spam_words.most_common(30), columns = ['word', 'frequency'])



In [48]:
data.head()

Unnamed: 0,label,body_text,body_len,punct%,cleaned_text
0,ham,"Go until jurong point, crazy.. Available only ...",92,9.8,"[go, jurong, point, crazy, available, bugis, g..."
1,ham,Ok lar... Joking wif u oni...,24,25.0,"[ok, lar, joking, wif, oni]"
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,128,4.7,"[free, entry, wkly, comp, win, fa, cup, final,..."
3,ham,U dun say so early hor... U c already then say...,39,15.4,"[dun, say, early, hor, c, already, say]"
4,ham,"Nah I don't think he goes to usf, he lives aro...",49,4.1,"[nah, dont, think, go, usf, life, around, though]"


In [49]:
# Convert the labels into numerical format
FactorResult = pd.factorize(data["label"])
data["label"] = FactorResult[0]
data.head()

Unnamed: 0,label,body_text,body_len,punct%,cleaned_text
0,0,"Go until jurong point, crazy.. Available only ...",92,9.8,"[go, jurong, point, crazy, available, bugis, g..."
1,0,Ok lar... Joking wif u oni...,24,25.0,"[ok, lar, joking, wif, oni]"
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,128,4.7,"[free, entry, wkly, comp, win, fa, cup, final,..."
3,0,U dun say so early hor... U c already then say...,39,15.4,"[dun, say, early, hor, c, already, say]"
4,0,"Nah I don't think he goes to usf, he lives aro...",49,4.1,"[nah, dont, think, go, usf, life, around, though]"


In [50]:
# Split the data into training and testing sets
X_train, X_test, Y_train, Y_test = train_test_split(data[['body_text', 'body_len', 'punct%']], data.label, random_state = 42, test_size = 0.2)

# Print the shape of each set
print(f"X_train shape: {X_train.shape}")
print(f"Y_train shape: {Y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"Y_test shape: {Y_test.shape}")

X_train shape: (4457, 3)
Y_train shape: (4457,)
X_test shape: (1115, 3)
Y_test shape: (1115,)


In [51]:
# Create a TF-IDF vectorizer object
tfidf_vect = TfidfVectorizer(analyzer = clean_text)

# Fit the vectorizer to the training data
tfidf_vect_fit = tfidf_vect.fit(X_train['body_text'])

# Transform the training and testing data into TF-IDF features
tfidf_train = tfidf_vect.transform(X_train['body_text'])
tfidf_test = tfidf_vect.transform(X_test['body_text'])

In [52]:
# Concatenate the 'body_len' and 'punct%' columns with the transformed features
X_train = pd.concat([X_train[['body_len', 'punct%']].reset_index(drop = True), pd.DataFrame(tfidf_train.toarray())], axis = 1)
X_test = pd.concat([X_test[['body_len', 'punct%']].reset_index(drop = True), pd.DataFrame(tfidf_test.toarray())], axis = 1)

# Print the shape of the training and testing datasets
print(f"X_train shape: {X_train.shape}")
print(f"Y_train shape: {Y_train.shape}")
print(f"X_test shape: {X_test.shape}")
print(f"Y_test shape: {Y_test.shape}")

X_train shape: (4457, 7865)
Y_train shape: (4457,)
X_test shape: (1115, 7865)
Y_test shape: (1115,)


In [53]:
clf = XGBClassifier()

clf.fit(X_train, Y_train)

In [54]:
# We finally predict the values.
predict_train = clf.predict(X_train)

print(
    f"Accuracy of Train dataset: {metrics.accuracy_score(Y_train, predict_train):0.3f}"
)

Accuracy of Train dataset: 0.993
