## Import the required packages

In [3]:
import csv
import sklearn
import pandas as pd
import numpy as np
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

In [4]:
# Load the labelled ham/spam messages; the file is decoded with the 'latin' (Latin-1) codec.
df = pd.read_csv('spam.csv', encoding='latin')
# Preview the first rows of the raw frame.
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


## Removing unwanted columns

In [5]:
# Discard the three "Unnamed" columns (blank in the preview above) and give the
# two remaining columns descriptive names.
df = (
    df
    .drop(columns=["Unnamed: 2", "Unnamed: 3", "Unnamed: 4"])
    .rename(columns={"v1": "label", "v2": "text"})
)

In [6]:
df.head()

Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [7]:
# Download NLTK's Punkt tokenizer models (nltk itself is imported in the first cell).
# NOTE(review): no tokenizer call is visible in this notebook - confirm this download is still needed.
nltk.download("punkt")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\divya\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [8]:
# Encode the target as integers (ham -> 0, spam -> 1). Only the label column is
# touched, so a message whose entire text is "ham" or "spam" keeps its text, and
# the explicit cast guarantees an integer column (np.bincount in the tree needs
# integer labels) instead of relying on replace() to infer the dtype.
df = df.assign(label=df['label'].replace({'ham': 0, 'spam': 1}).astype(int))

In [9]:
df.head()

Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


## Removing all the stopwords

In [10]:
# Fetch the NLTK stopword corpus that stopwords.words('english') reads in text_process.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\divya\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [11]:
# Remove punctuation and English stopwords from a message.
import string
from functools import lru_cache


@lru_cache(maxsize=1)
def _english_stopwords():
    """Return NLTK's English stopword list as a frozenset, loaded only once."""
    return frozenset(stopwords.words('english'))


def text_process(text):
    """Strip punctuation and English stopwords from ``text``.

    Parameters
    ----------
    text : str
        Raw message.

    Returns
    -------
    str
        The remaining words joined by single spaces, with their original
        casing. Stopword matching is done on the lower-cased word.
    """
    # Loading the stopword list once (instead of once per word) and using a set
    # for membership turns the per-word check into a constant-time lookup.
    stop_words = _english_stopwords()

    text = text.translate(str.maketrans('', '', string.punctuation))
    kept_words = [word for word in text.split() if word.lower() not in stop_words]

    return " ".join(kept_words)

In [12]:
# Overwrite each raw message with its cleaned form (punctuation and stopwords removed).
df['text'] = df['text'].map(text_process)

In [13]:
df.head()

Unnamed: 0,label,text
0,0,Go jurong point crazy Available bugis n great ...
1,0,Ok lar Joking wif u oni
2,1,Free entry 2 wkly comp win FA Cup final tkts 2...
3,0,U dun say early hor U c already say
4,0,Nah dont think goes usf lives around though


In [14]:
# Pull the cleaned messages and their labels out of the frame as NumPy arrays.
text, label = (df[column].to_numpy() for column in ('text', 'label'))

## Converting words to vectors using the TF-IDF vectorizer

In [15]:
# Convert the cleaned text into TF-IDF feature vectors.
from sklearn.feature_extraction.text import TfidfVectorizer

# NOTE(review): the vectorizer is fitted on every row of df, i.e. before the
# train/test split performed further down, so the vocabulary and IDF weights are
# learned from rows that later end up in the test set.
vectorizer = TfidfVectorizer()
vectors = vectorizer.fit_transform(df['text'])
# Show the dimensions of the resulting matrix.
vectors.shape

(5572, 9376)

In [16]:
# Use the TF-IDF matrix as the model's feature matrix (still sparse at this point).
features = vectors
# Display the matrix summary.
features

<5572x9376 sparse matrix of type '<class 'numpy.float64'>'
	with 47254 stored elements in Compressed Sparse Row format>

## Splitting into training and test set

In [17]:
# Hold out 15% of the rows for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    label,
    test_size=0.15,
    random_state=111,
)

In [18]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((4736, 9376), (836, 9376), (4736,), (836,))

In [19]:
# Convert the sparse training matrix to a dense NumPy array: the hand-written
# DecisionTree slices columns with X[:, feat] and passes them to NumPy routines.
# With the shape shown above (4736 x 9376, float64) this is roughly 355 MB.
X_train = X_train.toarray()

In [20]:
# Convert the sparse test matrix to a dense NumPy array as well.
X_test = X_test.toarray()

## Building the model

In [21]:
#decision tree

import numpy as np
# from Node import Node

class Node:
    """A single node of the decision tree.

    An internal node stores the index of the feature it tests, the threshold it
    compares against, and its two children. A leaf stores only ``value``, the
    class label it predicts.
    """

    def __init__(self, feature=None, threshold=None, left=None, right=None, *, value=None):
        self.feature = feature
        self.threshold = threshold
        self.left = left
        self.right = right
        self.value = value

    def is_leaf(self):
        """Return True when this node carries a prediction rather than a split."""
        return self.value is not None


class DecisionTree:
    """Binary decision-tree classifier grown by maximising information gain.

    Parameters
    ----------
    max_depth : int, default 1
        Depth at which growth stops; the root is at depth 0.
    min_samples_split : int, default 1
        A node holding fewer samples than this becomes a leaf.

    Labels passed to ``fit`` must be non-negative integers because class counts
    are obtained with ``np.bincount``.
    """

    def __init__(self, max_depth=1, min_samples_split=1):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.root = None

    def searchComplete(self, depth):
        """Tell whether the node currently being built must become a leaf.

        Reads ``self.class_labels`` and ``self.n_samples``, which ``buildTree``
        refreshes for the current node right before calling this method.
        Growth stops when the depth limit is reached, the node is pure, or the
        node holds fewer than ``min_samples_split`` samples.
        """
        return (
            depth >= self.max_depth
            or self.class_labels == 1
            or self.n_samples < self.min_samples_split
        )

    def entropy(self, y):
        """Return the Shannon entropy (base 2) of the integer label array ``y``."""
        proportions = np.bincount(y) / len(y)
        # Classes with zero count contribute nothing and would give log2(0).
        present = proportions[proportions > 0]
        return -np.sum(present * np.log2(present))

    def createSplit(self, X, thresh):
        """Split one feature column at ``thresh``.

        Returns the positions with ``X <= thresh`` and the positions with
        ``X > thresh`` as two 1-D index arrays.
        """
        left_idx = np.flatnonzero(X <= thresh)
        right_idx = np.flatnonzero(X > thresh)
        return left_idx, right_idx

    def informationGain(self, X, y, thresh):
        """Return the entropy reduction from splitting column ``X`` at ``thresh``.

        The gain is the parent entropy minus the size-weighted entropy of the
        two children. A split that leaves one side empty has a gain of 0.
        """
        parentLoss = self.entropy(y)
        left_idx, right_idx = self.createSplit(X, thresh)
        n, n_left, n_right = len(y), len(left_idx), len(right_idx)

        if n_left == 0 or n_right == 0:
            return 0
        childLoss = (
            (n_left / n) * self.entropy(y[left_idx])
            + (n_right / n) * self.entropy(y[right_idx])
        )
        return parentLoss - childLoss

    def bestSplit(self, X, y, features):
        """Search ``features`` for the (feature, threshold) pair with the highest gain.

        Every unique value of every listed feature column is tried as a
        threshold. Ties keep the first candidate encountered, so the result
        depends on the order of ``features``. Returns ``(None, None)`` when
        there is no candidate at all.
        """
        split = {'score': -1, 'feat': None, 'thresh': None}

        for feat in features:
            X_feat = X[:, feat]
            for thresh in np.unique(X_feat):
                score = self.informationGain(X_feat, y, thresh)
                if score > split['score']:
                    split['score'] = score
                    split['feat'] = feat
                    split['thresh'] = thresh
        return split['feat'], split['thresh']

    def _make_leaf(self, y):
        """Build a leaf predicting the most frequent label in ``y``."""
        return Node(value=np.argmax(np.bincount(y)))

    def buildTree(self, X, y, depth=0):
        """Recursively grow the subtree for samples ``X`` / labels ``y``.

        Returns the root ``Node`` of that subtree.
        """
        self.n_samples, self.n_features = X.shape
        self.class_labels = len(np.unique(y))

        if self.searchComplete(depth):
            return self._make_leaf(y)

        # Visit the features in a random order (a permutation of all of them).
        rnd_feats = np.random.choice(self.n_features, self.n_features, replace=False)
        best_feat, best_thresh = self.bestSplit(X, y, rnd_feats)
        if best_feat is None:
            return self._make_leaf(y)

        left_idx, right_idx = self.createSplit(X[:, best_feat], best_thresh)
        # When no threshold separates the samples (e.g. identical rows with
        # different labels) the "best" split leaves one side empty. Recursing
        # into an empty subset would fail in np.argmax(np.bincount(y)), so the
        # node is turned into a leaf instead.
        if len(left_idx) == 0 or len(right_idx) == 0:
            return self._make_leaf(y)

        left_child = self.buildTree(X[left_idx, :], y[left_idx], depth + 1)
        right_child = self.buildTree(X[right_idx, :], y[right_idx], depth + 1)
        return Node(best_feat, best_thresh, left_child, right_child)

    def _traverse_tree(self, x, node):
        """Follow sample ``x`` from ``node`` down to a leaf and return its label."""
        if node.is_leaf():
            return node.value

        if x[node.feature] <= node.threshold:
            return self._traverse_tree(x, node.left)
        return self._traverse_tree(x, node.right)

    def fit(self, X, y):
        """Grow the tree from a dense 2-D array ``X`` and integer labels ``y``."""
        self.root = self.buildTree(X, y)

    def predict(self, X):
        """Return an array with the predicted label for each row of ``X``."""
        predictions = [self._traverse_tree(x, self.root) for x in X]
        return np.array(predictions)



In [22]:
# Instantiate the custom tree with its defaults (max_depth=1, min_samples_split=1),
# which allows at most one split below the root.
clf = DecisionTree()

In [23]:
# Grow the tree on the dense training data. bestSplit tries every unique value of
# every feature column as a threshold, so this cell can take a while.
clf.fit(X_train,y_train)

In [24]:
dt_pred = clf.predict(X_test)

In [25]:
def print_metrics(y_test, dt_pred):
    """Print accuracy, precision, recall and F1 for predictions ``dt_pred`` against ``y_test``."""
    report = (
        ("Accuracy Score is : ", accuracy_score),
        ('Precision score is : ', precision_score),
        ('Recall score is : ', recall_score),
        ('F1 score is : ', f1_score),
    )
    for heading, metric in report:
        print(heading, format(metric(y_test, dt_pred)))

Accuracy Score =  0.8899521531100478
Precision score =  0.620253164556962
Recall score =  0.44144144144144143


In [None]:
print_metrics(y_test, dt_pred)

## Predicting on individual messages

In [25]:
def Predict(message):
    """Classify one message and print the verdict.

    The message is vectorized with the fitted ``vectorizer``, densified, and
    passed to ``clf``; "Ham" is printed for label 0 and "Spam" otherwise.
    """
    dense_row = vectorizer.transform([message]).toarray()
    predictedValue = clf.predict(dense_row)
    print("Ham" if predictedValue == 0 else "Spam")

In [26]:
# Print each cleaned message from rows 1-999 of df, followed by the model's verdict.
for i in range(1, 1000):
    message = df["text"][i]
    print(message, ":", end=" ")
    Predict(message)

Ok lar Joking wif u oni : Ham
Free entry 2 wkly comp win FA Cup final tkts 21st May 2005 Text FA 87121 receive entry questionstd txt rateTCs apply 08452810075over18s : Ham
U dun say early hor U c already say : Ham
Nah dont think goes usf lives around though : Ham
FreeMsg Hey darling 3 weeks word back Id like fun still Tb ok XxX std chgs send å£150 rcv : Ham
Even brother like speak treat like aids patent : Ham
per request Melle Melle Oru Minnaminunginte Nurungu Vettam set callertune Callers Press 9 copy friends Callertune : Ham
WINNER valued network customer selected receivea å£900 prize reward claim call 09061701461 Claim code KL341 Valid 12 hours : Spam
mobile 11 months U R entitled Update latest colour mobiles camera Free Call Mobile Update Co FREE 08002986030 : Spam
Im gonna home soon dont want talk stuff anymore tonight k Ive cried enough today : Ham
SIX chances win CASH 100 20000 pounds txt CSH11 send 87575 Cost 150pday 6days 16 TsandCs apply Reply HL 4 info : Ham
URGENT 1 week FR

u hiding stranger : Ham
interested like : Ham
sister cleared two round birla soft yesterday : Ham
Gudnitetcpractice going : Ham
Dis yijue jus saw ur mail case huiming havent sent u num Dis num : Ham
One small prestige problem : Ham
Fancy shag doInterested sextextukcom txt XXUK SUZY 69876 Txts cost 150 per msg TnCs website X : Ham
checking Really miss seeing Jeremiah great month : Ham
Nah cant help Ive never iphone : Ham
youre car hour half Im going apeshit : Ham
TODAY Sorry day ever angry ever misbehaved hurt plz plz SLAP URSELF Bcoz ur fault Im basically GOOD : Ham
Yo guys ever figure much need alcohol Jay trying figure much safely spend weed : Ham
ltgt ISH MINUTES 5 MINUTES AGO WTF : Ham
Thank callingForgot say Happy Onam SirjiI fine remembered met insurance personMeet Qatar Insha AllahRakhesh ex Tata AIG joined TISSCOTayseer : Ham
Congratulations ur awarded 500 CD vouchers 125gift guaranteed Free entry 2 100 wkly draw txt MUSIC 87066 TnCs wwwLdewcom1win150ppmx3age16 : Ham
Ur cashbal

08714712388 10am7pm Cost 10p : Ham
Save stress person dorm account send account details money sent : Ham
also knows lunch menu da know : Ham
stuff sell ill tell : Ham
449071512431 URGENT 2nd attempt contact UU å£1250 CALL 09071512433 b4 050703 TCsBCM4235WC1N3XX callcost 150ppm mobilesvary maxå£7 50 : Spam
Book lesson msg call work sth Im going get specs membership PX3748 : Spam
guaranteed å£1000 cash å£2000 prize claim yr prize call customer service representative 08714712394 10am7pm : Spam
Macha dont feel upseti assume mindsetbelieve one evening wonderful plans usLET LIFE BEGIN AGAINcall anytime : Ham
Oh Send address : Ham
Sfine Anytime best : Ham
wondar full flim : Ham
Ya even cookies jelly : Ham
world running stillmaybe feeling sameso itor admiti madthen correctionor let call lifeand keep running worldmay u r also runninglets run : Spam
Got looks scrumptious daddy wants eat night long : Ham
cos lar im ba dao ok 1 pm lor u never ask go ah said u would ask fri said u ask today : Ham
A

Ham
