In [1]:
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer
import re
import spacy
from gensim.models import Word2Vec
nlp = spacy.load('en_core_web_sm')

df= pd.read_csv(r'/kaggle/input/spam-email-dataset/emails.csv', )
y= df['spam'].values
X= df['text']


def preprocess(text):
    text = re.sub(r'[^\w\s]', '', text.lower())
    doc= nlp(text)
    return [token.lemma_ for token in doc if not token.is_stop and not token.is_punct]

# X= X.apply(preprocess).tolist()

In [2]:
X_processed = X.apply(preprocess)

In [3]:
len(X_processed.tolist()[2])

55

In [4]:
X_processed.head()

0    [subject, naturally, irresistible, corporate, ...
1    [subject, stock, trading, gunslinger,  , fanny...
2    [subject, unbelievable, new, home, easy,  , m,...
3    [subject, 4, color, printing, special,  , requ...
4    [subject, money,  , software, cd,   , software...
Name: text, dtype: object

In [5]:
word2vec_model = Word2Vec(sentences=X_processed.tolist(), vector_size=100, window=5, min_count=1, workers=4)

In [6]:
def get_email_embedding(words):
    if not words:  # If the list is empty
        return np.zeros(word2vec_model.vector_size)  # Return a zero vector
    # Get the embeddings for each word in the email and calculate the mean
    return np.mean([word2vec_model.wv[word] for word in words if word in word2vec_model.wv], axis=0)

# Apply the embedding function to get embeddings for each email
X_embeddings = np.array(X_processed.apply(get_email_embedding).tolist())


In [7]:
print(X_embeddings.shape)
# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_embeddings, y, test_size=0.2, random_state=42)

(5728, 100)


In [8]:
class DecisionStump:
    def __init__(self):
        # The feature index to split on
        self.feature_idx = None
        # The threshold value for splitting
        self.threshold = None
        # Whether to predict +1 or -1 for values above threshold
        self.polarity = None
        
    def fit(self, X, y, sample_weight=None):
        if sample_weight is None:
            sample_weight = np.ones(len(y))
            
        n_samples, n_features = X.shape
        min_error = float('inf')
        
        # For each feature
        for feature in range(n_features):
            # Get all values for this feature
            feature_values = X[:, feature]
            thresholds = np.unique(feature_values)
            
            # Try each value as a threshold
            for threshold in thresholds:
                # Predict 1 for values above threshold, -1 for below
                pred1 = np.ones(n_samples)
                pred1[feature_values < threshold] = -1
                error1 = np.sum(sample_weight * (pred1 != y)) / np.sum(sample_weight)
                
                # Predict -1 for values above threshold, 1 for below
                pred2 = np.ones(n_samples)
                pred2[feature_values >= threshold] = -1
                error2 = np.sum(sample_weight * (pred2 != y)) / np.sum(sample_weight)
                
                # Keep track of the best split
                if error1 < min_error:
                    min_error = error1
                    self.feature_idx = feature
                    self.threshold = threshold
                    self.polarity = 1
                    
                if error2 < min_error:
                    min_error = error2
                    self.feature_idx = feature
                    self.threshold = threshold
                    self.polarity = -1
        
        return self
    
    def predict(self, X):
        predictions = np.ones(X.shape[0])
        if self.polarity == 1:
            predictions[X[:, self.feature_idx] < self.threshold] = -1
        else:
            predictions[X[:, self.feature_idx] >= self.threshold] = -1
        return predictions


In [10]:
class Adaboost():
    """Boosting method that uses a number of weak classifiers in 
    ensemble to make a strong classifier. This implementation uses decision
    stumps, which is a one level Decision Tree. 

    Parameters:
    -----------
    n_clf: int
        The number of weak classifiers that will be used. 
    """
    def __init__(self,  weak_learner,EXPLICIT_SAMPLING, n_clf=50, eta=0.5):
        self.n_clf = n_clf
        self.weak_learner= weak_learner
#         self.params= params if params is not None else {}
        self.EXPLICIT_SAMPLING= EXPLICIT_SAMPLING
        self.eta = eta
        self.clfs = []  # To store weak classifiers
        self.alphas = [] 

    def fit(self, X, y):
        m,n= X.shape
        ids= np.arange(m)
        w= np.ones(m)/m
        if np.isfinite(X).all() == False:
            raise ValueError("Input contains NaN or Infinite values.")
        if m <= 1 or n >= m:
            raise ValueError("More samples than features required.")
        for _ in range(self.n_clf):

            clf= self.weak_learner()
            if self.EXPLICIT_SAMPLING:
                sampled_ids = np.random.choice(ids, size=m, p=w)
                X_iter = X[sampled_ids]
                y_iter = y[sampled_ids]
                clf.fit(X_iter, y_iter)
                pred= clf.predict(X_iter)
             
            
            else:
                clf.fit(X, y, sample_weight= w)
                pred= clf.predict(X)
            
       
            # err= np.sum(w[y != pred])
            # err= w[pred != y_train].sum() / w.sum()
            err= np.sum(w * (pred != y)) / np.sum(w)
            print(err)
            alpha= self.eta* np.log((1-err)/(err+ 1e-10) )
            # if err!=0 else 0
            # alpha = max(0, min(1, alpha))  # Ensures alpha stays between 0 and 1

        
            w= w*np.exp(-alpha*y*pred)
            # w= w/np.sum(w)
            # w[pred!=y_train]*= np.exp(alpha)
            # w[pred==y_train]*= np.exp(-alpha)
            w/= np.sum(w)
            self.clfs.append(clf)
            self.alphas.append(alpha)
   
            # plot_decision_boundary(clf, X, y)
            
    def predict(self, X):
        final_pred= np.zeros(X.shape[0])
        for alpha, learner in zip(self.alphas, self.clfs):
            final_pred += alpha * learner.predict(X)
        # print('inpred',np.sign(final_pred))
        # print('truth',y_train)
        return np.sign(final_pred)

In [None]:


# Create AdaBoost classifier with DecisionStump
boost = Adaboost(
    weak_learner=DecisionStump,

    EXPLICIT_SAMPLING=False,
    n_clf=50,
    eta=0.5
)

# Train the model
boost.fit(X_train, y_train)

# Make predictions
y_pred = boost.predict(X_test)

# Calculate accuracy
accuracy = np.mean(y_pred == y_test)
print(f"Training accuracy: {accuracy:.3f}")