In [2]:
pip install numpy pandas scikit-learn torch torchvision torchaudio transformers tensorflow

Note: you may need to restart the kernel to use updated packages.


In [3]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, classification_report
import torch
from transformers import BertTokenizer, BertModel
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
data = pd.read_csv("D:\Codes\python_Research\code_research\emails.csv")
X = data['text']
y = data['spam']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train).toarray()
X_test_tfidf = tfidf.transform(X_test).toarray()

In [6]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, classification_report
import torch
from transformers import BertTokenizer, BertModel
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer

class BERTEmbeddings:
    def __init__(self, batch_size=16):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.model = BertModel.from_pretrained('bert-base-uncased')
        self.batch_size = batch_size
    
    def get_embeddings(self, texts):
        embeddings = []
        for i in range(0, len(texts), self.batch_size):
            batch_texts = texts[i:i+self.batch_size]
            inputs = self.tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=128)
            outputs = self.model(**inputs)
            batch_embeddings = outputs.last_hidden_state.mean(dim=1).detach().numpy()
            embeddings.append(batch_embeddings)
        return np.vstack(embeddings)

X_train, X_test, y_train, y_test = train_test_split(data['text'], data['spam'], test_size=0.2, random_state=42)

bert_embedder = BERTEmbeddings(batch_size=8)
X_train_bert = bert_embedder.get_embeddings(X_train.tolist())
X_test_bert = bert_embedder.get_embeddings(X_test.tolist())

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [7]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, classification_report
import torch
from transformers import BertTokenizer, BertModel
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer

# Reducing max_features
max_features = 100
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(X_train.values)
X_train_lstm = tokenizer.texts_to_sequences(X_train.values)
X_train_lstm = pad_sequences(X_train_lstm)
X_test_lstm = tokenizer.texts_to_sequences(X_test.values)
X_test_lstm = pad_sequences(X_test_lstm, maxlen=X_train_lstm.shape[1])

# Reducing embedding dimension and using smaller batch size
def create_lstm_model(input_length):
    model = Sequential()
    model.add(Embedding(max_features, 64, input_length=input_length))
    model.add(SpatialDropout1D(0.2))
    model.add(LSTM(50, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

lstm_model = create_lstm_model(X_train_lstm.shape[1])
lstm_model.fit(X_train_lstm, y_train, epochs=2, batch_size=32, validation_split=0.1, verbose=2)

lstm_train_preds = lstm_model.predict(X_train_lstm)
lstm_test_preds = lstm_model.predict(X_test_lstm)

# Use the output of the LSTM model's penultimate layer as the LSTM embeddings
lstm_train_embeddings = lstm_model.predict(X_train_lstm)
lstm_test_embeddings = lstm_model.predict(X_test_lstm)


Epoch 1/2
129/129 - 563s - loss: 0.3669 - accuracy: 0.8443 - val_loss: 0.1872 - val_accuracy: 0.9281 - 563s/epoch - 4s/step
Epoch 2/2
129/129 - 715s - loss: 0.1800 - accuracy: 0.9362 - val_loss: 0.1570 - val_accuracy: 0.9368 - 715s/epoch - 6s/step


In [8]:
# Combine TF-IDF, BERT, and LSTM features
X_train_combined = np.concatenate((X_train_tfidf, X_train_bert, lstm_train_embeddings), axis=1)
X_test_combined = np.concatenate((X_test_tfidf, X_test_bert, lstm_test_embeddings), axis=1)

In [9]:
# Logistic Regression
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_combined, y_train)

# Random Forest
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train_combined, y_train)

# SVM
svm = SVC(probability=True)
svm.fit(X_train_combined, y_train)

SVC(probability=True)

In [10]:
# Voting Classifier
voting_clf = VotingClassifier(estimators=[
    ('lr', lr), 
    ('rf', rf), 
    ('svm', svm)
], voting='hard')

# Train the voting classifier
voting_clf.fit(X_train_combined, y_train)

# Predictions
y_pred = voting_clf.predict(X_test_combined)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))

Accuracy: 0.9808027923211169
Classification Report:
              precision    recall  f1-score   support

           0       0.98      0.99      0.99       856
           1       0.97      0.95      0.96       290

    accuracy                           0.98      1146
   macro avg       0.98      0.97      0.97      1146
weighted avg       0.98      0.98      0.98      1146



In [11]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score, classification_report
import torch
from transformers import BertTokenizer, BertModel
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer

# Load the dataset
data = pd.read_csv("D:/Codes/python_Research/code_research/emails.csv")
X = data['text']
y = data['spam']

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# TF-IDF
tfidf = TfidfVectorizer(max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train).toarray()
X_test_tfidf = tfidf.transform(X_test).toarray()

# BERT
class BERTEmbeddings:
    def __init__(self, batch_size=16):
        self.tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
        self.model = BertModel.from_pretrained('bert-base-uncased')
        self.batch_size = batch_size
    
    def get_embeddings(self, texts):
        embeddings = []
        for i in range(0, len(texts), self.batch_size):
            batch_texts = texts[i:i+self.batch_size]
            inputs = self.tokenizer(batch_texts, return_tensors='pt', padding=True, truncation=True, max_length=128)
            outputs = self.model(**inputs)
            batch_embeddings = outputs.last_hidden_state.mean(dim=1).detach().numpy()
            embeddings.append(batch_embeddings)
        return np.vstack(embeddings)

bert_embedder = BERTEmbeddings(batch_size=8)
X_train_bert = bert_embedder.get_embeddings(X_train.tolist())
X_test_bert = bert_embedder.get_embeddings(X_test.tolist())

# LSTM
max_features = 100
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(X_train.values)
X_train_lstm = tokenizer.texts_to_sequences(X_train.values)
X_train_lstm = pad_sequences(X_train_lstm)
X_test_lstm = tokenizer.texts_to_sequences(X_test.values)
X_test_lstm = pad_sequences(X_test_lstm, maxlen=X_train_lstm.shape[1])

def create_lstm_model(input_length):
    model = Sequential()
    model.add(Embedding(max_features, 64, input_length=input_length))
    model.add(SpatialDropout1D(0.2))
    model.add(LSTM(50, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(2, activation='softmax'))
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

lstm_model = create_lstm_model(X_train_lstm.shape[1])
lstm_model.fit(X_train_lstm, y_train, epochs=2, batch_size=32, validation_split=0.1, verbose=2)

# Use the output of the LSTM model's penultimate layer as the LSTM embeddings
lstm_train_embeddings = lstm_model.predict(X_train_lstm)
lstm_test_embeddings = lstm_model.predict(X_test_lstm)

# Combine TF-IDF, BERT, and LSTM features
X_train_combined = np.concatenate((X_train_tfidf, X_train_bert, lstm_train_embeddings), axis=1)
X_test_combined = np.concatenate((X_test_tfidf, X_test_bert, lstm_test_embeddings), axis=1)

# Logistic Regression
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_combined, y_train)

# Random Forest
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train_combined, y_train)

# SVM
svm = SVC(probability=True)
svm.fit(X_train_combined, y_train)

# Voting Classifier
voting_clf = VotingClassifier(estimators=[
    ('lr', lr), 
    ('rf', rf), 
    ('svm', svm)
], voting='hard')

# Train the voting classifier
voting_clf.fit(X_train_combined, y_train)

# Predictions
y_pred = voting_clf.predict(X_test_combined)

# Evaluation
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:")
print(classification_report(y_test, y_pred))


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Epoch 1/2
129/129 - 596s - loss: 0.3532 - accuracy: 0.8503 - val_loss: 0.1768 - val_accuracy: 0.9303 - 596s/epoch - 5s/step
Epoch 2/2
129/129 - 591s - loss: 0.3226 - accuracy: 0.9020 - val_loss: 0.1788 - val_accuracy: 0.9434 - 591s/epoch - 5s/step
Accuracy: 0.9825479930191972
Classification Report:
              precision    recall  f1-score   support

           0       0.99      0.99      0.99       856
           1       0.97      0.96      0.97       290

    accuracy                           0.98      1146
   macro avg       0.98      0.97      0.98      1146
weighted avg       0.98      0.98      0.98      1146



In [13]:
# Voting Classifier
voting_clf = VotingClassifier(estimators=[
    ('lr', lr), 
    ('rf', rf), 
    ('svm', svm)
], voting='soft')

# Train the voting classifier
voting_clf.fit(X_train_combined, y_train)

# Predictions
y_pred = voting_clf.predict(X_test_combined)

from sklearn.metrics import f1_score
print("Accuracy:", accuracy_score(y_test, y_pred))
print("F1-score:", f1_score(y_test, y_pred))

Accuracy: 0.981675392670157
F1-score: 0.9636048526863086
