In [None]:
!pip install nltk

In [None]:
!pip install scikit-learn

In [None]:
# Import libraries
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# Download the NLTK 'stopwords' corpus; the stop-word removal step later in
# this notebook reads it through `stopwords.words('english')`.
nltk.download('stopwords')

In [10]:
#!mkdir ~/.kaggle
#!cp kaggle.json ~/.kaggle/  # Upload your Kaggle API token as 'kaggle.json'
#!chmod 600 ~/.kaggle/kaggle.json
#!kaggle datasets download -d uciml/sms-spam-collection-dataset
#!unzip sms-spam-collection-dataset.zip

# 1. Load the dataset
# Read the CSV as latin-1, keep only the label column (v1) and the message
# column (v2), and give them descriptive names in one chained expression.
df = (
    pd.read_csv('spam.csv', encoding='latin-1')[['v1', 'v2']]
    .rename(columns={'v1': 'label', 'v2': 'text'})
)

In [51]:
# 2.Function to clean text
def clean_text(text):
    """Normalise a raw message for bag-of-words style features.

    Non-word characters and digit runs are replaced by spaces, whitespace is
    then collapsed to single spaces, the result is stripped and lower-cased.
    Underscores are kept because the regex class ``\\W`` does not match them.

    Args:
        text: The raw message as a ``str``.

    Returns:
        The cleaned, lower-cased string with single spaces between tokens and
        no leading or trailing whitespace.
    """
    text = re.sub(r'\W', ' ', text)  # Punctuation/symbols -> space
    text = re.sub(r'\d+', ' ', text)  # Digit runs -> space
    # Collapse whitespace last: the two substitutions above both introduce
    # spaces, so doing this earlier would leave runs of blanks behind.
    text = re.sub(r'\s+', ' ', text).strip()
    return text.lower()

# Clean text data
df['text'] = df['text'].apply(clean_text)

# Remove stop words (set membership keeps the per-word lookup cheap)
stop_words = set(stopwords.words('english'))
df['text'] = df['text'].apply(lambda x: ' '.join(word for word in x.split() if word not in stop_words))

# Convert labels to binary (ham -> 0, spam -> 1).
# The identity entries 0 -> 0 and 1 -> 1 make this cell safe to re-run:
# Series.map turns any value missing from the dict into NaN, so mapping the
# already-encoded column with only the string keys would wipe every label.
df['label'] = df['label'].map({'ham': 0, 'spam': 1, 0: 0, 1: 1})

In [59]:
# Hold out 20% of the messages for testing; the fixed seed keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    df['text'],
    df['label'],
    test_size=0.2,
    random_state=42,
)

In [55]:
# TF-IDF Encoding
# Learn the vocabulary and IDF weights from the training messages only,
# then project both splits with that fitted vectorizer.
tfidf_vectorizer = TfidfVectorizer().fit(X_train)
X_train_tfidf = tfidf_vectorizer.transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)


In [None]:
# Random Forest Model
# Seeded so repeated runs give the same forest and the same metrics,
# matching the seeded forest trained later in this notebook.
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_tfidf, y_train)

In [57]:
# Score the held-out TF-IDF matrix with the fitted forest
y_pred_rf = rf_model.predict(X=X_test_tfidf)

In [None]:
# Compare the forest's predictions against the held-out labels
conf_matrix_rf = confusion_matrix(y_test, y_pred_rf)
accuracy_rf = accuracy_score(y_test, y_pred_rf)

# Report accuracy first, then the matrix under its heading
print("Random Forest Model Accuracy:", accuracy_rf)
print("Confusion Matrix:", conf_matrix_rf, sep="\n")

In [32]:
# 3. TF-IDF vectorization, capped at the 3000 highest-frequency terms
tfidf = TfidfVectorizer(max_features=3000)
X = tfidf.fit_transform(df['text']).toarray()

# Encode labels.
# df['label'] has already been converted to 0/1 by the preprocessing cell, so
# comparing each value with the string 'spam' would label every row 0.
# Mapping both the raw strings and the encoded integers gives the right
# target whether or not that earlier conversion has been run.
y = df['label'].map({'ham': 0, 'spam': 1, 0: 0, 1: 1})

In [33]:
# 80/20 split of the dense TF-IDF matrix and its labels, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# 4. Train the model: a seeded 100-tree random forest on the training split
model = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)
model.fit(X=X_train, y=y_train)

In [None]:
!pip install tensorflow

In [None]:
 # 5.Predict on the test set
y_pred = model.predict(X_test)

# Share of test rows classified correctly, shown as a percentage
accuracy = accuracy_score(y_test, y_pred)
print(f'Accuracy: {accuracy*100:.2f}%')

# Counts of true vs. predicted classes, printed under a heading
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:', conf_matrix, sep='\n')

In [None]:
# Random Forest Model
# X_train_tfidf / X_test_tfidf were built from the text split of df
# (test_size=0.2, random_state=42). By this point y_train / y_test have been
# rebound by a later split of a different feature matrix, so rebuild the
# labels that belong to the TF-IDF rows instead of relying on whatever
# y_train currently holds.
_text_train, _text_test, y_train_rf, y_test_rf = train_test_split(
    df['text'], df['label'], test_size=0.2, random_state=42
)

# Seeded so the reported metrics are reproducible
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_tfidf, y_train_rf)

# Predictions
y_pred_rf = rf_model.predict(X_test_tfidf)

# Evaluation
accuracy_rf = accuracy_score(y_test_rf, y_pred_rf)
conf_matrix_rf = confusion_matrix(y_test_rf, y_pred_rf)
print("Random Forest Model Accuracy:", accuracy_rf)
print("Confusion Matrix:")
print(conf_matrix_rf)


In [None]:
#stage 2
!pip install tensorflow

In [39]:
#1.tokenise & pad sequence
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Tokenize text data: keep the 5000 most frequent words and turn every
# message into a list of integer word ids
tokenizer = Tokenizer(num_words=5000)
tokenizer.fit_on_texts(df['text'])
X = tokenizer.texts_to_sequences(df['text'])

# Pad (or cut) every sequence to the same length, padding at the end
max_length = 100
X = pad_sequences(X, maxlen=max_length, padding='post')

# Encode labels.
# df['label'] has already been converted to 0/1 by the preprocessing cell, so
# comparing each value with the string 'spam' would label every row 0.
# Mapping both the raw strings and the encoded integers gives the right
# target whether or not that earlier conversion has been run.
y = df['label'].map({'ham': 0, 'spam': 1, 0: 0, 1: 1}).values

In [None]:
#2.convolutional
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense

# Define the model: embedding -> 1-D convolution -> global max pool -> dense head
embedding_dim = 50
model = Sequential([
    Embedding(input_dim=5000, output_dim=embedding_dim, input_length=max_length),
    Conv1D(filters=128, kernel_size=5, activation='relu'),
    GlobalMaxPooling1D(),
    Dense(10, activation='relu'),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Hold out the test rows BEFORE fitting. Training on all of X, y here would
# expose the network to the rows that the evaluation step later treats as
# unseen, inflating the reported accuracy.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train the model on the training portion only (Keras keeps 20% of it for validation)
history = model.fit(X_train, y_train, epochs=10, batch_size=32, validation_split=0.2)

In [None]:
#3.evaluating model
# Hold out 20% of the padded sequences (fixed seed) for testing
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Fit on the training portion; Keras sets aside a further 20% of it for validation.
# NOTE(review): when cells run top to bottom, `model` reaches this point
# already fitted by the previous cell, so this continues that training
# rather than starting from fresh weights — confirm that is intended.
history = model.fit(
    X_train,
    y_train,
    epochs=10,
    batch_size=32,
    validation_split=0.2,
)

# Loss and accuracy on the held-out rows
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Accuracy: {accuracy*100:.2f}%')

# Threshold the sigmoid outputs at 0.5 to obtain hard class labels
y_pred = (model.predict(X_test) > 0.5).astype("int32")
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:', conf_matrix, sep='\n')


In [None]:
#3word2vector
from gensim.models import Word2Vec
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Train Word2Vec model on the whitespace-tokenised messages
sentences = [text.split() for text in df['text']]
word2vec_model = Word2Vec(sentences, vector_size=100, window=5, min_count=1, workers=4)

# Create a weight matrix indexed by the Keras tokenizer's word ids.
# Row 0 stays all-zero (no word_index entry uses id 0), as do rows for words
# Word2Vec has no vector for.
vocab_size = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocab_size, 100))
for word, i in tokenizer.word_index.items():
    if word in word2vec_model.wv:
        embedding_matrix[i] = word2vec_model.wv[word]

# Build the LSTM model on top of the frozen Word2Vec embeddings
model = Sequential([
    Embedding(vocab_size, 100, weights=[embedding_matrix], input_length=max_length, trainable=False),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Split inside this cell so it does not depend on variables left over from
# another cell, and so the test rows are excluded from training.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train on the training portion only. Fitting on all of X, y would mean the
# evaluation below is measured on rows the network has already seen.
history = model.fit(X_train, y_train, epochs=5, batch_size=32, validation_split=0.2)

# Evaluate the model
loss, accuracy = model.evaluate(X_test, y_test)
print(f'Accuracy: {accuracy*100:.2f}%')

# Confusion matrix (sigmoid outputs thresholded at 0.5)
y_pred = (model.predict(X_test) > 0.5).astype("int32")
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:')
print(conf_matrix)

In [None]:
#stage 3BERT embedding
!pip install transformers

In [None]:
from transformers import BertTokenizer, TFBertModel

# Load the pretrained tokenizer and encoder from the same checkpoint.
# NOTE(review): this rebinds `tokenizer`, which earlier cells use for the
# Keras Tokenizer (e.g. `tokenizer.word_index`); re-running those cells after
# this one would pick up the BERT tokenizer instead — confirm that is acceptable.
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
bert_model = TFBertModel.from_pretrained(BERT_CHECKPOINT)

# Tokenize and pad sequences
def bert_encode(texts, tokenizer, max_len=128):
    """Encode each text to fixed-length token ids and attention masks.

    Args:
        texts: Iterable of strings to encode.
        tokenizer: Object exposing ``encode_plus`` (e.g. a ``BertTokenizer``).
        max_len: Length every encoded sequence is padded or truncated to.

    Returns:
        A tuple ``(input_ids, attention_masks)`` of NumPy arrays built by
        stacking the per-text ``encode_plus`` results in input order.
    """
    input_ids = []
    attention_masks = []
    for text in texts:
        encoded = tokenizer.encode_plus(
            text,
            add_special_tokens=True,
            max_length=max_len,
            # `padding='max_length'` replaces the deprecated
            # `pad_to_max_length=True`; explicit truncation guarantees that
            # texts longer than max_len cannot produce ragged rows.
            padding='max_length',
            truncation=True,
            return_attention_mask=True,
            return_tensors='tf'
        )
        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])
    return np.array(input_ids), np.array(attention_masks)

# Encode every message in the frame into BERT token ids and attention masks
X_input_ids, X_attention_masks = bert_encode(
    texts=df['text'],
    tokenizer=tokenizer,
)


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# Load the SMS dataset (tab-separated, no header row) and name its two columns
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
df = pd.read_csv(url, sep='\t', header=None)
df.columns = ['label', 'text']

# The loss and the metrics below need numeric targets, so encode the string
# labels before splitting (ham -> 0, spam -> 1).
labels = df['label'].map({'ham': 0, 'spam': 1})

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df['text'], labels, test_size=0.2, random_state=42)

# Load BERT tokenizer and encoder. The encoder gets its own name so that
# `model` refers only to the classifier built further down.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Tokenize and encode the text data
X_train_encoded = tokenizer(X_train.tolist(), padding=True, truncation=True, max_length=128, return_tensors='tf')
X_test_encoded = tokenizer(X_test.tolist(), padding=True, truncation=True, max_length=128, return_tensors='tf')


def cls_embeddings(encoded, batch_size=32):
    """Return one fixed-size BERT vector per encoded text.

    The encoder is run in mini-batches so the whole split never has to pass
    through BERT in a single forward call. For each text the hidden state at
    position 0 (the [CLS] token the tokenizer prepends) is kept, giving a 2-D
    array with one row per text regardless of padded sequence length.

    Args:
        encoded: Tokenizer output exposing ``input_ids`` and ``attention_mask``.
        batch_size: Number of texts per forward pass.

    Returns:
        NumPy array of shape ``(n_texts, hidden_size)``.
    """
    n_texts = int(encoded.input_ids.shape[0])
    chunks = []
    for start in range(0, n_texts, batch_size):
        stop = start + batch_size
        hidden = bert_model(
            encoded.input_ids[start:stop],
            attention_mask=encoded.attention_mask[start:stop],
        )[0]  # element 0 of the output is the last hidden state
        chunks.append(hidden[:, 0, :].numpy())
    return np.concatenate(chunks, axis=0)


# Get BERT embeddings for train and test sets
X_train_embedded = cls_embeddings(X_train_encoded)
X_test_embedded = cls_embeddings(X_test_encoded)

# Define and compile the classifier: dropout + one sigmoid unit on the
# per-message embedding, so the output is one probability per message.
input_layer = Input(shape=(X_train_embedded.shape[1],))
dropout_layer = Dropout(0.2)(input_layer)
output_layer = Dense(1, activation='sigmoid')(dropout_layer)

model = Model(inputs=input_layer, outputs=output_layer)
# `learning_rate` is the supported argument name; `lr` is deprecated.
model.compile(optimizer=Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
# NOTE(review): validation_data reuses the test split, so the validation
# curve is not independent of the final evaluation — confirm this is intended.
history = model.fit(
    X_train_embedded, y_train.to_numpy(),
    epochs=3, batch_size=32,
    validation_data=(X_test_embedded, y_test.to_numpy()),
)

# Evaluate the model: round the sigmoid outputs to 0/1 and flatten to 1-D
y_pred = model.predict(X_test_embedded)
y_pred_binary = np.round(y_pred).astype(int).ravel()
accuracy_bert = accuracy_score(y_test, y_pred_binary)
conf_matrix_bert = confusion_matrix(y_test, y_pred_binary)

print("BERT Model Accuracy:", accuracy_bert)
print("Confusion Matrix:")
print(conf_matrix_bert)


In [None]:
# Hold out 20% of the rows. Ids, masks and labels go through a single call so
# the three arrays stay row-aligned.
# NOTE(review): `X_input_ids`, `X_attention_masks` and `y` come from earlier
# cells, and `df` is reloaded in between — confirm they still describe the
# same rows in the same order.
(
    X_train_ids, X_test_ids,
    X_train_masks, X_test_masks,
    y_train, y_test,
) = train_test_split(X_input_ids, X_attention_masks, y, test_size=0.2, random_state=42)

# Fit on [ids, masks]; Keras sets aside a further 20% of the training rows for validation.
# NOTE(review): when cells run top to bottom, `model` here is the
# single-input classifier compiled in the previous cell, not a network that
# accepts an [ids, masks] pair — confirm which model this cell was written for.
history = model.fit(
    [X_train_ids, X_train_masks],
    y_train,
    epochs=3,
    batch_size=16,
    validation_split=0.2,
)

# Loss and accuracy on the held-out rows
loss, accuracy = model.evaluate([X_test_ids, X_test_masks], y_test)
print(f'Accuracy: {accuracy*100:.2f}%')

# Threshold the predicted probabilities at 0.5 and tabulate against the truth
y_pred = (model.predict([X_test_ids, X_test_masks]) > 0.5).astype("int32")
conf_matrix = confusion_matrix(y_test, y_pred)
print('Confusion Matrix:', conf_matrix, sep='\n')


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from transformers import BertTokenizer, TFBertModel
from tensorflow.keras.layers import Input, Dropout, Dense
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

# Load the SMS dataset (tab-separated, no header row) and name its two columns
url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"
df = pd.read_csv(url, sep='\t', header=None)
df.columns = ['label', 'text']

# The loss and the metrics below need numeric targets, so encode the string
# labels before splitting (ham -> 0, spam -> 1).
labels = df['label'].map({'ham': 0, 'spam': 1})

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(df['text'], labels, test_size=0.2, random_state=42)

# Load BERT tokenizer and encoder. The encoder gets its own name so that
# `model` refers only to the classifier built further down.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = TFBertModel.from_pretrained('bert-base-uncased')

# Tokenize and encode the text data
X_train_encoded = tokenizer(X_train.tolist(), padding=True, truncation=True, max_length=128, return_tensors='tf')
X_test_encoded = tokenizer(X_test.tolist(), padding=True, truncation=True, max_length=128, return_tensors='tf')


def cls_embeddings(encoded, batch_size=32):
    """Return one fixed-size BERT vector per encoded text.

    The encoder is run in mini-batches so the whole split never has to pass
    through BERT in a single forward call. For each text the hidden state at
    position 0 (the [CLS] token the tokenizer prepends) is kept, giving a 2-D
    array with one row per text regardless of padded sequence length.

    Args:
        encoded: Tokenizer output exposing ``input_ids`` and ``attention_mask``.
        batch_size: Number of texts per forward pass.

    Returns:
        NumPy array of shape ``(n_texts, hidden_size)``.
    """
    n_texts = int(encoded.input_ids.shape[0])
    chunks = []
    for start in range(0, n_texts, batch_size):
        stop = start + batch_size
        hidden = bert_model(
            encoded.input_ids[start:stop],
            attention_mask=encoded.attention_mask[start:stop],
        )[0]  # element 0 of the output is the last hidden state
        chunks.append(hidden[:, 0, :].numpy())
    return np.concatenate(chunks, axis=0)


# Get BERT embeddings for train and test sets
X_train_embedded = cls_embeddings(X_train_encoded)
X_test_embedded = cls_embeddings(X_test_encoded)

# Define and compile the classifier: dropout + one sigmoid unit on the
# per-message embedding, so the output is one probability per message.
input_layer = Input(shape=(X_train_embedded.shape[1],))
dropout_layer = Dropout(0.2)(input_layer)
output_layer = Dense(1, activation='sigmoid')(dropout_layer)

model = Model(inputs=input_layer, outputs=output_layer)
# `learning_rate` is the supported argument name; `lr` is deprecated.
model.compile(optimizer=Adam(learning_rate=1e-5), loss='binary_crossentropy', metrics=['accuracy'])

# Train the model
# NOTE(review): validation_data reuses the test split, so the validation
# curve is not independent of the final evaluation — confirm this is intended.
history = model.fit(
    X_train_embedded, y_train.to_numpy(),
    epochs=3, batch_size=32,
    validation_data=(X_test_embedded, y_test.to_numpy()),
)

# Evaluate the model: round the sigmoid outputs to 0/1 and flatten to 1-D
y_pred = model.predict(X_test_embedded)
y_pred_binary = np.round(y_pred).astype(int).ravel()
accuracy_bert = accuracy_score(y_test, y_pred_binary)
conf_matrix_bert = confusion_matrix(y_test, y_pred_binary)

print("BERT Model Accuracy:", accuracy_bert)
print("Confusion Matrix:")
print(conf_matrix_bert)

# Draw the confusion matrix as an annotated heatmap on an explicit Axes
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(
    conf_matrix_bert,
    annot=True,
    fmt='d',
    cmap='Blues',
    cbar=False,
    annot_kws={'fontsize': 14},
    ax=ax,
)
ax.set_xlabel('Predicted Labels', fontsize=14)
ax.set_ylabel('True Labels', fontsize=14)
ax.set_title('Confusion Matrix - BERT Model', fontsize=16)
plt.show()
