In [None]:
!pip install gensim
!pip install fasttext
!pip install xgboost
!pip install catboost
!pip install lightgbm
!pip install wordcloud

In [None]:
import pandas as pd
import numpy as np
import nltk
import gensim
import tensorflow as tf
import re
import scipy
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score,confusion_matrix
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from gensim.models import KeyedVectors, Word2Vec
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, Flatten

In [None]:
# Download the NLTK resources used by the preprocessing code below.
for nltk_resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4', 'punkt_tab'):
    nltk.download(nltk_resource)

# Shared helpers read by preprocess_text.
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
# Read the spam CSV, discard every column that contains a missing value,
# and give the two remaining columns descriptive names.
df = pd.read_csv("/content/spam.csv", encoding="latin-1").dropna(axis=1, how="any")
df.columns = ['target', 'message']
df.head()

Unnamed: 0,target,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [None]:
def preprocess_text(text):
    """Normalise one raw message into a space-joined string of lemmas.

    Steps, in order: lowercase, drop ``[...]`` spans, drop links, strip
    punctuation, drop tokens containing digits, tokenize, remove English
    stopwords, lemmatize.

    Bracketed spans and links must be removed *before* punctuation is
    stripped: their patterns rely on ``[``, ``]``, ``:``, ``/`` and ``.``,
    so stripping punctuation first would leave nothing for them to match
    and URLs would survive as glued tokens such as ``httpwwwexamplecom``.

    Args:
        text: The raw message string.

    Returns:
        The cleaned message as a single string of space-separated tokens
        (empty if nothing survives the filters).
    """
    text = text.lower()
    text = re.sub(r'\[.*?\]', '', text)  # Remove text in square brackets
    text = re.sub(r'https?://\S+|www\.\S+', '', text)  # Remove links
    text = re.sub(r'[^\w\s]', '', text)  # Remove punctuation
    text = re.sub(r'\w*\d\w*', '', text)  # Remove words containing numbers
    words = nltk.word_tokenize(text)  # Tokenize
    # Drop stopwords, then lemmatize what is left.
    words = [lemmatizer.lemmatize(w) for w in words if w not in stop_words]
    return " ".join(words)

df['processed_message'] = df['message'].apply(preprocess_text)

In [None]:
# Preview the first rows, now including the processed_message column.
df.head()

Unnamed: 0,target,message,processed_message
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entri wkli comp win fa cup final tkt may ...
3,ham,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think goe usf live around though


In [None]:
# Turn the string labels into integer codes; the fitted encoder stays
# available as `le`.
le = LabelEncoder()
df['target_encoded'] = le.fit_transform(df['target'])
df.head()

Unnamed: 0,target,message,processed_message,target_encoded
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...,0
1,ham,Ok lar... Joking wif u oni...,ok lar joking wif u oni,0
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entry wkly comp win fa cup final tkts may...,1
3,ham,U dun say so early hor... U c already then say...,u dun say early hor u c already say,0
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think go usf life around though,0


In [None]:
# Class balance: number of messages per label.
fig, ax = plt.subplots(figsize=(8, 6))
sns.countplot(x='target', data=df, ax=ax)
ax.set_title('Distribution of Target Variable')
plt.show()

# Character length of each raw message and how those lengths are distributed.
df['message_length'] = df['message'].map(len)
fig, ax = plt.subplots(figsize=(8, 6))
sns.histplot(df['message_length'], kde=True, ax=ax)
ax.set_title('Distribution of Message Lengths')
plt.show()

# Word cloud built from all processed messages joined into one string.
text = " ".join(df['processed_message'])
wordcloud = WordCloud(width=800, height=400).generate(text)
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud)
ax.axis("off")
ax.set_title('Word Cloud of Processed Messages')
plt.show()

In [None]:
# Features are the cleaned messages; labels are the integer-encoded targets.
x = df['processed_message']
y = df['target_encoded']
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=42)

# Report the number of samples and labels in the full data and in each split.
for split_name, split_x, split_y in (
    ("Data", x, y),
    ("Train", x_train, y_train),
    ("Test", x_test, y_test),
):
    print(f"Shape of {split_name}:")
    print(len(split_x), len(split_y))

Shape of Data:
5572 5572
Shape of Train:
4179 4179
Shape of Test:
1393 1393


In [None]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip
!pip install fasttext
import fasttext

In [None]:
# CountVectorizer: fit on the training messages only, then apply the
# already-fitted vectorizer to the test messages.
vectorizer = CountVectorizer()
x_train_vec = vectorizer.fit_transform(x_train)
x_test_vec = vectorizer.transform(x_test)

In [None]:
# TF-IDF: fit on the training messages only, then apply the already-fitted
# vectorizer to the test messages.
tfidf_vectorizer = TfidfVectorizer()
x_train_tfidf = tfidf_vectorizer.fit_transform(x_train)
x_test_tfidf = tfidf_vectorizer.transform(x_test)

In [None]:
# Word2Vec
# Train a Word2Vec model on the tokenized training messages only.
tokenized_messages = [nltk.word_tokenize(text) for text in x_train]
word2vec_model = Word2Vec(sentences=tokenized_messages, vector_size=100, window=5, min_count=1, workers=4)


def get_sentence_embedding(sentence):
    """Return the mean Word2Vec vector over the in-vocabulary tokens of `sentence`.

    Tokens missing from the trained vocabulary are skipped. When no token is
    known (including an empty sentence) a zero vector is returned instead.

    Args:
        sentence: A preprocessed message string.

    Returns:
        A 1-D NumPy array whose length equals the model's vector size.
    """
    keyed_vectors = word2vec_model.wv
    vectors = [keyed_vectors[word] for word in nltk.word_tokenize(sentence) if word in keyed_vectors]
    if vectors:
        return np.mean(vectors, axis=0)
    # Size the fallback from the model itself so it cannot drift out of sync
    # with the vector_size passed to Word2Vec above.
    return np.zeros(keyed_vectors.vector_size)

x_train_word2vec = np.array([get_sentence_embedding(sentence) for sentence in x_train])
x_test_word2vec = np.array([get_sentence_embedding(sentence) for sentence in x_test])

In [None]:
# Load pre-trained GloVe embeddings: each line is "<word> <v1> <v2> ...".
glove_file = '/content/glove.6B.100d.txt' # Replace with your path
glove_model = {}
with open(glove_file, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        vector = np.asarray(values[1:], dtype='float32')
        glove_model[word] = vector

if not glove_model:
    raise ValueError(f"No vectors were loaded from {glove_file}")

# Take the dimensionality from the loaded vectors instead of hard-coding it,
# so pointing glove_file at a different-sized GloVe file keeps working.
glove_dim = len(next(iter(glove_model.values())))
# One shared stand-in for unknown words, instead of a new array per token.
glove_oov_vector = np.zeros(glove_dim)


def get_glove_embedding(sentence):
    """Return the mean GloVe vector over all tokens of `sentence`.

    Tokens that are not in the GloVe vocabulary contribute a zero vector to
    the mean. A sentence with no tokens yields a zero vector.
    NOTE(review): get_sentence_embedding (Word2Vec) skips unknown tokens
    instead of averaging zeros for them; confirm the difference is intended.

    Args:
        sentence: A preprocessed message string.

    Returns:
        A 1-D NumPy array of length `glove_dim`.
    """
    words = nltk.word_tokenize(sentence)
    vectors = [glove_model.get(word, glove_oov_vector) for word in words] # Use zeros if word not found
    if vectors:
        return np.mean(vectors, axis=0)
    return np.zeros(glove_dim)

x_train_glove = np.array([get_glove_embedding(sentence) for sentence in x_train])
x_test_glove = np.array([get_glove_embedding(sentence) for sentence in x_test])

In [None]:
# Write the training split in fastText's supervised format: one example per
# line, "__label__<class> <text>".
# The encoding is set explicitly so the file does not depend on the platform
# default. NOTE(review): fastText is documented as expecting UTF-8 input — confirm.
with open('train.txt', 'w', encoding='utf-8') as f:
    for text, label in zip(x_train, y_train):
        f.write(f"__label__{label} {text}\n")

fasttext_model = fasttext.train_supervised('train.txt', epoch=25)


def get_fasttext_embedding(text):
    """Return the sentence vector that `fasttext_model` computes for `text`."""
    return fasttext_model.get_sentence_vector(text)

x_train_fasttext = np.array([get_fasttext_embedding(sentence) for sentence in x_train])
x_test_fasttext = np.array([get_fasttext_embedding(sentence) for sentence in x_test])

In [None]:
# Classifiers to compare, keyed by the display name used in progress messages
# and in the "Model" column of the results. Estimators use library defaults
# apart from the arguments shown.
models = {
    "XGBoost": XGBClassifier(),
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Decision Tree": DecisionTreeClassifier(),
    "SVM": SVC(),
    "Random Forest": RandomForestClassifier(),
    "KNN": KNeighborsClassifier(),
    "Catboost": CatBoostClassifier(verbose=0),
    # LightGBM is currently excluded from the comparison:
    #"LightGBM": LGBMClassifier()
}

In [None]:
# Feature representations under comparison: each name maps to its train and
# test feature matrices.
dataset = {
    'countvectorizer': {'train': x_train_vec, 'test': x_test_vec},
    'tfidf': {'train': x_train_tfidf, 'test': x_test_tfidf},
    'word2vec': {'train': x_train_word2vec, 'test': x_test_word2vec},
    'glove': {'train': x_train_glove, 'test': x_test_glove},
    'fasttext': {'train': x_train_fasttext, 'test': x_test_fasttext},
}


In [None]:
def _to_non_negative_dense(features):
    """Densify SciPy sparse input if needed and clip negative values to zero."""
    if scipy.sparse.issparse(features):
        features = features.toarray()
    return np.clip(features, 0, None)


def train_and_evaluate_naive_bayes(x_train, x_test, y_train, y_test, dataset_name):
    """Fit a MultinomialNB on one feature set and score it on the test split.

    Both feature matrices are converted to dense arrays (any SciPy sparse
    format is accepted) and negative entries are clipped to 0 before
    fitting, because MultinomialNB rejects negative feature values.
    NOTE(review): for the dense embedding features this clipping discards
    every negative component, which may explain their weaker scores.

    Args:
        x_train: Training features (dense array or SciPy sparse matrix).
        x_test: Test features in the same representation as `x_train`.
        y_train: Training labels.
        y_test: Test labels.
        dataset_name: Name of the feature set, used only in the printed line.

    Returns:
        Tuple ``(accuracy, precision, recall)`` measured on the test split.
    """
    x_train_clipped = _to_non_negative_dense(x_train)
    x_test_clipped = _to_non_negative_dense(x_test)

    nb_classifier = MultinomialNB()
    nb_classifier.fit(x_train_clipped, y_train)
    nb_pred = nb_classifier.predict(x_test_clipped)
    nb_accuracy = accuracy_score(y_test, nb_pred)
    # zero_division=0 keeps the score at 0.0 when the metric is undefined
    # (e.g. no positive predictions) without emitting a warning.
    nb_precision = precision_score(y_test, nb_pred, zero_division=0)
    nb_recall = recall_score(y_test, nb_pred, zero_division=0)
    print(f"Naive Bayes ({dataset_name}) Accuracy: {nb_accuracy}")
    return nb_accuracy, nb_precision, nb_recall

In [None]:
# Evaluate Naive Bayes on every feature representation and tabulate the scores.
results = []
# These rows come from MultinomialNB; labelling them 'ANN' would make them
# indistinguishable from the neural-network rows once the tables are combined.
Modelname = 'Naive Bayes'
for dataset_name, data in dataset.items():
    accuracy, precision, recall = train_and_evaluate_naive_bayes(data['train'], data['test'], y_train, y_test, dataset_name)
    results.append([Modelname, dataset_name, accuracy, precision, recall])
df_results = pd.DataFrame(results, columns=['Model', 'Dataset', 'Accuracy', 'Precision', 'Recall'])
df_results

Naive Bayes (countvectorizer) Accuracy: 0.9784637473079684
Naive Bayes (tfidf) Accuracy: 0.9612347451543432
Naive Bayes (word2vec) Accuracy: 0.8628858578607322
Naive Bayes (glove) Accuracy: 0.9030868628858578
Naive Bayes (fasttext) Accuracy: 0.9755922469490309


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Unnamed: 0,Dataset,Accuracy,Precision,Recall
0,countvectorizer,0.978464,0.96,0.879581
1,tfidf,0.961235,1.0,0.717277
2,word2vec,0.862886,0.0,0.0
3,glove,0.903087,0.888889,0.335079
4,fasttext,0.975592,0.993711,0.827225


In [None]:
# Fit every classifier on every feature representation and collect test-set
# accuracy, precision and recall as one record per (model, dataset) pair.
# A failing combination is reported and skipped so the rest still run.
df_results2 = []
for model_name, model in models.items():
    for dataset_name, data in dataset.items():
        print(f"Training {model_name} on {dataset_name} dataset...")
        try:
            model.fit(data['train'], y_train)
            y_pred = model.predict(data['test'])
            accuracy = accuracy_score(y_test, y_pred)
            # zero_division=0 keeps the score at 0.0 when the metric is
            # undefined (e.g. no positive predictions) without a warning.
            precision = precision_score(y_test, y_pred, zero_division=0)
            recall = recall_score(y_test, y_pred, zero_division=0)
        except ValueError as e:
            print(f"Error training {model_name} on {dataset_name}: {e}")
        except Exception as e:
            # Name the failing combination so the message can be traced.
            print(f"An unexpected error occurred while training {model_name} on {dataset_name}: {e}")
        else:
            print(f"{model_name} on {dataset_name}: Accuracy = {accuracy}")
            df_results2.append({
                'Model': model_name,
                'Dataset': dataset_name,
                'Accuracy': accuracy,
                'Precision': precision,
                'Recall': recall
            })

In [None]:
# Turn the collected per-(model, dataset) records into a DataFrame; note that
# the name df_results2 is rebound from the list to the resulting frame.
df_results2 = pd.DataFrame(df_results2)
df_results2

Unnamed: 0,Model,Dataset,Accuracy,Precision,Recall
0,XGBoost,countvectorizer,0.96626,0.933735,0.811518
1,XGBoost,tfidf,0.96626,0.955696,0.790576
2,XGBoost,word2vec,0.948313,0.947368,0.659686
3,XGBoost,glove,0.965542,0.955414,0.78534
4,XGBoost,fasttext,0.983489,0.977273,0.900524
5,Logistic Regression,countvectorizer,0.977028,0.993789,0.837696
6,Logistic Regression,tfidf,0.947595,0.968254,0.638743
7,Logistic Regression,word2vec,0.862886,0.0,0.0
8,Logistic Regression,glove,0.935391,0.821656,0.675393
9,Logistic Regression,fasttext,0.980617,0.993976,0.863874


In [None]:
def _as_dense_float32(features):
    """Return `features` as a dense float32 NumPy array, densifying SciPy sparse input."""
    if scipy.sparse.issparse(features):
        # Cast before densifying so the dense copy is allocated as float32.
        features = features.astype(np.float32).toarray()
    return np.asarray(features, dtype=np.float32)


# Train a small feed-forward network on every feature representation and
# collect its test-set accuracy, precision and recall.
ann_results = []
for dataset_name, data in dataset.items():
    print(f"Training ANN on {dataset_name} dataset...")
    try:
        # Take the matrices from the `dataset` entry being iterated and densify
        # every representation the same way before it reaches Keras.
        x_train_data = _as_dense_float32(data['train'])
        x_test_data = _as_dense_float32(data['test'])

        ann_model = Sequential([
            Dense(64, activation='relu', input_shape=(x_train_data.shape[1],)),
            Dense(32, activation='relu'),
            Dense(1, activation='sigmoid')
        ])

        ann_model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
        ann_model.fit(x_train_data, y_train, epochs=5, batch_size=32, verbose=0)
        _, ann_accuracy = ann_model.evaluate(x_test_data, y_test, verbose=0)

        # Predict once, threshold the sigmoid output at 0.5 and flatten the
        # (n, 1) result to 1-D labels shared by both metrics.
        ann_pred = (ann_model.predict(x_test_data) > 0.5).astype("int32").ravel()
        ann_precision = precision_score(y_test, ann_pred, zero_division=0)
        ann_recall = recall_score(y_test, ann_pred, zero_division=0)
        ann_results.append({
            'Model': 'ANN',
            'Dataset': dataset_name,
            'Accuracy': ann_accuracy,
            'Precision': ann_precision,
            'Recall': ann_recall
        })
        print(f"ANN ({dataset_name}) Accuracy: {ann_accuracy}")
    except ValueError as e:
        print(f"Error training ANN on {dataset_name}: {e}")
    except Exception as e:
        # Name the failing dataset so the message can be traced.
        print(f"An unexpected error occurred while training ANN on {dataset_name}: {e}")

dfann = pd.DataFrame(ann_results)
dfann

In [None]:
# Stack the Naive Bayes, classical-model and ANN score tables into a single
# frame with a fresh 0..n-1 index.
ResultsData = pd.concat([df_results, df_results2, dfann], ignore_index=True)
ResultsData

Unnamed: 0,Dataset,Accuracy,Precision,Recall,Model
0,countvectorizer,0.978464,0.96,0.879581,
1,tfidf,0.961235,1.0,0.717277,
2,word2vec,0.862886,0.0,0.0,
3,glove,0.903087,0.888889,0.335079,
4,fasttext,0.975592,0.993711,0.827225,
0,countvectorizer,0.96626,0.933735,0.811518,XGBoost
1,tfidf,0.96626,0.955696,0.790576,XGBoost
2,word2vec,0.948313,0.947368,0.659686,XGBoost
3,glove,0.965542,0.955414,0.78534,XGBoost
4,fasttext,0.983489,0.977273,0.900524,XGBoost


In [None]:
# prompt: save results

# Write the combined results table to results.csv (utf-8-sig encoding) and
# pass the file to google.colab's files.download.
from google.colab import files
ResultsData.to_csv('results.csv', encoding = 'utf-8-sig')
files.download('results.csv')


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>