In [2]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import ngrams
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Download NLTK resources
# nltk.download('punkt')
# nltk.download('stopwords')
# nltk.download('wordnet')
# nltk.download('averaged_perceptron_tagger')


### Step 1: Data Acquisition 

In [20]:
#1.Data Aquisition

import pandas as pd

data = pd.read_csv("C:\\Users\\Abhishek\\Desktop\\AIML\\Machine Learning_Code\\Diwali Assignments\\spam_assassin (2).csv")

df = pd.DataFrame(data)
df.head()

Unnamed: 0,text,target
0,From ilug-admin@linux.ie Mon Jul 29 11:28:02 2...,0
1,From gort44@excite.com Mon Jun 24 17:54:21 200...,1
2,From fork-admin@xent.com Mon Jul 29 11:39:57 2...,1
3,From dcm123@btamail.net.cn Mon Jun 24 17:49:23...,1
4,From ilug-admin@linux.ie Mon Aug 19 11:02:47 2...,0


### Step 2: Text Cleaning

In [22]:
def clean_text(text):
    # Remove HTML tags, URLs, special characters, and numbers
    text = re.sub(r'<.*?>', '', text)  # Remove HTML tags
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)  # Remove URLs
    text = re.sub(r'\@\w+|\#','', text)  # Remove @ mentions and # hashtags
    text = re.sub(r'[^a-zA-Z\s]', '', text)  # Remove special characters and numbers
    text = text.lower()  # Convert to lowercase
    return text
# Apply cleaning
df['text'] = df['text'].apply(clean_text)


### Tokenization 

In [24]:
def tokenize(text):
    return nltk.word_tokenize(text)

data['tokens'] = data['text'].apply(tokenize)


### Stop Word Removal

In [26]:
stop_words = set(stopwords.words('english'))

def remove_stopwords(tokens):
    return [word for word in tokens if word not in stop_words]

data['tokens'] = data['tokens'].apply(remove_stopwords)


### Stemming and Lemmatization 

In [10]:
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def stem_tokens(tokens):
    return [stemmer.stem(word) for word in tokens]

def lemmatize_tokens(tokens):
    return [lemmatizer.lemmatize(word) for word in tokens]

# Uncomment one of the following lines based on your choice
data['tokens'] = data['tokens'].apply(stem_tokens)  # For stemming
# data['tokens'] = data['tokens'].apply(lemmatize_tokens)  # For lemmatization


### Feature Extraction 

In [12]:
# Convert list of tokens back to strings
data['processed_text'] = data['tokens'].apply(lambda x: ' '.join(x))

# Using TF-IDF Vectorizer
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(data['processed_text']).toarray()
y = data['target']  # Assuming 'spam_label' is the column name for the target


### Split the Data

In [13]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


### Build Classification model

In [14]:
# Initialize the model
model = MultinomialNB()

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.9094827586206896
Classification Report:
               precision    recall  f1-score   support

           0       0.88      1.00      0.94       779
           1       1.00      0.72      0.84       381

    accuracy                           0.91      1160
   macro avg       0.94      0.86      0.89      1160
weighted avg       0.92      0.91      0.91      1160

Confusion Matrix:
 [[779   0]
 [105 276]]


In [17]:
# Initialize the model
model = LogisticRegression()

# Train the model
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


Accuracy: 0.9879310344827587
Classification Report:
               precision    recall  f1-score   support

           0       0.98      1.00      0.99       779
           1       1.00      0.96      0.98       381

    accuracy                           0.99      1160
   macro avg       0.99      0.98      0.99      1160
weighted avg       0.99      0.99      0.99      1160

Confusion Matrix:
 [[779   0]
 [ 14 367]]
