<a href="https://colab.research.google.com/github/BiradarSonali/Sonali-Biradar/blob/main/Email_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import re

import nltk
import pandas as pd
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

In [None]:
# Read the labelled e-mail dataset from a CSV file located in the working directory.
data = pd.read_csv(filepath_or_buffer='spam_assassin.csv')
# Preview the first five rows as a quick sanity check of the loaded columns.
data.head(5)

Unnamed: 0,text,target
0,From ilug-admin@linux.ie Mon Jul 29 11:28:02 2...,0
1,From gort44@excite.com Mon Jun 24 17:54:21 200...,1
2,From fork-admin@xent.com Mon Jul 29 11:39:57 2...,1
3,From dcm123@btamail.net.cn Mon Jun 24 17:49:23...,1
4,From ilug-admin@linux.ie Mon Aug 19 11:02:47 2...,0


In [None]:
# Fetch the NLTK resources the preprocessing cells rely on.
nltk.download('stopwords')   # English stop-word list read via stopwords.words('english')
nltk.download('wordnet')     # lexical database behind WordNetLemmatizer
nltk.download('punkt')       # tokenizer models used by nltk.word_tokenize on older NLTK releases
# Recent NLTK releases load the tokenizer models from 'punkt_tab' instead of 'punkt';
# without it nltk.word_tokenize raises LookupError there.
nltk.download('punkt_tab')
nltk.download('omw-1.4')     # Open Multilingual WordNet data (presumably needed alongside 'wordnet' -- confirm)

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\tusha\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\tusha\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\tusha\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\tusha\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [None]:
# Lemmatizer instance reused for every token during preprocessing.
lemmatizer = WordNetLemmatizer()
# English stop words held in a set so membership tests stay cheap.
stop_words = {word for word in stopwords.words('english')}

##### In the preprocessing step, we use regular expressions to remove email headers, email addresses, URLs, special symbols, and numbers. After this, we remove the stop words and lemmatize the remaining words so that the text is normalized for further processing.

In [None]:
def preprocess_text(text):
    """Turn one raw e-mail into a space-separated string of lemmatized tokens.

    The text is lowercased, stripped of "from" lines, e-mail addresses, URLs and
    every character other than a-z or whitespace, then tokenized; English stop
    words are dropped and the remaining tokens are lemmatized.

    Relies on the module-level ``stop_words`` set and ``lemmatizer`` instance.

    Parameters
    ----------
    text : str
        Raw e-mail text. Any non-string value (for example the NaN that pandas
        produces for an empty CSV cell) is treated as an empty document.

    Returns
    -------
    str
        The cleaned tokens joined by single spaces; ``''`` for non-string input.
    """
    # Missing values arrive as float NaN / None and have no .lower(); treat them as empty
    # documents instead of letting Series.apply abort on the first bad row.
    if not isinstance(text, str):
        return ''

    # Convert to lowercase
    text = text.lower()

    # Drop every line that starts with "from", up to and including its newline.
    # A "from" line with no trailing newline is left in place by this pattern.
    text = re.sub(r'^from.*?\n', '', text, flags=re.MULTILINE)
    # Remove e-mail addresses.
    text = re.sub(r'\b[a-zA-Z0-9._%+-]+@[a-zA-Z.-]+\.[a-zA-Z]{2,}\b', '', text)
    # Remove URLs (anything starting with http / https / www up to the next whitespace).
    text = re.sub(r'http\S+|www\S+|https\S+', '', text, flags=re.MULTILINE)
    # Delete digits, punctuation and other symbols. Because they are deleted rather than
    # replaced by a space, hyphenated words collapse (e.g. "return-path" -> "returnpath").
    text = re.sub(r'[^a-z\s]', '', text)

    # Tokenize, filter stop words first, then lemmatize only the surviving tokens.
    tokens = [lemmatizer.lemmatize(word) for word in nltk.word_tokenize(text) if word not in stop_words]

    return ' '.join(tokens)

In [None]:
# Clean every e-mail element-wise and keep the result next to the raw text.
data['cleaned_text'] = data['text'].map(preprocess_text)

In [None]:
# Compare raw and cleaned text side by side for the first five rows.
data.loc[:, ['text', 'cleaned_text']].head(5)

Unnamed: 0,text,cleaned_text
0,From ilug-admin@linux.ie Mon Jul 29 11:28:02 2...,mon jul returnpath deliveredto received localh...
1,From gort44@excite.com Mon Jun 24 17:54:21 200...,mon jun returnpath deliverydate tue jun receiv...
2,From fork-admin@xent.com Mon Jul 29 11:39:57 2...,mon jul returnpath deliveredto received localh...
3,From dcm123@btamail.net.cn Mon Jun 24 17:49:23...,mon jun returnpath deliverydate mon jun receiv...
4,From ilug-admin@linux.ie Mon Aug 19 11:02:47 2...,mon aug returnpath deliveredto received localh...


#### We use a TF-IDF vectorizer to convert the text into numeric vectors that can be processed by the models for prediction. The vectorizer is built on n-grams, using both unigrams (1-grams) and bigrams (2-grams), to capture text patterns.

In [None]:
# Hold out 30% of the e-mails for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    data['cleaned_text'],
    data['target'],
    test_size=0.3,
    random_state=42,
)

# TF-IDF features over unigrams and bigrams, with document-frequency cut-offs
# at both ends (max_df=0.9, min_df=5).
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    max_df=0.9,
    min_df=5,
)
# The vectorizer is fitted on the training split only and then reused, unchanged,
# to transform the test split.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

In [None]:
# Multinomial Naive Bayes baseline trained on the TF-IDF features.
# MultinomialNB comes from sklearn.naive_bayes and must be imported before this
# cell runs, otherwise a fresh kernel fails here with NameError.
nb_classifier = MultinomialNB()
nb_classifier.fit(X_train_tfidf, y_train)
# Predict on the held-out test split and report accuracy plus per-class metrics.
y_pred_nb = nb_classifier.predict(X_test_tfidf)
print("Naive Bayes Accuracy:", accuracy_score(y_test, y_pred_nb))
print("Classification Report:\n", classification_report(y_test, y_pred_nb))

Naive Bayes Accuracy: 0.9752731454859115
Classification Report:
               precision    recall  f1-score   support

           0       0.96      1.00      0.98      1172
           1       1.00      0.92      0.96       567

    accuracy                           0.98      1739
   macro avg       0.98      0.96      0.97      1739
weighted avg       0.98      0.98      0.98      1739



In [None]:
# Logistic regression on the TF-IDF features, with the iteration cap set to 1000.
logreg = LogisticRegression(max_iter=1000)
logreg.fit(X_train_tfidf, y_train)
y_pred_logreg = logreg.predict(X_test_tfidf)

# Compute the metrics first, then print them in the same format as the other model cells.
logreg_accuracy = accuracy_score(y_test, y_pred_logreg)
logreg_report = classification_report(y_test, y_pred_logreg)
print("Logistic Regression Accuracy:", logreg_accuracy)
print("Classification Report:\n", logreg_report)

Logistic Regression Accuracy: 0.9907993099482462
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      1172
           1       1.00      0.97      0.99       567

    accuracy                           0.99      1739
   macro avg       0.99      0.99      0.99      1739
weighted avg       0.99      0.99      0.99      1739



In [None]:
# Support vector classifier with a linear kernel on the TF-IDF features.
svm = SVC(kernel='linear')
svm.fit(X_train_tfidf, y_train)
y_pred_svm = svm.predict(X_test_tfidf)

# Compute the metrics first, then print them in the same format as the other model cells.
svm_accuracy = accuracy_score(y_test, y_pred_svm)
svm_report = classification_report(y_test, y_pred_svm)
print("SVM Accuracy:", svm_accuracy)
print("Classification Report:\n", svm_report)

SVM Accuracy: 0.9953996549741231
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      1.00      1172
           1       1.00      0.99      0.99       567

    accuracy                           1.00      1739
   macro avg       1.00      0.99      0.99      1739
weighted avg       1.00      1.00      1.00      1739



In [None]:
# Random forest of 100 trees on the TF-IDF features.
# random_state is pinned (same seed as the train/test split) so that the reported
# metrics are reproducible across Restart & Run All; without it every run trains a
# different forest.
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_tfidf, y_train)
# Predict on the held-out test split and report accuracy plus per-class metrics.
y_pred_rf = rf.predict(X_test_tfidf)
print("Random Forest Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Classification Report:\n", classification_report(y_test, y_pred_rf))

Random Forest Accuracy: 0.9913743530764807
Classification Report:
               precision    recall  f1-score   support

           0       0.99      1.00      0.99      1172
           1       0.99      0.98      0.99       567

    accuracy                           0.99      1739
   macro avg       0.99      0.99      0.99      1739
weighted avg       0.99      0.99      0.99      1739



##### From the above analysis, we can say that the Support Vector Classifier and the Random Forest Classifier are the two best models for this classification task, each giving over 99% accuracy on the test set.