In [229]:
import pandas as pd
import numpy as np

df_emails=pd.read_csv('./datasets/emails.csv')
df_emails['sentences'] = df_emails['text'].str.split('\n')
df_emails['sentences']=df_emails['sentences'].apply(lambda x: [i.split('\t') for i in x if len(i)>0])
df_emails['sentences']=df_emails['sentences'].apply(lambda x:' '.join(map(' '.join,x)))
df_emails['sentences']=df_emails['sentences'].apply(lambda x:x.replace('\ufeff',''))


In [None]:
from hazm import Normalizer, word_tokenize, stopwords_list, Stemmer
normalizer = Normalizer()
stemmer = Stemmer()
def preprocess_text(text):
    # Normalize text
    text = normalizer.normalize(text)
    
    # Tokenize text
    tokens = word_tokenize(text)
    
    # Remove punctuation and stopwords, and perform stemming
    cleaned_tokens = [stemmer.stem(token) for token in tokens if token not in stopwords_list() and token.isalnum()]
    
    return cleaned_tokens

# Apply preprocessing to each email in the dataset
df_emails['cleaned_text'] = df_emails['sentences'].apply(preprocess_text)

In [231]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Convert text data into numerical features using TF-IDF vectorization
tfidf_vectorizer = TfidfVectorizer(analyzer='word', tokenizer=lambda x: x, preprocessor=lambda x: x, token_pattern=None)
X_train_tfidf = tfidf_vectorizer.fit_transform(X)
X_train, X_test, Y_train, Y_test = train_test_split(X_train_tfidf, Y, test_size=0.3, random_state=42)


# Train KNN classifiers for different values of K
k_values = list(range(1, 21))
accuracy_scores = []

for k in k_values:
    knn_classifier = KNeighborsClassifier(n_neighbors=k)
    knn_classifier.fit(X_train, Y_train)
    Y_pred = knn_classifier.predict(X_test)
    accuracy = accuracy_score(Y_test, Y_pred)
    accuracy_scores.append(accuracy)

# Find the optimal value of K based on the highest accuracy
optimal_k = k_values[accuracy_scores.index(max(accuracy_scores))]

print("Optimal value of K:", optimal_k)
print("Accuracy:", max(accuracy_scores))

Optimal value of K: 6
Accuracy: 0.9766666666666667


In [232]:
from sklearn.model_selection import cross_val_score, KFold
import numpy as np

knn=KNeighborsClassifier(n_neighbors=optimal_k)
# Perform 10-fold cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)

# Define the scoring metrics
scoring = ['accuracy', 'precision', 'recall', 'f1']

# Store the scores for each metric
scores = {metric: cross_val_score(knn, X_train_tfidf, Y, cv=kf, scoring=metric) for metric in scoring}

# Compute average and standard deviation for each metric
avg_scores = {metric: np.mean(scores[metric]) for metric in scoring}
std_scores = {metric: np.std(scores[metric]) for metric in scoring}

# Print results
for metric in scoring:
    print("Average {}: {:.4f}".format(metric, avg_scores[metric]))
    print("{} Standard Deviation: {:.4f}".format(metric, std_scores[metric]))

Average accuracy: 0.9670
accuracy Standard Deviation: 0.0149
Average precision: 0.9728
precision Standard Deviation: 0.0279
Average recall: 0.9601
recall Standard Deviation: 0.0212
Average f1: 0.9660
f1 Standard Deviation: 0.0167
