In [33]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

from sklearn.naive_bayes import MultinomialNB
from sklearn.cluster import KMeans
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

In [34]:
df = pd.read_csv("Data/enron6.csv")

In [35]:
df.info

<bound method DataFrame.info of      label  label_num                                               text
0      ham          0  subject tw weekly 10 31 00 please see attached...
1     spam          1  subject re your woman needs an 8 inch man be t...
2     spam          1  subject bait excelled em cas outdoor adventure...
3     spam          1  subject notice you ve won claim your money now...
4     spam          1  subject largment nxeonu virility patch rx male...
...    ...        ...                                                ...
5995  spam          1  subject get discount drugs without prescriptio...
5996   ham          0  subject rev em up enron you re invited to a ca...
5997  spam          1  subject spotlight on the specialty store for y...
5998  spam          1     subject enlarge your bre 4 sts now unsubscribe
5999  spam          1  subject ears tussioneex ambl 1 en xanaax 11 ag...

[6000 rows x 3 columns]>

In [36]:
# 2. Eğitim ve Test Setlerini Ayır
train_texts, test_texts, train_labels, test_labels = train_test_split(
    df["text"], df["label"], test_size=0.2, random_state=42
)

In [41]:
# 3. TF-IDF ile Vektörleştirme
tfidf_vectorizer = TfidfVectorizer(max_features=500)
X_train = tfidf_vectorizer.fit_transform(train_texts)
X_test = tfidf_vectorizer.transform(test_texts)

In [42]:
print(test_labels)

1782    spam
3917    spam
221     spam
2135     ham
5224    spam
        ... 
2543     ham
1773    spam
2846    spam
5799     ham
1765     ham
Name: label, Length: 1200, dtype: object


In [8]:
# 5. Etiket Eşleştirme
# Gerçek etiketleri encode et
label_encoder = LabelEncoder()
true_labels_encoded = label_encoder.fit_transform(train_labels)


### K-Means

In [29]:
# 4. K-Means ile Kümeleme
num_clusters = 2  # Spam ve ham için 2 küme
kmeans = KMeans(n_clusters=num_clusters, random_state=42)
kmeans.fit(X_train,true_labels_encoded)


In [30]:
# Confusion Matrix üzerinden eşleştirme yap
train_clusters = kmeans.predict(X_train)
cm = confusion_matrix(true_labels_encoded, train_clusters)
mapping = cm.argmax(axis=0)  # Küme -> Gerçek Etiket Eşleştirme

In [31]:
# Test Verilerini Tahmin Et
predicted_clusters = [mapping[label] for label in kmeans.predict(X_test)]

In [32]:
# Test etiketlerini encode et
test_labels_encoded = label_encoder.transform(test_labels)

In [25]:
# 6. Başarı Ölçümü
accuracy = accuracy_score(test_labels_encoded, predicted_clusters)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(test_labels_encoded, predicted_clusters))


Accuracy: 71.58%

Classification Report:
              precision    recall  f1-score   support

           0       0.00      0.00      0.00       341
           1       0.72      1.00      0.83       859

    accuracy                           0.72      1200
   macro avg       0.36      0.50      0.42      1200
weighted avg       0.51      0.72      0.60      1200



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Naive Bayes

In [26]:
# Naive Bayes sınıflandırıcı
naive_bayes = MultinomialNB()
naive_bayes.fit(X_train, train_labels)


In [27]:
# Test verisiyle tahmin yap
predicted_labels = naive_bayes.predict(X_test)


In [28]:
# Başarı ölçümü
accuracy = accuracy_score(test_labels, predicted_labels)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(test_labels, predicted_labels))


Accuracy: 95.33%

Classification Report:
              precision    recall  f1-score   support

         ham       0.99      0.85      0.91       341
        spam       0.94      1.00      0.97       859

    accuracy                           0.95      1200
   macro avg       0.96      0.92      0.94      1200
weighted avg       0.96      0.95      0.95      1200



### SVM

In [82]:
# SVM sınıflandırıcı
svm = SVC(kernel='linear')  # Lineer kernel ile SVM modelini oluştur
svm.fit(X_train, train_labels)


In [83]:
# Test verisiyle tahmin yap
predicted_labels = svm.predict(X_test)


In [84]:
# Başarı ölçümü
accuracy = accuracy_score(test_labels, predicted_labels)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(test_labels, predicted_labels))


Accuracy: 97.58%

Classification Report:
              precision    recall  f1-score   support

         ham       0.99      0.92      0.96       341
        spam       0.97      1.00      0.98       859

    accuracy                           0.98      1200
   macro avg       0.98      0.96      0.97      1200
weighted avg       0.98      0.98      0.98      1200



### Random Forest

In [85]:
# Random Forest sınıflandırıcı
random_forest = RandomForestClassifier(n_estimators=100, random_state=42)
random_forest.fit(X_train, train_labels)


In [86]:
# Test verisiyle tahmin yap
predicted_labels = random_forest.predict(X_test)

In [87]:
# Başarı ölçümü
accuracy = accuracy_score(test_labels, predicted_labels)
print(f"Accuracy: {accuracy * 100:.2f}%")
print("\nClassification Report:")
print(classification_report(test_labels, predicted_labels))


Accuracy: 96.50%

Classification Report:
              precision    recall  f1-score   support

         ham       0.97      0.90      0.94       341
        spam       0.96      0.99      0.98       859

    accuracy                           0.96      1200
   macro avg       0.97      0.95      0.96      1200
weighted avg       0.97      0.96      0.96      1200

