In [16]:
import os
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from langdetect import detect, DetectorFactory
import nltk

# NLTK ve LangDetect ayarları
nltk.download('stopwords')
nltk.download('punkt')
DetectorFactory.seed = 0  # LangDetect için deterministik sonuç

# Metin temizleme fonksiyonu
def clean_text(text):
    # Noktalama işaretlerini ve özel karakterleri kaldır
    text = re.sub(r'[^\w\s]', '', text)
    # Sayıları kaldır
    text = re.sub(r'\d+', '', text)
    # Küçük harfe çevir
    text = text.lower()
    # Durak kelimeleri kaldır
    stop_words = set(stopwords.words('english'))
    tokens = word_tokenize(text)
    filtered_tokens = [word for word in tokens if word not in stop_words]
    return ' '.join(filtered_tokens)

# Eğitim veri setinin yolu
articles = []  # Initialize the list to store article data
for j in range(1, 21):
    if j < 10:
        training_dataset_path = f'pan18-cross-domain-authorship-attribution-test-dataset2-2018-04-20/problem0000{j}'
    else:
        training_dataset_path = f'pan18-cross-domain-authorship-attribution-test-dataset2-2018-04-20/problem000{j}'

    # Tüm problem klasörlerini listeleme
    problems = [os.path.join(training_dataset_path, d) for d in os.listdir(training_dataset_path) if os.path.isdir(os.path.join(training_dataset_path, d))]
    print(f"Problem Klasörleri: {problems}")  # Tüm klasörleri göster

    # İlk problem klasöründen dosyaları okuma
    for i in range(0, len(problems)):
        problem_path = problems[i]
        files = os.listdir(problem_path)
        print(f"Problemdeki Dosyalar: {files}")

        # Makale içeriği, yazar ve dil bilgisi
        for file_name in files:
            file_path = os.path.join(problem_path, file_name)
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            
            # Temizleme işlemi
            cleaned_content = clean_text(content)
            
            # Yazar bilgisini almak
            author = problem_path.split(os.sep)[-1]  # Dosya yolundan yazar ismini almak
            
            # Dil algılama
            try:
                language = detect(content)
            except Exception:
                language = "unknown"  # Dil algılanamazsa "unknown" olarak işaretle
            
            # Veri setine ekle
            articles.append({"file_name": file_name, "content": cleaned_content, "author": author, "language": language})

# Convert the list of articles to a DataFrame
df = pd.DataFrame(articles)

# Print an example of the cleaned text
print(f"Örnek Temizlenmiş Metin: {df['content'][0][:100]}")  # İlk 100 karakteri göster

# Verinin ilk 5 satırını göster
print(df.head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Problem Klasörleri: ['pan18-cross-domain-authorship-attribution-test-dataset2-2018-04-20/problem00001\\candidate00001', 'pan18-cross-domain-authorship-attribution-test-dataset2-2018-04-20/problem00001\\candidate00002', 'pan18-cross-domain-authorship-attribution-test-dataset2-2018-04-20/problem00001\\candidate00003', 'pan18-cross-domain-authorship-attribution-test-dataset2-2018-04-20/problem00001\\candidate00004', 'pan18-cross-domain-authorship-attribution-test-dataset2-2018-04-20/problem00001\\candidate00005', 'pan18-cross-domain-authorship-attribution-test-dataset2-2018-04-20/problem00001\\candidate00006', 'pan18-cross-domain-authorship-attribution-test-dataset2-2018-04-20/problem00001\\candidate00007', 'pan18-cross-domain-authorship-attribution-test-dataset2-2018-04-20/problem00001\\candidate00008', 'pan18-cross-domain-authorship-attribution-test-dataset2-2018-04-20/problem00001\\candidate00009', 'pan18-cross-domain-authorship-attribution-test-dataset2-2018-04-20/problem00001\\candid

In [17]:
import os
import pandas as pd
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from langdetect import detect, DetectorFactory
import nltk

# NLTK ve LangDetect ayarları
nltk.download('stopwords')
nltk.download('punkt')
DetectorFactory.seed = 0  # LangDetect için deterministik sonuç

# Stopwords listesi (desteklenen diller için)
stopwords_dict = {
    "en": set(stopwords.words('english')),
    "fr": set(stopwords.words('french')),
    "it": set(stopwords.words('italian')),
    "es": set(stopwords.words('spanish')),
    "pl": {
        "i", "w", "na", "do", "po", "z", "za", "że", "nie", "tak", "jak", "czy", "ale", "lub",
        "od", "dla", "być", "przez", "ze", "ten", "to", "te", "może", "też", "tylko", "jeszcze",
        "już", "więc", "oni", "ona", "ono", "on", "ja", "ty", "my", "wy", "ich", "go", "mu", "jej",
        "jaki", "kiedy", "gdzie", "dlaczego", "co", "kto", "kogo", "czego", "komu", "kim", "czym",
        "jaka", "która", "który", "którzy", "które", "ich", "ich", "są", "był", "była", "było"
    }
}

# Dil bazlı metin temizleme fonksiyonu
def clean_text_by_language(text, lang):
    # Noktalama işaretlerini ve özel karakterleri kaldır
    text = re.sub(r'[^\w\s]', '', text)
    # Sayıları kaldır
    text = re.sub(r'\d+', '', text)
    # Küçük harfe çevir
    text = text.lower()
    # Tokenize et
    tokens = word_tokenize(text)
    # Stopwords kaldır
    if lang in stopwords_dict:
        tokens = [word for word in tokens if word not in stopwords_dict[lang]]
    return ' '.join(tokens)

# Eğitim veri setinin yolu
articles = []  # Makale verilerini depolamak için liste
for j in range(1, 21):
    if j < 10:
        training_dataset_path = f'pan18-cross-domain-authorship-attribution-test-dataset2-2018-04-20/problem0000{j}'
    else:
        training_dataset_path = f'pan18-cross-domain-authorship-attribution-test-dataset2-2018-04-20/problem000{j}'

    # Tüm problem klasörlerini listeleme
    problems = [os.path.join(training_dataset_path, d) for d in os.listdir(training_dataset_path) if os.path.isdir(os.path.join(training_dataset_path, d))]
    print(f"Problem Klasörleri: {problems}")  # Tüm klasörleri göster

    # Problem klasörlerinden dosyaları okuma
    for i in range(0, len(problems)):
        problem_path = problems[i]
        files = os.listdir(problem_path)
        print(f"Problemdeki Dosyalar: {files}")

        # Makale içeriği, yazar ve dil bilgisi
        for file_name in files:
            file_path = os.path.join(problem_path, file_name)
            with open(file_path, 'r', encoding='utf-8') as f:
                content = f.read()
            
            # Dil algılama
            try:
                language = detect(content)
            except Exception:
                language = "unknown"  # Dil algılanamazsa "unknown" olarak işaretle
            
            # Desteklenen dillerde temizleme işlemi
            if language in stopwords_dict:
                cleaned_content = clean_text_by_language(content, language)
            else:
                cleaned_content = "unsupported_language"

            # Yazar bilgisini almak
            author = problem_path.split(os.sep)[-1]  # Dosya yolundan yazar ismini almak
            
            # Veri setine ekle
            articles.append({"file_name": file_name, "content": cleaned_content, "author": author, "language": language})

# Veri çerçevesine dönüştürme
df = pd.DataFrame(articles)

# Bilinmeyen veya desteklenmeyen dilleri temizleme
df = df[(df['language'] != 'unknown') & (df['content'] != "unsupported_language")]

# Örnek temizlenmiş metin gösterme
print(f"Örnek Temizlenmiş Metin: {df['content'].iloc[0][:100]}")  # İlk 100 karakteri göster

# Verinin ilk 5 satırını göster
print(df.head())

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\acer\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Problem Klasörleri: ['pan18-cross-domain-authorship-attribution-test-dataset2-2018-04-20/problem00001\\candidate00001', 'pan18-cross-domain-authorship-attribution-test-dataset2-2018-04-20/problem00001\\candidate00002', 'pan18-cross-domain-authorship-attribution-test-dataset2-2018-04-20/problem00001\\candidate00003', 'pan18-cross-domain-authorship-attribution-test-dataset2-2018-04-20/problem00001\\candidate00004', 'pan18-cross-domain-authorship-attribution-test-dataset2-2018-04-20/problem00001\\candidate00005', 'pan18-cross-domain-authorship-attribution-test-dataset2-2018-04-20/problem00001\\candidate00006', 'pan18-cross-domain-authorship-attribution-test-dataset2-2018-04-20/problem00001\\candidate00007', 'pan18-cross-domain-authorship-attribution-test-dataset2-2018-04-20/problem00001\\candidate00008', 'pan18-cross-domain-authorship-attribution-test-dataset2-2018-04-20/problem00001\\candidate00009', 'pan18-cross-domain-authorship-attribution-test-dataset2-2018-04-20/problem00001\\candid

In [2]:
df['language'].value_counts()

language
es    681
pl    650
fr    588
it    587
en    559
Name: count, dtype: int64

1. Word2Vec

In [18]:
from gensim.models import Word2Vec
import numpy as np
from nltk.tokenize import word_tokenize

# Word2Vec için veri hazırlama
def prepare_sentences(df):
    sentences = []
    for content in df['content']:
        tokens = word_tokenize(content)
        sentences.append(tokens)
    return sentences

# Word2Vec modelini eğitme
def train_word2vec(sentences, vector_size=100, window=5, min_count=2, workers=4):
    model = Word2Vec(sentences, vector_size=vector_size, window=window, min_count=min_count, workers=workers)
    return model

# Veri setinden cümleleri hazırlama
sentences = prepare_sentences(df)

# Word2Vec modelini eğitme
word2vec_model = train_word2vec(sentences)

# Örnek: "example" kelimesinin vektörünü alma
if "example" in word2vec_model.wv:
    example_vector = word2vec_model.wv["example"]
    print(f"'example' kelimesinin embedding vektörü:\n{example_vector}")

# Makale bazında embedding hesaplama (ortalama vektör)
def get_document_embedding(tokens, model):
    embeddings = [model.wv[word] for word in tokens if word in model.wv]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(model.vector_size)

# Veri setindeki her makale için embedding hesaplama
df['embedding'] = df['content'].apply(lambda x: get_document_embedding(word_tokenize(x), word2vec_model))

# İlk makalenin embedding vektör boyutunu göster
print(f"İlk makale embedding vektörü boyutu: {df['embedding'][0].shape}")


İlk makale embedding vektörü boyutu: (100,)


2. BERT

In [None]:
import torch
from transformers import BertTokenizer, BertModel
import numpy as np
from nltk.tokenize import word_tokenize

# BERT modelini ve tokenizer'ı yükle
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# BERT'e metin verisi iletmek ve gömmeleri almak için yardımcı fonksiyon
def get_bert_embeddings(text):
    # Metni tokenize et
    inputs = tokenizer(text, return_tensors='pt', truncation=True, padding=True, max_length=512)
    
    # BERT modelinden çıktı al
    with torch.no_grad():
        outputs = model(**inputs)
    
    # Elde edilen gömme (embedding) çıktısı, son katmanından alınır
    last_hidden_state = outputs.last_hidden_state  # (batch_size, sequence_length, hidden_size)
    
    # Kelime bazında gömmeleri al
    embeddings = last_hidden_state[0].numpy()  # (sequence_length, hidden_size)
    
    return embeddings

# Makale bazında embedding hesaplama (ortalama vektör)
def get_document_embedding(tokens, model):
    embeddings = get_bert_embeddings(" ".join(tokens))
    if embeddings.any():
        # Her cümle için kelimelerin embedding'lerinin ortalamasını al
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(model.config.hidden_size)  # BERT'in hidden size'ı 768

# Veri setindeki her makale için embedding hesaplama
df['embedding'] = df['content'].apply(lambda x: get_document_embedding(word_tokenize(x), model))

# İlk makalenin embedding vektör boyutunu göster
print(f"İlk makale embedding vektörü boyutu: {df['embedding'][0].shape}")

# Örnek: İlk makalenin embedding vektörünü yazdır
print(f"İlk makale embedding vektörü:\n{df['embedding'][0]}")


Random Forest

In [9]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
import numpy as np

# Veri setindeki 'embedding' sütunlarını numpy dizisine çevirme
X = np.vstack(df['embedding'].values)
y = df['author']  # Burada etiketlerin 'label' sütununda olduğunu varsayıyoruz

# Eğitim ve test setine ayırma
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Random Forest modelini eğitme
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)

# Test setinde tahmin yapma
y_pred = clf.predict(X_test)

# Sonuçları değerlendirme
print("Sınıflandırma Raporu:")
print(classification_report(y_test, y_pred))


Sınıflandırma Raporu:
                precision    recall  f1-score   support

candidate00001       0.00      0.00      0.00        41
candidate00002       0.18      0.20      0.19        44
candidate00003       0.14      0.16      0.15        43
candidate00004       0.11      0.09      0.10        46
candidate00005       0.28      0.23      0.25        44
candidate00006       0.17      0.11      0.13        37
candidate00007       0.21      0.19      0.20        32
candidate00008       0.00      0.00      0.00        32
candidate00009       0.03      0.03      0.03        32
candidate00010       0.13      0.17      0.15        29
candidate00011       0.00      0.00      0.00        17
candidate00012       0.26      0.21      0.23        24
candidate00013       0.04      0.04      0.04        23
candidate00014       0.00      0.00      0.00        25
candidate00015       0.00      0.00      0.00        20
candidate00016       0.00      0.00      0.00         7
candidate00017       0.10

1. Logistic Regression

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

# Logistic Regression modelini eğitme
logreg = LogisticRegression(max_iter=1000, random_state=42)
logreg.fit(X_train, y_train)

# Test setinde tahmin yapma
y_pred_logreg = logreg.predict(X_test)

# Sonuçları değerlendirme
print("Logistic Regression Sınıflandırma Raporu:")
print(classification_report(y_test, y_pred_logreg))


Logistic Regression Sınıflandırma Raporu:
                precision    recall  f1-score   support

candidate00001       0.00      0.00      0.00        41
candidate00002       0.00      0.00      0.00        44
candidate00003       0.00      0.00      0.00        43
candidate00004       0.00      0.00      0.00        46
candidate00005       0.08      0.02      0.04        44
candidate00006       0.00      0.00      0.00        37
candidate00007       0.00      0.00      0.00        32
candidate00008       0.00      0.00      0.00        32
candidate00009       0.00      0.00      0.00        32
candidate00010       0.00      0.00      0.00        29
candidate00011       0.00      0.00      0.00        17
candidate00012       0.00      0.00      0.00        24
candidate00013       0.00      0.00      0.00        23
candidate00014       0.00      0.00      0.00        25
candidate00015       0.00      0.00      0.00        20
candidate00016       0.00      0.00      0.00         7
candi

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


2. Support Vector Machine (SVM)

In [7]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report

# SVM modelini eğitme
svm = SVC(kernel='linear', random_state=42)
svm.fit(X_train, y_train)

# Test setinde tahmin yapma
y_pred_svm = svm.predict(X_test)

# Sonuçları değerlendirme
print("Support Vector Machine Sınıflandırma Raporu:")
print(classification_report(y_test, y_pred_svm))


Support Vector Machine Sınıflandırma Raporu:
                precision    recall  f1-score   support

candidate00001       0.00      0.00      0.00        41
candidate00002       0.00      0.00      0.00        44
candidate00003       0.00      0.00      0.00        43
candidate00004       0.00      0.00      0.00        46
candidate00005       0.00      0.00      0.00        44
candidate00006       0.00      0.00      0.00        37
candidate00007       0.00      0.00      0.00        32
candidate00008       0.00      0.00      0.00        32
candidate00009       0.00      0.00      0.00        32
candidate00010       0.00      0.00      0.00        29
candidate00011       0.00      0.00      0.00        17
candidate00012       0.00      0.00      0.00        24
candidate00013       0.00      0.00      0.00        23
candidate00014       0.00      0.00      0.00        25
candidate00015       0.00      0.00      0.00        20
candidate00016       0.00      0.00      0.00         7
ca

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


3. K-Nearest Neighbors (KNN)

In [8]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import classification_report

# KNN modelini eğitme
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

# Test setinde tahmin yapma
y_pred_knn = knn.predict(X_test)

# Sonuçları değerlendirme
print("K-Nearest Neighbors Sınıflandırma Raporu:")
print(classification_report(y_test, y_pred_knn))


K-Nearest Neighbors Sınıflandırma Raporu:
                precision    recall  f1-score   support

candidate00001       0.04      0.10      0.06        41
candidate00002       0.07      0.11      0.08        44
candidate00003       0.05      0.07      0.06        43
candidate00004       0.05      0.04      0.04        46
candidate00005       0.14      0.14      0.14        44
candidate00006       0.00      0.00      0.00        37
candidate00007       0.11      0.06      0.08        32
candidate00008       0.04      0.03      0.04        32
candidate00009       0.00      0.00      0.00        32
candidate00010       0.07      0.03      0.05        29
candidate00011       0.00      0.00      0.00        17
candidate00012       0.11      0.04      0.06        24
candidate00013       0.00      0.00      0.00        23
candidate00014       0.00      0.00      0.00        25
candidate00015       0.08      0.05      0.06        20
candidate00016       0.00      0.00      0.00         7
candi

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


4. Gradient Boosting (GBM)

In [9]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report

# Gradient Boosting modelini eğitme
gbm = GradientBoostingClassifier(random_state=42)
gbm.fit(X_train, y_train)

# Test setinde tahmin yapma
y_pred_gbm = gbm.predict(X_test)

# Sonuçları değerlendirme
print("Gradient Boosting Sınıflandırma Raporu:")
print(classification_report(y_test, y_pred_gbm))


Gradient Boosting Sınıflandırma Raporu:
                precision    recall  f1-score   support

candidate00001       0.06      0.05      0.05        41
candidate00002       0.18      0.16      0.17        44
candidate00003       0.15      0.12      0.13        43
candidate00004       0.07      0.04      0.05        46
candidate00005       0.21      0.14      0.17        44
candidate00006       0.25      0.14      0.18        37
candidate00007       0.30      0.19      0.23        32
candidate00008       0.00      0.00      0.00        32
candidate00009       0.04      0.03      0.04        32
candidate00010       0.09      0.10      0.10        29
candidate00011       0.00      0.00      0.00        17
candidate00012       0.10      0.08      0.09        24
candidate00013       0.00      0.00      0.00        23
candidate00014       0.00      0.00      0.00        25
candidate00015       0.03      0.05      0.04        20
candidate00016       0.00      0.00      0.00         7
candida

5. XGBoost

In [10]:
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report
import xgboost as xgb
import pandas as pd
from nltk.tokenize import word_tokenize

# Word2Vec modelini ve embedding işlemini tekrar eden kısımlar
# (Sizden sağlanan kodlar bu kısımda yer alıyor)
from gensim.models import Word2Vec
import numpy as np

def prepare_sentences(df):
    sentences = []
    for content in df['content']:
        tokens = word_tokenize(content)
        sentences.append(tokens)
    return sentences

def train_word2vec(sentences, vector_size=100, window=5, min_count=2, workers=4):
    model = Word2Vec(sentences, vector_size=vector_size, window=window, min_count=min_count, workers=workers)
    return model

def get_document_embedding(tokens, model):
    embeddings = [model.wv[word] for word in tokens if word in model.wv]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(model.vector_size)

# Veri hazırlığı
sentences = prepare_sentences(df)
word2vec_model = train_word2vec(sentences)

# Makale bazında embedding hesaplama
df['embedding'] = df['content'].apply(lambda x: get_document_embedding(word_tokenize(x), word2vec_model))

# X ve y oluşturma
X = np.array(df['embedding'].tolist())
y = df['author']  # Hedef değişken

# Veriyi eğitim ve test setlerine ayırma
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# LabelEncoder ile hedef değişkeni sayısallaştırma
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# XGBoost modelini eğitme
xgb_model = xgb.XGBClassifier(random_state=42)
xgb_model.fit(X_train, y_train_encoded)

# Test setinde tahmin yapma
y_pred_xgb = xgb_model.predict(X_test)

# Tahmin edilen sınıfları orijinal etiketlere geri çevirme
y_pred_xgb_decoded = label_encoder.inverse_transform(y_pred_xgb)

# Sonuçları değerlendirme
print("XGBoost Sınıflandırma Raporu:")
print(classification_report(y_test, y_pred_xgb_decoded))


XGBoost Sınıflandırma Raporu:
                precision    recall  f1-score   support

candidate00001       0.04      0.04      0.04        25
candidate00002       0.10      0.12      0.11        26
candidate00003       0.10      0.07      0.08        27
candidate00004       0.09      0.06      0.07        34
candidate00005       0.12      0.12      0.12        26
candidate00006       0.24      0.17      0.20        24
candidate00007       0.13      0.15      0.14        20
candidate00008       0.00      0.00      0.00        25
candidate00009       0.00      0.00      0.00        20
candidate00010       0.06      0.13      0.09        15
candidate00011       0.00      0.00      0.00        12
candidate00012       0.25      0.20      0.22        20
candidate00013       0.00      0.00      0.00        12
candidate00014       0.00      0.00      0.00        15
candidate00015       0.07      0.08      0.07        13
candidate00016       0.00      0.00      0.00         3
candidate00017   

6. Neural Networks (MLPClassifier)

In [10]:
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report

# Neural Network modelini eğitme
mlp = MLPClassifier(random_state=42)
mlp.fit(X_train, y_train)

# Test setinde tahmin yapma
y_pred_mlp = mlp.predict(X_test)

# Sonuçları değerlendirme
print("Neural Network (MLP) Sınıflandırma Raporu:")
print(classification_report(y_test, y_pred_mlp))


Neural Network (MLP) Sınıflandırma Raporu:
                precision    recall  f1-score   support

candidate00001       0.00      0.00      0.00        41
candidate00002       0.15      0.05      0.07        44
candidate00003       0.08      0.02      0.04        43
candidate00004       0.09      0.09      0.09        46
candidate00005       0.00      0.00      0.00        44
candidate00006       0.00      0.00      0.00        37
candidate00007       0.00      0.00      0.00        32
candidate00008       0.08      0.03      0.04        32
candidate00009       0.00      0.00      0.00        32
candidate00010       0.00      0.00      0.00        29
candidate00011       0.00      0.00      0.00        17
candidate00012       0.00      0.00      0.00        24
candidate00013       0.00      0.00      0.00        23
candidate00014       0.00      0.00      0.00        25
candidate00015       0.00      0.00      0.00        20
candidate00016       0.00      0.00      0.00         7
cand

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


7. Naive Bayes

In [19]:
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
import numpy as np
from nltk.tokenize import word_tokenize

# Word2Vec ve embedding işlemleri
from gensim.models import Word2Vec

def prepare_sentences(df):
    sentences = []
    for content in df['content']:
        tokens = word_tokenize(content)
        sentences.append(tokens)
    return sentences

def train_word2vec(sentences, vector_size=100, window=5, min_count=2, workers=4):
    model = Word2Vec(sentences, vector_size=vector_size, window=window, min_count=min_count, workers=workers)
    return model

def get_document_embedding(tokens, model):
    embeddings = [model.wv[word] for word in tokens if word in model.wv]
    if embeddings:
        return np.mean(embeddings, axis=0)
    else:
        return np.zeros(model.vector_size)

# Veri hazırlığı
sentences = prepare_sentences(df)
word2vec_model = train_word2vec(sentences)

# Makale bazında embedding hesaplama
df['embedding'] = df['content'].apply(lambda x: get_document_embedding(word_tokenize(x), word2vec_model))

# X ve y oluşturma
X = np.array(df['embedding'].tolist())
y = df['author']  # Hedef değişken

# Embedding vektörlerini normalleştirme (pozitif değerler gereklidir)
scaler = MinMaxScaler()
X = scaler.fit_transform(X)

# Veriyi eğitim ve test setlerine ayırma
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# LabelEncoder ile hedef değişkeni sayısallaştırma
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)
y_test_encoded = label_encoder.transform(y_test)

# Naive Bayes modelini eğitme
nb = MultinomialNB()
nb.fit(X_train, y_train_encoded)

# Test setinde tahmin yapma
y_pred_nb = nb.predict(X_test)

# Tahmin edilen sınıfları orijinal etiketlere geri çevirme
y_pred_nb_decoded = label_encoder.inverse_transform(y_pred_nb)

# Sonuçları değerlendirme
print("Naive Bayes Sınıflandırma Raporu:")
print(classification_report(y_test, y_pred_nb_decoded))


Naive Bayes Sınıflandırma Raporu:
                precision    recall  f1-score   support

candidate00001       0.00      0.00      0.00        25
candidate00002       0.00      0.00      0.00        26
candidate00003       0.00      0.00      0.00        27
candidate00004       0.00      0.00      0.00        34
candidate00005       0.00      0.00      0.00        26
candidate00006       0.00      0.00      0.00        24
candidate00007       0.00      0.00      0.00        20
candidate00008       0.00      0.00      0.00        25
candidate00009       0.00      0.00      0.00        20
candidate00010       0.00      0.00      0.00        15
candidate00011       0.00      0.00      0.00        12
candidate00012       0.00      0.00      0.00        20
candidate00013       0.00      0.00      0.00        12
candidate00014       0.00      0.00      0.00        15
candidate00015       0.00      0.00      0.00        13
candidate00016       0.00      0.00      0.00         3
candidate0001

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


In [14]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Özellikler (X) ve etiketler (y) belirleme
X = df['embedding'].apply(lambda x: x.flatten()).to_list()  # embedding vektörlerini kullanıyoruz
y = df['author']  # Yazarları etiket olarak kullanıyoruz

# Eğitim ve test veri setlerine ayırma
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Modeli oluşturma (Decision Tree)
dt_model = DecisionTreeClassifier(random_state=42)

# Modeli eğitme
dt_model.fit(X_train, y_train)

# Test veri seti ile tahmin yapma
y_pred = dt_model.predict(X_test)

# Sonuçları değerlendirme
accuracy = accuracy_score(y_test, y_pred)
print(f"Decision Tree Modeli Başarı Oranı: {accuracy * 100:.2f}%")

# Detaylı sınıflandırma raporu
print("Sınıflandırma Raporu:")
print(classification_report(y_test, y_pred))

Decision Tree Modeli Başarı Oranı: 61.04%
Sınıflandırma Raporu:
                precision    recall  f1-score   support

candidate00001       0.05      0.08      0.06        39
candidate00002       0.05      0.05      0.05        37
candidate00003       0.03      0.02      0.02        51
candidate00004       0.07      0.07      0.07        40
candidate00005       0.05      0.07      0.06        40
candidate00006       0.04      0.09      0.05        23
candidate00007       0.06      0.08      0.07        48
candidate00008       0.04      0.05      0.04        39
candidate00009       0.05      0.05      0.05        41
       unknown       0.83      0.77      0.80      1236

      accuracy                           0.61      1594
     macro avg       0.13      0.13      0.13      1594
  weighted avg       0.65      0.61      0.63      1594



In [11]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report

# Özellikler (X) ve etiketler (y) belirleme
X = df['embedding'].apply(lambda x: x.flatten()).to_list()  # embedding vektörlerini kullanıyoruz
y = df['author']  # Yazarları etiket olarak kullanıyoruz

# Eğitim ve test veri setlerine ayırma
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Modeli oluşturma (Decision Tree)
dt_model = DecisionTreeClassifier(random_state=42)

# Modeli eğitme
dt_model.fit(X_train, y_train)

# Test veri seti ile tahmin yapma
y_pred = dt_model.predict(X_test)

# Sonuçları değerlendirme
accuracy = accuracy_score(y_test, y_pred)
print(f"Decision Tree Modeli Başarı Oranı: {accuracy * 100:.2f}%")

# Detaylı sınıflandırma raporu
print("Sınıflandırma Raporu:")
print(classification_report(y_test, y_pred))

Decision Tree Modeli Başarı Oranı: 44.02%
Sınıflandırma Raporu:
                precision    recall  f1-score   support

candidate00001       0.02      0.02      0.02        41
candidate00002       0.12      0.18      0.14        44
candidate00003       0.11      0.16      0.13        43
candidate00004       0.08      0.11      0.10        46
candidate00005       0.23      0.25      0.24        44
candidate00006       0.11      0.08      0.09        37
candidate00007       0.12      0.09      0.11        32
candidate00008       0.00      0.00      0.00        32
candidate00009       0.05      0.03      0.04        32
candidate00010       0.12      0.17      0.14        29
candidate00011       0.00      0.00      0.00        17
candidate00012       0.25      0.17      0.20        24
candidate00013       0.00      0.00      0.00        23
candidate00014       0.00      0.00      0.00        25
candidate00015       0.00      0.00      0.00        20
candidate00016       0.00      0.00    