In [11]:
import pandas as pd
import numpy as np
from nltk.corpus import stopwords
import nltk

# Load the raw quotes dataset (absolute Colab path) into the notebook-global DF.
DF = pd.read_csv('/content/Autor_detection.csv')
# NOTE(review): this is not the last expression of the cell, so the preview is
# presumably not rendered — confirm whether it is still needed here.
DF.head()

# Download the NLTK stop-word list so that stop words do not skew the analysis.
nltk.download('stopwords')

# Number of quotes per author, plus summary statistics of that distribution.
counts = DF['Writter name'].value_counts()
print("max: ", counts.max(),"- min: ", counts.min(),"- mean: ", counts.mean() , "std: ", counts.std())

# Authors with more than 100 quotes. The print shows the whole per-author
# Series (names and counts), not just how many authors there are.
counts_greater_100 = counts[counts > 100]
print("Number of authors with more than 100 samples: ", counts_greater_100)

# Disabled variant of the same check with a threshold of 120:
# counts_greater_120 = counts[counts > 120]
# print("Number of authors with more than 120 samples: ", counts_greater_120)


AttributeError: partially initialized module 'nltk' has no attribute 'data' (most likely due to a circular import)

In [None]:
#-------------ALL FUNCTIONS--------------
def clean_and_std_data(df):
    """
    Drop missing and non-ASCII quotes and normalise the remaining quote text in place.

    NOTE: a later cell of this notebook defines another function with the same
    name; whichever definition ran last is the one in effect.

    Args:
        df (pd.DataFrame): Frame with at least a 'Quotes' column.
    Returns:
        pd.DataFrame: An independent copy of the rows whose quote is pure ASCII,
        with 'Quotes' lower-cased, stripped of every character that is not a
        letter or whitespace, and trimmed. The input frame is not modified.
    """
    df = df.dropna(subset=['Quotes'])

    # A quote counts as "English" when its text consists of ASCII characters only.
    # astype(bool) keeps the mask boolean even when there are no rows left.
    ascii_only = df['Quotes'].map(lambda quote: str(quote).isascii()).astype(bool)

    # Work on an explicit copy: assigning into the filtered slice directly is what
    # triggers pandas' SettingWithCopyWarning.
    english_quotes = df[ascii_only].copy()
    english_quotes['Quotes'] = (
        english_quotes['Quotes']
        .str.lower()
        .str.replace(r'[^a-zA-Z\s]', '', regex=True)
        .str.strip()
    )

    print(f"Orijinal veri seti büyüklüğü: {len(df)}")
    print(f"İngilizce quotes sayısı: {len(english_quotes)}")
    print(f"Elenen quote sayısı: {len(df) - len(english_quotes)}")
    return english_quotes

def get_word_freqs(df,top = 20):
    """
    Build, for every author, the `top` most frequent non-stop-word tokens.

    Args:
        df (pd.DataFrame): Frame with 'Writter name' and 'Quotes' columns.
        top (int): How many of the most frequent words to keep per author.
    Returns:
        dict: author -> pd.Series mapping word to occurrence count.
    """
    stop_set = set(stopwords.words('english'))

    def _is_content_word(token):
        # Single-character tokens and English stop words are ignored.
        return len(token) > 1 and token not in stop_set

    names = df['Writter name']
    freqs_by_author = {}
    for writer in names.unique():
        quotes = df.loc[names == writer, 'Quotes']
        tokens = list(filter(_is_content_word, ' '.join(quotes).split()))
        freqs_by_author[writer] = pd.Series(tokens).value_counts().head(top)
    return freqs_by_author

def find_longest_quote_in_df(df: pd.DataFrame):
    """
    Scan every cell of every column and return the string with the most
    whitespace-separated words.

    Non-string cells are skipped. On ties the first string encountered
    (column by column, top to bottom) wins.

    Args:
        df (pd.DataFrame): The DataFrame to search through.
    Returns:
        tuple[str, int]: The longest text and its word count; ("", 0) when the
        frame holds no non-empty strings.
    """
    best_text, best_len = "", 0
    for column in df.columns:
        for cell in df[column]:
            if not isinstance(cell, str):
                continue
            n_words = len(cell.split())
            if n_words > best_len:
                best_text, best_len = cell, n_words
    return best_text, best_len





def get_avg_quote_length_for_author(df):
    """
    Average quote length (in words) per author, scaled by the word count of the
    longest text found anywhere in the frame.

    NOTE: the same cell later redefines this name with a different
    normalisation (division by the largest per-author average); that later
    definition shadows this one once the cell has run.

    Args:
        df (pd.DataFrame): Frame with 'Writter name' and 'Quotes' columns.
    Returns:
        dict: author -> average words per quote divided by the longest text's
        word count, or 0.0 for every author when no text in the frame has words.
    """
    _, max_words = find_longest_quote_in_df(df)

    author_avg_word_length = {}
    for author in df['Writter name'].unique():
        author_quotes = df[df['Writter name'] == author]['Quotes']
        # str(x) keeps a non-string quote (e.g. NaN) from raising on .split(),
        # matching how the later definition of this function counts words.
        avg_length = author_quotes.apply(lambda x: len(str(x).split())).mean()
        # Guard the division so a frame without any words gives 0.0, not NaN/inf.
        author_avg_word_length[author] = avg_length / max_words if max_words > 0 else 0.0

    return author_avg_word_length

def get_frequent_ngrams(df,n=2, top=20):
    """
    Collect the `top` most frequent word n-grams for every author.

    Stop words and single-character tokens are removed from each quote before
    the n-grams are formed, and n-grams never span two quotes.

    Args:
        df (pd.DataFrame): Frame with 'Writter name' and 'Quotes' columns.
        n (int): Size of the n-grams.
        top (int): How many of the most frequent n-grams to keep per author.
    Returns:
        dict: author -> pd.Series mapping n-gram tuple to count (an empty Series
        when the author has no quote with at least `n` usable words).
    """
    stop_set = set(stopwords.words('english'))
    ngrams_by_author = {}

    for writer in df['Writter name'].unique():
        quotes = df.loc[df['Writter name'] == writer, 'Quotes']

        collected = []
        for text in quotes:
            tokens = [tok for tok in text.split() if len(tok) > 1 and tok not in stop_set]
            if len(tokens) < n:
                # Too few words left in this quote to build even one n-gram.
                continue
            collected += nltk.ngrams(tokens, n)

        if collected:
            ngrams_by_author[writer] = pd.Series(collected).value_counts().head(top)
        else:
            ngrams_by_author[writer] = pd.Series()

    return ngrams_by_author

def get_word_diversity_for_author(df):
    """
    Type-token ratio per author: distinct words divided by total words over all
    of the author's quotes joined together.

    Args:
        df (pd.DataFrame): Frame with 'Writter name' and 'Quotes' columns.
    Returns:
        dict: author -> ratio, or 0 when the author's quotes contain no words.
    """
    names = df['Writter name']
    diversity_by_author = {}
    for writer in names.unique():
        tokens = ' '.join(df.loc[names == writer, 'Quotes']).split()
        if tokens:
            diversity_by_author[writer] = len(set(tokens)) / len(tokens)
        else:
            diversity_by_author[writer] = 0
    return diversity_by_author

def get_author_uniqueness_score(df):
    """
    Score how exclusive each author's vocabulary is compared with all other authors.

    For every author the set of distinct words (stop words and single-character
    tokens removed) is built; the score is the share of those words that no
    other author in the frame uses.

    Args:
        df (pd.DataFrame): Frame with 'Writter name' and 'Quotes' columns.
    Returns:
        dict: author -> fraction of the author's vocabulary that is exclusive to
        them, or 0 when the author has no usable words.
    """
    stop_words = set(stopwords.words('english'))

    # Vocabulary (distinct content words) of each author.
    author_word_sets = {}
    for author in df['Writter name'].unique():
        author_quotes = df[df['Writter name'] == author]['Quotes']
        author_word_sets[author] = {
            word for word in ' '.join(author_quotes).split()
            if word not in stop_words and len(word) > 1
        }

    # Count how many authors use each word. A word is exclusive to an author
    # exactly when this count is 1, which avoids rebuilding the union of all
    # other authors' vocabularies once per author.
    authors_per_word = {}
    for words in author_word_sets.values():
        for word in words:
            authors_per_word[word] = authors_per_word.get(word, 0) + 1

    author_uniqueness = {}
    for author, words in author_word_sets.items():
        if words:
            exclusive = sum(1 for word in words if authors_per_word[word] == 1)
            author_uniqueness[author] = exclusive / len(words)
        else:
            author_uniqueness[author] = 0

    return author_uniqueness



import pandas as pd
from nltk.corpus import stopwords

def get_lwr_and_avg_world_len(df):
    """
    Per-author lexical complexity: mean word length and long-word ratio (lwr),
    each scaled by the column maximum.

    Stop words (compared case-insensitively) and single-character tokens are
    ignored. A word is "long" when it has more than 6 characters.

    Args:
        df (pd.DataFrame): Frame with 'Writter name' and 'Quotes' columns.
    Returns:
        pd.DataFrame: Indexed by author with columns 'avg_word_length' and
        'long_word_ratio'; a column is divided by its maximum only when that
        maximum is positive.
    """
    stop_set = set(stopwords.words('english'))
    rows = {}

    for writer in df['Writter name'].unique():
        # Missing quotes are dropped and everything else is coerced to text.
        quotes = df.loc[df['Writter name'] == writer, 'Quotes'].dropna().astype(str)
        tokens = [
            tok for tok in " ".join(quotes).split()
            if len(tok) > 1 and tok.lower() not in stop_set
        ]

        n_tokens = len(tokens)
        if n_tokens:
            mean_len = sum(map(len, tokens)) / n_tokens
            long_share = sum(1 for tok in tokens if len(tok) > 6) / n_tokens
        else:
            mean_len = long_share = 0.0

        rows[writer] = {"avg_word_length": mean_len, "long_word_ratio": long_share}

    table = pd.DataFrame.from_dict(rows, orient="index")

    # Scale every metric by its largest value across authors.
    for metric in table.columns:
        peak = table[metric].max()
        if peak > 0:
            table[metric] = table[metric] / peak

    return table
# Smoke check on the raw frame: each metric is divided by its column maximum,
# so the per-column maxima printed here are expected to be 1.0.
df_comp = get_lwr_and_avg_world_len(DF)
print(df_comp.max())

def get_avg_quote_length_for_author(df):
    """
    Average quote length (in words) per author, scaled by the largest
    per-author average.

    NOTE: this redefines a function of the same name declared earlier in the
    cell; this later definition is the one in effect after the cell has run.

    Args:
        df (pd.DataFrame): Frame with 'Writter name' and 'Quotes' columns.
    Returns:
        dict: author -> average words per quote divided by the largest such
        average. Empty dict for a frame without authors; values are left
        unscaled (all zero) when the largest average is not positive.
    """
    author_avg_word_length = {}

    # Step 1: average word count per author (non-string quotes are coerced with str()).
    for author in df['Writter name'].unique():
        author_quotes = df.loc[df['Writter name'] == author, 'Quotes']
        author_avg_word_length[author] = author_quotes.apply(lambda x: len(str(x).split())).mean()

    # max() raises ValueError on an empty sequence, so bail out early.
    if not author_avg_word_length:
        return author_avg_word_length

    # Step 2: scale by the largest average, skipping the division when it would be 0/0.
    max_avg = max(author_avg_word_length.values())
    if max_avg > 0:
        for author in author_avg_word_length:
            author_avg_word_length[author] /= max_avg

    return author_avg_word_length
# Smoke check on the raw frame: after dividing by the largest per-author
# average, the maximum value is expected to print as 1.0.
print(max(get_avg_quote_length_for_author(DF).values()))

def pick_writers(df,limit=120):
    """
    Keep only the rows of authors who have strictly more than `limit` quotes.

    Prints the number of selected authors and the size of the filtered frame.

    Args:
        df (pd.DataFrame): Frame with a 'Writter name' column.
        limit (int): Exclusive lower bound on the number of rows per author.
    Returns:
        pd.DataFrame: The rows of `df` that belong to the selected authors.
    """
    per_author = df['Writter name'].value_counts()
    kept_names = per_author.loc[per_author.gt(limit)].index
    keep_row = df['Writter name'].isin(kept_names)
    subset = df[keep_row]
    print(f"Seçilen yazar sayısı: {len(kept_names)}")
    print(f"Filtrelenmiş veri seti büyüklüğü: {len(subset)}")
    return subset

avg_word_length    1.0
long_word_ratio    1.0
dtype: float64
1.0


In [None]:
# Clean the raw frame (drops missing / non-ASCII quotes, normalises the text)
# and preview the result.
cleaned_DF = clean_and_std_data(DF)
print(cleaned_DF.head())
print("\n")

# Quotes per author after cleaning, with summary statistics.
counts = cleaned_DF['Writter name'].value_counts()
print("max: ", counts.max(),"- min: ", counts.min(),"- mean: ", counts.mean() , "std: ", counts.std())
print("\n")

# Authors with more than 100 quotes (prints the per-author counts, not just how many).
counts_greater_100 = counts[counts > 100]
print("Number of authors with more than 100 samples: ", counts_greater_100)
print("\n")

# Authors with more than 120 quotes — the same threshold later passed to pick_writers.
counts_greater_120 = counts[counts > 120]
print("Number of authors with more than 120 samples: ", counts_greater_120)


Orijinal veri seti büyüklüğü: 1571
İngilizce quotes sayısı: 1566
Elenen quote sayısı: 5
                                              Quotes      Writter name
0  reading kafka i sense that the elicited questi...  Alberto Manguel,
1  all animals are equal but some animals are mor...     George Orwell
2  i am old gandalf i dont look it but i am begin...    J.R.R. Tolkien
3  how can we live without our lives how will we ...    John Steinbeck
4  i was only foolin george i dont want no ketchu...    John Steinbeck


max:  191 - min:  1 - mean:  19.575 std:  41.69199145997631


Number of authors with more than 100 samples:  Writter name
Jane Austen            191
J.R.R. Tolkien         158
John Steinbeck         147
Charles Dickens        147
William Shakespeare    132
Ernest Hemingway       114
Name: count, dtype: int64


Number of authors with more than 120 samples:  Writter name
Jane Austen            191
J.R.R. Tolkien         158
John Steinbeck         147
Charles Dickens        147
Will

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  english_quotes['Quotes'] = english_quotes['Quotes'].str.lower().str.replace(r'[^a-zA-Z\s]', '', regex=True).str.strip()


In [None]:
# Sanity check after cleaning: count empty-string and missing quotes.
# Series.isna() and Series.isnull() are aliases, so the last two lines report the same number.
print("\nBoş string kontrolü:", cleaned_DF[cleaned_DF['Quotes'] == ''].shape[0])
print("pd.NA kontrolü:", cleaned_DF[cleaned_DF['Quotes'].isna()].shape[0])
print("np.nan kontrolü:", cleaned_DF[cleaned_DF['Quotes'].isnull()].shape[0])

# All features below are computed per author and then mapped onto every row of
# that author, so rows by the same author share identical feature values.

# Add the word-frequency feature (top 20 words per author).
word_freqs = get_word_freqs(cleaned_DF, top=20)
cleaned_DF["word_freqs"] = cleaned_DF['Writter name'].map(word_freqs)

# Add n-gram features for n=2 and n=3.
bigrams = get_frequent_ngrams(cleaned_DF, n=2, top=20)
cleaned_DF["bigram_freqs"] = cleaned_DF['Writter name'].map(bigrams)
trigrams = get_frequent_ngrams(cleaned_DF, n=3, top=20)
cleaned_DF["trigram_freqs"] = cleaned_DF['Writter name'].map(trigrams)

# Add a feature representing the vocabulary diversity of the author's quotes.
word_diversities = get_word_diversity_for_author(cleaned_DF)
cleaned_DF["word_diversity_"] = cleaned_DF['Writter name'].map(word_diversities)

# Add the author uniqueness score; the feature is defined in the docstring of
# get_author_uniqueness_score.
uniqueness_scores = get_author_uniqueness_score(cleaned_DF)
cleaned_DF["author_uniqueness_score"] = cleaned_DF['Writter name'].map(uniqueness_scores)

# Add lwr (long word ratio) and average word length.
# The merge rebinds cleaned_DF to a new frame joined on the author name.
complexity_df = get_lwr_and_avg_world_len(cleaned_DF)
cleaned_DF = cleaned_DF.merge(complexity_df, left_on='Writter name', right_index=True, how='left')


# Add the average quote length (in words).
avg_lengths = get_avg_quote_length_for_author(cleaned_DF)
cleaned_DF["avg_quote_length(by words)"] = cleaned_DF['Writter name'].map(avg_lengths)
cleaned_DF.head()





Boş string kontrolü: 0
pd.NA kontrolü: 0
np.nan kontrolü: 0


Unnamed: 0,Quotes,Writter name,word_freqs,bigram_freqs,trigram_freqs,word_diversity_,author_uniqueness_score,avg_word_length,long_word_ratio,avg_quote_length(by words)
0,reading kafka i sense that the elicited questi...,"Alberto Manguel,",kafka 2 next 2 sense ...,"(reading, kafka) 1 (kafka, sens...","(reading, kafka, sense) 1 ...",0.901961,0.357143,0.773267,0.57037,0.118329
1,all animals are equal but some animals are mor...,George Orwell,one 11 man 10 would 8 h...,"(animal, shall) 4 (animals, equal...","(whatever, goes, upon) 2 (used,...",0.3657,0.280412,0.651,0.436159,0.053331
2,i am old gandalf i dont look it but i am begin...,J.R.R. Tolkien,one 20 ring 19 said 19 l...,"(one, ring) 8 (said, bilbo) ...","(one, ring, rule) 3 (ring, rule, on...",0.256083,0.349872,0.57582,0.267378,0.073629
3,how can we live without our lives how will we ...,John Steinbeck,people 20 like 18 get 17 know ...,"(seemed, know) 4 (im, scared) ...","(make, feel, rich) 3 (seemed, kno...",0.289151,0.328846,0.586435,0.288417,0.062992
4,i was only foolin george i dont want no ketchu...,John Steinbeck,people 20 like 18 get 17 know ...,"(seemed, know) 4 (im, scared) ...","(make, feel, rich) 3 (seemed, kno...",0.289151,0.328846,0.586435,0.288417,0.062992


In [None]:
# Keep only authors with more than 120 quotes. This rebinds cleaned_DF to the
# filtered subset, so the unfiltered feature frame is no longer reachable
# under this name afterwards.
cleaned_DF = pick_writers(cleaned_DF,120)
cleaned_DF.head()

Seçilen yazar sayısı: 5
Filtrelenmiş veri seti büyüklüğü: 775


Unnamed: 0,Quotes,Writter name,word_freqs,bigram_freqs,trigram_freqs,word_diversity_,author_uniqueness_score,avg_word_length,long_word_ratio,avg_quote_length(by words)
2,i am old gandalf i dont look it but i am begin...,J.R.R. Tolkien,one 20 ring 19 said 19 l...,"(one, ring) 8 (said, bilbo) ...","(one, ring, rule) 3 (ring, rule, on...",0.256083,0.349872,0.57582,0.267378,0.073629
3,how can we live without our lives how will we ...,John Steinbeck,people 20 like 18 get 17 know ...,"(seemed, know) 4 (im, scared) ...","(make, feel, rich) 3 (seemed, kno...",0.289151,0.328846,0.586435,0.288417,0.062992
4,i was only foolin george i dont want no ketchu...,John Steinbeck,people 20 like 18 get 17 know ...,"(seemed, know) 4 (im, scared) ...","(make, feel, rich) 3 (seemed, kno...",0.289151,0.328846,0.586435,0.288417,0.062992
6,there is nothing i would not do for those who ...,Jane Austen,one 25 could 23 must 21 alwa...,"(everybody, else) 5 (mr, darcy) ...","(well, everybody, else) 2 (wish, ...",0.25473,0.348817,0.667977,0.451571,0.059705
8,short cuts make long delays,J.R.R. Tolkien,one 20 ring 19 said 19 l...,"(one, ring) 8 (said, bilbo) ...","(one, ring, rule) 3 (ring, rule, on...",0.256083,0.349872,0.57582,0.267378,0.073629




In [None]:
# A variant of the pipeline that can be used while testing

import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
# Silence every warning for the rest of the session. This also hides pandas
# warnings raised by the cleaning code below.
warnings.filterwarnings('ignore')

# Show full cell contents: max_colwidth=None disables truncation of long text columns.
pd.set_option('display.max_colwidth', None)

# Download the NLTK English stop-word list and the 'punkt' tokenizer data.
nltk.download('stopwords')
nltk.download('punkt')

# Read the dataset. This is a relative path, unlike the '/content/...' path used
# by the first cell — NOTE(review): confirm both point at the same file.
df = pd.read_csv('Autor_detection.csv')

# Veri temizleme fonksiyonunu güncelleme
def clean_and_std_data(df):
    """
    Clean the quotes dataset and derive a normalised text column.

    NOTE: an earlier cell of this notebook defines a function with the same
    name that overwrites 'Quotes' instead of adding 'clean_quotes'; whichever
    definition ran last is the one in effect.

    Steps:
        1. Drop rows whose 'Quotes' value is missing.
        2. Coerce 'Quotes' to str.
        3. Keep only quotes made of ASCII characters (used as the "English" test).
        4. Add 'clean_quotes': lower-cased, letters and whitespace only, trimmed,
           with runs of whitespace collapsed to a single space.
        5. Add 'word_count' and drop quotes with fewer than 5 words.
        6. Tidy 'Writter name': remove commas, trim, title-case.

    Args:
        df (pd.DataFrame): Frame with 'Quotes' and 'Writter name' columns.
    Returns:
        pd.DataFrame: A new frame with the cleaned rows plus the 'clean_quotes'
        and 'word_count' columns. The input frame is not modified.
    """
    # 1-2. Drop missing quotes and work on an explicit copy, so the column
    # assignments below never write into a slice of the caller's frame.
    df = df.dropna(subset=['Quotes']).copy()
    df['Quotes'] = df['Quotes'].astype(str)

    # 3. ASCII-only filter; astype(bool) keeps the mask boolean when no rows remain.
    ascii_only = df['Quotes'].map(str.isascii).astype(bool)
    english_quotes = df[ascii_only].copy()
    print(f"Orijinal veri seti büyüklüğü: {len(df)}")
    print(f"İngilizce quotes sayısı: {len(english_quotes)}")
    print(f"Elenen quote sayısı: {len(df) - len(english_quotes)}")

    # 4. Normalise the text.
    english_quotes['clean_quotes'] = (
        english_quotes['Quotes']
        .str.lower()
        .str.replace(r'[^a-z\s]', '', regex=True)
        .str.strip()
        .str.replace(r'\s+', ' ', regex=True)
    )

    # 5. Drop very short quotes (fewer than 5 words).
    english_quotes['word_count'] = english_quotes['clean_quotes'].apply(
        lambda text: len(text.split())
    )
    english_quotes = english_quotes[english_quotes['word_count'] >= 5].copy()

    # 6. Remove commas first and trim afterwards, so a name such as "Name ,"
    # does not keep a trailing space.
    english_quotes['Writter name'] = (
        english_quotes['Writter name']
        .str.replace(',', '', regex=False)
        .str.strip()
        .str.title()
    )

    return english_quotes

# Run the cleaning step; cleaned_df is the input of the author-selection cell below.
cleaned_df = clean_and_std_data(df)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


Orijinal veri seti büyüklüğü: 1571
İngilizce quotes sayısı: 1566
Elenen quote sayısı: 5


In [None]:
# Frequency distribution of authors after cleaning.
author_counts = cleaned_df['Writter name'].value_counts()

# Keep authors with at least `min_samples` quotes, for a reasonably balanced dataset.
min_samples = 120
selected_authors = author_counts[author_counts >= min_samples].index
# .copy() makes filtered_df an independent frame: later cells add columns to it,
# and assigning into a slice of cleaned_df triggers SettingWithCopyWarning.
filtered_df = cleaned_df[cleaned_df['Writter name'].isin(selected_authors)].copy()

print(f"Seçilen {len(selected_authors)} yazar: {list(selected_authors)}")
print(f"Filtrelenmiş veri seti büyüklüğü: {len(filtered_df)}")

# Class-imbalance check: per-author counts computed once and reused.
filtered_counts = filtered_df['Writter name'].value_counts()
print("\nYazar dağılımı:")
print(filtered_counts)
print("\nDengesizlik oranı (en çok/en az):",
      filtered_counts.max() / filtered_counts.min())

Seçilen 5 yazar: ['Jane Austen', 'J.R.R. Tolkien', 'John Steinbeck', 'Charles Dickens', 'William Shakespeare']
Filtrelenmiş veri seti büyüklüğü: 765

Yazar dağılımı:
Writter name
Jane Austen            190
J.R.R. Tolkien         157
John Steinbeck         146
Charles Dickens        145
William Shakespeare    127
Name: count, dtype: int64

Dengesizlik oranı (en çok/en az): 1.4960629921259843


In [3]:
# EDA (detailed exploratory data analysis)
# Requires filtered_df and selected_authors from the author-selection cell.

# Author frequency chart
plt.figure(figsize=(12, 6))
sns.countplot(y='Writter name', data=filtered_df, order=filtered_df['Writter name'].value_counts().index)
plt.title('Yazarlara Göre Alıntı Sayısı', fontsize=16)
plt.xlabel('Alıntı Sayısı', fontsize=14)
plt.ylabel('Yazar', fontsize=14)
plt.tight_layout()
plt.savefig('author_distribution.png')
plt.show()

# Quote length analysis.
# Take an explicit copy first so the new columns are not written into a slice
# of the cleaned frame (which triggers SettingWithCopyWarning).
filtered_df = filtered_df.copy()
filtered_df['quote_length'] = filtered_df['clean_quotes'].apply(len)
filtered_df['word_count'] = filtered_df['clean_quotes'].apply(lambda x: len(x.split()))

plt.figure(figsize=(12, 6))
sns.boxplot(x='Writter name', y='quote_length', data=filtered_df)
plt.xticks(rotation=45)
plt.title('Yazarlara Göre Alıntı Uzunluğu Dağılımı (Karakter)', fontsize=16)
plt.tight_layout()
plt.savefig('quote_length_by_author.png')
plt.show()

# Word count distribution
plt.figure(figsize=(12, 6))
sns.histplot(data=filtered_df, x='word_count', hue='Writter name', kde=True, element='step', common_norm=False)
plt.title('Yazarlara Göre Kelime Sayısı Dağılımı', fontsize=16)
plt.xlabel('Kelime Sayısı', fontsize=14)
plt.ylabel('Frekans', fontsize=14)
plt.tight_layout()
plt.savefig('word_count_distribution.png')
plt.show()

# Most frequent words — visualised as one word cloud per author
from wordcloud import WordCloud

# The stop-word set does not change between authors, so build it once.
STOP_WORDS = set(stopwords.words('english'))

# Size the subplot grid from the number of authors instead of a fixed 2x3
# layout, which fails as soon as more than six authors are selected.
n_cloud_cols = 3
n_cloud_rows = max(1, -(-len(selected_authors) // n_cloud_cols))  # ceiling division

plt.figure(figsize=(15, 5 * n_cloud_rows))
for i, author in enumerate(selected_authors):
    author_texts = filtered_df[filtered_df['Writter name'] == author]['clean_quotes']
    all_text = ' '.join(author_texts.tolist())

    # Drop stop words and tokens of one or two characters.
    words = all_text.split()
    filtered_words = [word for word in words if word not in STOP_WORDS and len(word) > 2]
    filtered_text = ' '.join(filtered_words)

    wordcloud = WordCloud(width=800, height=400, background_color='white', max_words=100).generate(filtered_text)

    plt.subplot(n_cloud_rows, n_cloud_cols, i+1)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.title(f'{author}', fontsize=14)
    plt.axis('off')

plt.tight_layout()
plt.savefig('wordclouds_by_author.png')
plt.show()

# Ortalama kelime uzunluğu analizi
def avg_word_length(text):
    """
    Mean number of characters per whitespace-separated word in `text`.

    Args:
        text (str): The text to measure.
    Returns:
        float | int: The mean word length, or 0 when `text` contains no words.
    """
    tokens = text.split()
    return sum(map(len, tokens)) / len(tokens) if tokens else 0

# Per-quote mean word length, stored as a new column on filtered_df.
filtered_df['avg_word_length'] = filtered_df['clean_quotes'].apply(avg_word_length)

# Box plot of the per-quote mean word length for each author; also saved as a PNG.
plt.figure(figsize=(12, 6))
sns.boxplot(x='Writter name', y='avg_word_length', data=filtered_df)
plt.xticks(rotation=45)
plt.title('Yazarlara Göre Ortalama Kelime Uzunluğu', fontsize=16)
plt.tight_layout()
plt.savefig('avg_word_length_by_author.png')
plt.show()


NameError: name 'filtered_df' is not defined

<Figure size 1200x600 with 0 Axes>

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the stop-word set defensively (fall back to sklearn if nltk is unusable).
# Attempt 1: use the NLTK corpus directly.
try:
    from nltk.corpus import stopwords
    STOP_WORDS = set(stopwords.words('english'))
except Exception:
    # Attempt 2: download the corpus quietly, then retry.
    try:
        import nltk
        nltk.download('stopwords', quiet=True)
        from nltk.corpus import stopwords
        STOP_WORDS = set(stopwords.words('english'))
    except Exception:
        # Attempt 3: fall back to scikit-learn's built-in English stop-word list.
        # NOTE(review): this is a different word list than NLTK's, so features
        # that depend on STOP_WORDS may differ between environments — confirm acceptable.
        from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS
        STOP_WORDS = set(ENGLISH_STOP_WORDS)

# Functions for stylometric feature extraction
def extract_stylometric_features(df, stop_words=None):
    """
    Add per-quote stylometric feature columns to a copy of `df`.

    Features computed from 'clean_quotes':
        - word_count, avg_word_length
        - word_len_1 .. word_len_10: share of alphabetic words of each length
          (words longer than 10 characters are counted in word_len_10)
        - func_<word>: share of tokens equal to each listed function word
        - ttr: type-token ratio (distinct tokens / tokens)
        - content_word_ratio: share of tokens that are not stop words

    Args:
        df (pd.DataFrame): Input frame. Missing 'clean_quotes' / 'Quotes'
            columns are created as empty strings; missing values become ''.
        stop_words (set | None): Stop words for content_word_ratio. Defaults to
            the notebook-level STOP_WORDS set.
    Returns:
        pd.DataFrame: A copy of `df` with the same rows and index plus the
        feature columns. The input frame is not modified.
    """
    if stop_words is None:
        # Default to the stop-word set built by the notebook's loading code.
        stop_words = STOP_WORDS

    df = df.copy()

    # Make sure both text columns exist and hold plain strings.
    for text_col in ('clean_quotes', 'Quotes'):
        if text_col not in df.columns:
            df[text_col] = ''
        else:
            df[text_col] = df[text_col].fillna('').astype(str)

    # Tokenise once. The original-case tokens feed the length and TTR features,
    # the lower-cased tokens feed the function-word and stop-word ratios.
    tokens = df['clean_quotes'].map(str.split)
    lower_tokens = df['clean_quotes'].map(lambda text: text.lower().split())

    # 1. Basic metrics
    def _mean_word_length(words):
        return sum(len(w) for w in words) / len(words) if words else 0

    df['word_count'] = tokens.map(len)
    df['avg_word_length'] = tokens.map(_mean_word_length)

    # 2. Word-length distribution (lengths 1..10; longer words go into bucket 10)
    def _word_length_distribution(words):
        buckets = [0] * 10
        for w in words:
            if w.isalpha():
                buckets[min(len(w), 10) - 1] += 1
        total = sum(buckets)
        if total == 0:
            return buckets
        return [b / total for b in buckets]

    word_len_cols = [f'word_len_{i}' for i in range(1, 11)]
    # Re-running on an already processed frame must not duplicate these columns.
    df = df.drop(columns=word_len_cols, errors='ignore')
    # Reuse df's index: a fresh RangeIndex would be aligned label-by-label by
    # concat(axis=1), misplacing the features and adding NaN rows whenever df
    # has been filtered (non-contiguous index).
    word_len_df = pd.DataFrame(
        tokens.map(_word_length_distribution).tolist(),
        columns=word_len_cols,
        index=df.index,
    )
    df = pd.concat([df, word_len_df], axis=1)

    # 3. Function-word ratios (per token, case-insensitive)
    function_words = [
        'the', 'and', 'to', 'of', 'a', 'in', 'that', 'it', 'is', 'i', 'you', 'he', 'she', 'we', 'they',
        'this', 'these', 'those', 'my', 'your', 'his', 'her', 'our', 'their', 'be', 'have', 'do', 'can',
        'will', 'would', 'should', 'could', 'may', 'might', 'must'
    ]
    for word in function_words:
        # `target=word` binds the current word at definition time.
        df[f'func_{word}'] = lower_tokens.map(
            lambda words, target=word: words.count(target) / len(words) if words else 0.0
        )

    # 4. Lexical diversity (type-token ratio)
    df['ttr'] = tokens.map(lambda words: len(set(words)) / len(words) if words else 0)

    # 5. Share of content words (tokens that are not stop words)
    df['content_word_ratio'] = lower_tokens.map(
        lambda words: sum(1 for w in words if w not in stop_words) / len(words) if words else 0.0
    )

    return df

# Extract the stylometric features.
# Assumption: filtered_df is a DataFrame defined by an earlier cell. The recorded
# output of this cell is a NameError for filtered_df, so that cell must run first.
filtered_df = extract_stylometric_features(filtered_df)

# Correlation matrix of the basic features
feature_cols = [
    'word_count', 'avg_word_length', 'ttr', 'content_word_ratio'
]

# Group by author, take the per-author mean of each feature, then correlate the
# features across authors (one observation per author).
correlation_matrix = filtered_df[feature_cols + ['Writter name']].groupby('Writter name').mean().corr()

# Heatmap of the feature correlations; also saved as a PNG.
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', center=0)
plt.title('Özellikler Arasındaki Korelasyon', fontsize=16)
plt.tight_layout()
plt.savefig('feature_correlation.png')
plt.show()


NameError: name 'filtered_df' is not defined