1. Preprocessing

In [None]:
import json
import os
import re

import pandas as pd
from nltk import pos_tag
from nltk.tokenize import sent_tokenize, word_tokenize
from tqdm import tqdm

# Text preprocessing helper
def preprocess_text(text):
    """Normalize one paragraph of raw text into a lower-cased, placeholder form.

    Steps, in order:
      1. newlines -> single space
      2. HTML tags removed
      3. URLs -> 'U'
      4. @mentions -> '<@>', #hashtags -> '<#>'
      5. ISO dates (YYYY-MM-DD) -> 'D', times (HH:MM[:SS]) -> 'T'
      6. remaining standalone numbers -> 'N'
      7. every character other than word chars, whitespace and ``<>.,!?`` removed
         (the '<@>' / '<#>' placeholders are kept intact)
      8. whitespace collapsed, result lower-cased

    The order matters: HTML tags are stripped *before* the '<@>' / '<#>'
    placeholders are inserted (otherwise the tag regex deletes them), and dates
    and times are replaced *before* bare numbers (otherwise their digits have
    already been turned into 'N' and the date/time patterns can never match).

    :param text: raw paragraph string
    :return: normalized string
    """
    text = text.replace('\n', ' ')  # join lines inside the paragraph
    text = re.sub(r'<.*?>', '', text)  # strip HTML tags first so placeholders survive
    text = re.sub(r'http\S+|www\.\S+', 'U', text)  # URLs
    text = re.sub(r'@\w+', '<@>', text)  # user mentions
    text = re.sub(r'#\w+', '<#>', text)  # hashtags
    text = re.sub(r'\b\d{4}-\d{2}-\d{2}\b', 'D', text)  # dates, before bare numbers
    text = re.sub(r'\b\d{2}:\d{2}(:\d{2})?\b', 'T', text)  # times, before bare numbers
    text = re.sub(r'\b\d+\b', 'N', text)  # remaining standalone numbers
    # Drop disallowed characters, but re-emit the placeholders untouched
    # ('@' and '#' are otherwise outside the allowed set).
    text = re.sub(r'(<@>|<#>)|[^\w\s<>.,!?]', lambda m: m.group(1) or '', text)
    text = re.sub(r'\s+', ' ', text).strip()  # collapse whitespace
    # NOTE: lower-casing also lower-cases the U/D/T/N placeholders, as before.
    return text.lower()

def check_files(input_dir):
    """Report problem files whose paragraph count disagrees with their truth file.

    For every ``problem-*.txt`` in ``input_dir`` the matching
    ``truth-problem-*.json`` is read; a message is printed when the number of
    non-empty lines is not ``len(changes) + 1``. Any exception raised while
    handling a file is printed and the scan continues.

    :param input_dir: directory holding the problem/truth file pairs
    """
    problem_names = [
        name for name in os.listdir(input_dir)
        if name.startswith('problem-') and name.endswith('.txt')
    ]
    pairs = []
    for name in problem_names:
        truth_name = name.replace('problem-', 'truth-problem-').replace('.txt', '.json')
        pairs.append((os.path.join(input_dir, name), os.path.join(input_dir, truth_name)))

    for txt_file, json_file in tqdm(pairs, desc='Checking files'):
        try:
            # The ID itself is not used; parsing it surfaces malformed file names.
            id_part = os.path.basename(txt_file).split('-')[1]
            file_id = int(id_part.split('.')[0])

            with open(txt_file, 'r', encoding='utf-8') as handle:
                raw_lines = handle.read().strip().split('\n')
            # Keep only non-empty lines, stripped of surrounding whitespace.
            paragraphs = [stripped for stripped in (line.strip() for line in raw_lines) if stripped]

            with open(json_file, 'r', encoding='utf-8') as handle:
                author_changes = json.load(handle)['changes']

            n_paragraphs, n_changes = len(paragraphs), len(author_changes)
            if n_paragraphs != n_changes + 1:
                print(f"Mismatch in file {txt_file}: {n_paragraphs} paragraphs, {n_changes} author changes")
        except Exception as e:
            print(f"Error processing file {txt_file}: {e}")


In [None]:
def process_files(input_dir, output_dir, batch_size=100):
    """Preprocess every problem file and write the paragraphs as batched CSVs.

    Each ``problem-<id>.txt`` is paired with ``truth-problem-<id>.json``. Files
    whose paragraph count is not ``len(changes) + 1`` are reported and skipped.
    Every remaining paragraph becomes one row with the columns ``file_id``,
    ``paragraph_index``, ``text`` (preprocessed) and ``author_change`` (the
    change flag between this paragraph and the previous one; ``None`` for the
    first paragraph). Rows are written to ``batch_<k>.csv`` in ``output_dir``.

    :param input_dir: directory containing the problem/truth file pairs
    :param output_dir: directory for the batch CSVs (created if missing)
    :param batch_size: number of problem files per batch CSV
    """
    os.makedirs(output_dir, exist_ok=True)

    # Sorted so that batch contents are reproducible (os.listdir order is arbitrary).
    txt_files = sorted(
        f for f in os.listdir(input_dir)
        if f.startswith('problem-') and f.endswith('.txt')
    )
    file_pairs = [
        (
            os.path.join(input_dir, txt_file),
            os.path.join(input_dir, txt_file.replace('problem-', 'truth-problem-').replace('.txt', '.json')),
        )
        for txt_file in txt_files
    ]

    for batch_idx in range(0, len(file_pairs), batch_size):
        batch_number = batch_idx // batch_size
        batch_data = []
        batch_files = file_pairs[batch_idx:batch_idx + batch_size]

        for txt_file, json_file in tqdm(batch_files, desc=f'Processing batch {batch_number + 1}'):
            # Reset per file so the error report below never reads values left
            # over from a previous file (or names that were never bound).
            paragraphs = None
            author_changes = None
            try:
                file_id = int(os.path.basename(txt_file).split('-')[1].split('.')[0])  # numeric file ID

                with open(txt_file, 'r', encoding='utf-8') as f:
                    paragraphs = f.read().strip().split('\n')
                paragraphs = [para.strip() for para in paragraphs if para.strip()]  # drop empty paragraphs

                with open(json_file, 'r', encoding='utf-8') as f:
                    author_changes = json.load(f)['changes']

                if len(paragraphs) != len(author_changes) + 1:
                    print(f"Mismatch in file {txt_file}: {len(paragraphs)} paragraphs, {len(author_changes)} author changes")
                    continue

                for i, para in enumerate(paragraphs):
                    batch_data.append({
                        'file_id': file_id,
                        'paragraph_index': i,
                        'text': preprocess_text(para),
                        'author_change': author_changes[i - 1] if i > 0 else None,
                    })
            except Exception as e:
                print(f"Error processing file {txt_file}: {e}")
                if paragraphs is not None:
                    print(f"Number of paragraphs: {len(paragraphs)}")
                if author_changes is not None:
                    print(f"Number of author changes: {len(author_changes)}")

        # Save the intermediate result for this batch.
        if batch_data:
            intermediate_df = pd.DataFrame(batch_data)
            intermediate_df.to_csv(os.path.join(output_dir, f'batch_{batch_number}.csv'), index=False)

def combine_batches(output_dir, final_output_file):
    """Concatenate all ``batch_*.csv`` files in ``output_dir`` into one CSV.

    Batches are combined in numeric order (``batch_2`` before ``batch_10``) so
    the output row order is reproducible; ``os.listdir`` alone gives no
    ordering guarantee. Files that start with ``batch_`` but carry no numeric
    suffix are appended last, ordered by name.

    :param output_dir: directory containing the batch CSVs
    :param final_output_file: path of the combined CSV to write
    :raises ValueError: if ``output_dir`` contains no batch CSV
    """
    def _batch_order(name):
        # Numeric batches first (by number), anything else afterwards (by name).
        match = re.fullmatch(r'batch_(\d+)\.csv', name)
        return (0, int(match.group(1)), name) if match else (1, 0, name)

    batch_names = sorted(
        (f for f in os.listdir(output_dir) if f.startswith('batch_') and f.endswith('.csv')),
        key=_batch_order,
    )
    if not batch_names:
        # pd.concat would raise an opaque "No objects to concatenate" here.
        raise ValueError(f"No batch_*.csv files found in {output_dir}")

    all_data = [
        pd.read_csv(os.path.join(output_dir, name))
        for name in tqdm(batch_names, desc='Combining batches')
    ]

    final_df = pd.concat(all_data, ignore_index=True)
    final_df.to_csv(final_output_file, index=False)




In [None]:

# Preprocess the "easy" training split: write per-batch CSVs into output_dir.
input_dir = './pan24-multi-author-analysis/easy/train'
output_dir = './pan24-multi-author-analysis/easy/processed_train'
process_files(input_dir, output_dir)

# Merge every batch CSV found in output_dir into a single paragraph-level CSV.
final_output_file = './pan24-multi-author-analysis/easy/easy_train_dataframe.csv'
combine_batches(output_dir, final_output_file)

2. Feature Extraction

2.1 feature extraction for training datasets

In [None]:


# Define the feature extraction functions
def calculate_clause_density(text):
    """Estimate the number of clauses per sentence.

    The clause count is approximated as: number of commas + number of
    clause-introducing words + number of sentences. Indicator words are
    matched as whole words only, so 'or' is not counted inside "for" and 'as'
    is not counted inside "was".

    :param text: paragraph text
    :return: clauses per sentence, or 0 when no sentence is found
    """
    sentences = sent_tokenize(text)
    if not sentences:
        return 0

    clause_words = ['but', 'or', 'that', 'if', 'whether', 'whom', 'whose', 'how', 'why',
                    'because', 'as', 'once', 'though', 'although']
    word_pattern = r'\b(?:' + '|'.join(clause_words) + r')\b'

    clause_count = text.count(',') + len(re.findall(word_pattern, text)) + len(sentences)
    return clause_count / len(sentences)

def calculate_punctuation_density(text):
    """Return the number of punctuation marks per whitespace-separated token.

    :param text: paragraph text
    :return: marks / tokens, or 0 when the text has no tokens
    """
    tokens = text.split()
    if not tokens:
        return 0
    mark_total = sum(1 for _ in re.finditer(r'[,;.!?:\"\'\-—(){}\[\]]', text))
    return mark_total / len(tokens)

def calculate_syllables_per_word(text):
    """Approximate syllables per token by counting vowel letters (a, e, i, o, u, y).

    :param text: paragraph text
    :return: vowel letters / tokens, or 0 when there are no tokens
    """
    tokens = word_tokenize(text)
    if not tokens:
        return 0
    vowel_total = 0
    for token in tokens:
        # Every vowel letter counts once; no grouping of adjacent vowels.
        vowel_total += sum(1 for ch in token.lower() if ch in 'aeiouy')
    return vowel_total / len(tokens)

def calculate_sentence_length(text):
    """Return the mean number of whitespace-separated words per sentence.

    :param text: paragraph text
    :return: words / sentences, or 0 when no sentence is found
    """
    sentences = sent_tokenize(text)
    if not sentences:
        return 0
    total_words = 0
    for sentence in sentences:
        total_words += len(sentence.split())
    return total_words / len(sentences)

def calculate_noun_complexity(text):
    """Return the fraction of tokens whose POS tag starts with 'NN'.

    :param text: paragraph text
    :return: noun tokens / tokens, or 0 when there are no tokens
    """
    tokens = word_tokenize(text)
    tags = [tag for _, tag in pos_tag(tokens)]
    noun_total = len([tag for tag in tags if tag.startswith('NN')])
    if not tokens:
        return 0
    return noun_total / len(tokens)

def extract_features_and_labels_from_dataframe(data):
    """Build base stylometric features and labels for consecutive paragraph pairs.

    Rows are grouped by ``(file_id, difficulty)``; within each group, in the
    order the rows appear, every paragraph after the first is paired with its
    predecessor. A pair is skipped when either text is missing or blank.

    :param data: DataFrame with columns ``file_id``, ``difficulty``,
        ``paragraph_index``, ``text`` and ``author_change``
    :return: ``(features_df, labels_df)``; ``features_df`` holds the five
        metrics for the paragraph and, prefixed with ``prev_``, for its
        predecessor; ``labels_df`` holds ``author_change`` for the paragraph
    """
    metric_functions = {
        'clause_density': calculate_clause_density,
        'punctuation_density': calculate_punctuation_density,
        'syllables_per_word': calculate_syllables_per_word,
        'sentence_length': calculate_sentence_length,
        'noun_complexity': calculate_noun_complexity,
    }

    def _usable(text):
        # An empty string written to CSV is read back as NaN (a float), which
        # has no .strip(); treat every non-string as "no text".
        return isinstance(text, str) and bool(text.strip())

    features = []
    labels = []

    for _, group in data.groupby(['file_id', 'difficulty']):
        texts = group['text'].tolist()
        computed = {}  # position in group -> metrics, so each paragraph is analysed once

        def _metrics(position):
            if position not in computed:
                computed[position] = {
                    name: fn(texts[position]) for name, fn in metric_functions.items()
                }
            return computed[position]

        for i in range(1, len(texts)):
            if not (_usable(texts[i]) and _usable(texts[i - 1])):
                continue

            identity = {
                'file_id': group['file_id'].iloc[i],
                'paragraph_index': group['paragraph_index'].iloc[i],
                'difficulty': group['difficulty'].iloc[i],
            }

            row = dict(identity)
            row.update(_metrics(i))
            row.update({f'prev_{name}': value for name, value in _metrics(i - 1).items()})
            features.append(row)

            labels.append({**identity, 'author_change': group['author_change'].iloc[i]})

    feature_columns = [
        'file_id', 'paragraph_index', 'difficulty', 'clause_density', 'punctuation_density', 'syllables_per_word',
        'sentence_length', 'noun_complexity', 'prev_clause_density', 'prev_punctuation_density',
        'prev_syllables_per_word', 'prev_sentence_length', 'prev_noun_complexity'
    ]

    label_columns = ['file_id', 'paragraph_index', 'difficulty', 'author_change']

    return pd.DataFrame(features, columns=feature_columns), pd.DataFrame(labels, columns=label_columns)

# Load the merged DataFrame
# NOTE(review): no cell shown here writes 'lp/merged_train_data.csv'; presumably it
# combines the per-difficulty training CSVs and adds a 'difficulty' column — confirm.
merged_data = pd.read_csv('lp/merged_train_data.csv')

# Extract features and labels from the merged DataFrame
features_df, labels_df = extract_features_and_labels_from_dataframe(merged_data)

# Save the features and labels to separate CSV files
features_df.to_csv('combined_train_features.csv', index=False)
labels_df.to_csv('combined_train_labels.csv', index=False)

print("Features saved")
print("Labels saved")


2.2 Extracting POS density, dependency-parsing features, and the remaining features

In [None]:
# pos

import re
import pandas as pd
import spacy
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk

# Download the NLTK 'punkt' resource.
nltk.download('punkt')

# Load spaCy model; `nlp` is the module-level pipeline used by the feature functions below.
nlp = spacy.load('en_core_web_sm')

# Define the feature extraction functions
def calculate_pos_density(text):
    """Return the share of NOUN, VERB, ADJ and ADV tokens in ``text``.

    :param text: paragraph text, parsed with the module-level spaCy ``nlp``
    :return: dict keyed by 'NOUN', 'VERB', 'ADJ', 'ADV' with count / token total
        (the total is taken as 1 for an empty document)
    """
    doc = nlp(text)
    tallies = dict.fromkeys(('NOUN', 'VERB', 'ADJ', 'ADV'), 0)
    for token in doc:
        tag = token.pos_
        if tag in tallies:
            tallies[tag] += 1

    # An empty document would otherwise divide by zero.
    denominator = len(doc) or 1
    return {tag: count / denominator for tag, count in tallies.items()}

def extract_features_and_labels_from_dataframe(data):
    """Build POS-density features and labels for consecutive paragraph pairs.

    Rows are grouped by ``(file_id, difficulty)``; within each group, in the
    order the rows appear, every paragraph after the first is paired with its
    predecessor. A pair is skipped when either text is missing or blank.
    Relies on ``calculate_pos_density`` returning a dict keyed by
    'NOUN' / 'VERB' / 'ADJ' / 'ADV'.

    :param data: DataFrame with columns ``file_id``, ``difficulty``,
        ``paragraph_index``, ``text`` and ``author_change``
    :return: ``(features_df, labels_df)``
    """
    tag_columns = {
        'NOUN': 'pos_noun_density',
        'VERB': 'pos_verb_density',
        'ADJ': 'pos_adj_density',
        'ADV': 'pos_adv_density',
    }

    def _usable(text):
        # An empty string written to CSV is read back as NaN (a float), which
        # has no .strip(); treat every non-string as "no text".
        return isinstance(text, str) and bool(text.strip())

    features = []
    labels = []

    for _, group in data.groupby(['file_id', 'difficulty']):
        texts = group['text'].tolist()
        computed = {}  # position in group -> densities, so each paragraph is parsed once

        def _density(position):
            if position not in computed:
                computed[position] = calculate_pos_density(texts[position])
            return computed[position]

        for i in range(1, len(texts)):
            if not (_usable(texts[i]) and _usable(texts[i - 1])):
                continue

            identity = {
                'file_id': group['file_id'].iloc[i],
                'paragraph_index': group['paragraph_index'].iloc[i],
                'difficulty': group['difficulty'].iloc[i],
            }
            current, previous = _density(i), _density(i - 1)

            row = dict(identity)
            for tag, column in tag_columns.items():
                row[column] = current[tag]
                row[f'prev_{column}'] = previous[tag]
            features.append(row)

            labels.append({**identity, 'author_change': group['author_change'].iloc[i]})

    # 'prev_noun_complexity' used to be listed here although this function never
    # computes it, which produced a column made up entirely of NaN.
    feature_columns = [
        'file_id', 'paragraph_index', 'difficulty', 'pos_noun_density', 'pos_verb_density', 'pos_adj_density', 'pos_adv_density',
        'prev_pos_noun_density', 'prev_pos_verb_density', 'prev_pos_adj_density', 'prev_pos_adv_density'
    ]

    label_columns = ['file_id', 'paragraph_index', 'difficulty', 'author_change']

    return pd.DataFrame(features, columns=feature_columns), pd.DataFrame(labels, columns=label_columns)

# Load the merged DataFrame
merged_data = pd.read_csv('lp/merged_train_data.csv')

# Extract features and labels from the merged DataFrame
features_df, labels_df = extract_features_and_labels_from_dataframe(merged_data)

# Save only the POS features; labels_df is not written by this cell.
features_df.to_csv('pos.csv', index=False)

print("Features saved")


In [None]:
def calculate_dependency_parsing_features(text):
    """Return the relative frequency of seven dependency labels in ``text``.

    :param text: paragraph text, parsed with the module-level spaCy ``nlp``
    :return: list of 7 floats in the order
        ['nsubj', 'pobj', 'dobj', 'ROOT', 'neg', 'aux', 'conj'], each being
        count / token total (the total is taken as 1 for an empty document)
    """
    tracked_labels = ('nsubj', 'pobj', 'dobj', 'ROOT', 'neg', 'aux', 'conj')
    slot_of = {label: slot for slot, label in enumerate(tracked_labels)}
    tallies = [0] * len(tracked_labels)

    doc = nlp(text)
    for token in doc:
        slot = slot_of.get(token.dep_)
        if slot is not None:
            tallies[slot] += 1

    denominator = len(doc) or 1
    return [tally / denominator for tally in tallies]


def extract_features_and_labels_from_dataframe(data):
    """Build dependency-label features for consecutive paragraph pairs.

    Rows are grouped by ``(file_id, difficulty)``; within each group, in the
    order the rows appear, every paragraph after the first is paired with its
    predecessor. A pair is skipped when either text is missing or blank.
    Despite the name, this variant returns features only (no labels).

    :param data: DataFrame with columns ``file_id``, ``difficulty``,
        ``paragraph_index`` and ``text``
    :return: DataFrame with ``file_id``, ``paragraph_index``, ``difficulty``,
        ``dependency`` and ``prev_dependency`` (each a list of 7 floats)
    """
    def _usable(text):
        # An empty string written to CSV is read back as NaN (a float), which
        # has no .strip(); treat every non-string as "no text".
        return isinstance(text, str) and bool(text.strip())

    features = []

    for _, group in data.groupby(['file_id', 'difficulty']):
        texts = group['text'].tolist()
        computed = {}  # position in group -> dependency vector, so each paragraph is parsed once

        def _dependency(position):
            if position not in computed:
                computed[position] = calculate_dependency_parsing_features(texts[position])
            return computed[position]

        for i in range(1, len(texts)):
            if not (_usable(texts[i]) and _usable(texts[i - 1])):
                continue

            features.append({
                'file_id': group['file_id'].iloc[i],
                'paragraph_index': group['paragraph_index'].iloc[i],
                'difficulty': group['difficulty'].iloc[i],
                'dependency': _dependency(i),
                'prev_dependency': _dependency(i - 1),
            })

    feature_columns = [
        'file_id', 'paragraph_index', 'difficulty', 'dependency', 'prev_dependency']
    return pd.DataFrame(features, columns=feature_columns)

# Load the merged DataFrame
merged_data = pd.read_csv('lp/merged_train_data.csv')

# Extract dependency features from the merged DataFrame
# (this variant of the extractor returns a single DataFrame, no labels)
features_df = extract_features_and_labels_from_dataframe(merged_data)

# Save the features to a CSV file; the list-valued columns are written as their string form
features_df.to_csv('dp_features.csv', index=False)

print("Features saved")



In [None]:
# the rest

# Define the new feature extraction functions
def calculate_function_word_frequencies(text):
    """Return the relative frequency of four function-word classes in ``text``.

    :param text: paragraph text, parsed with the module-level spaCy ``nlp``
    :return: list of 4 floats, [pronouns, prepositions, conjunctions, articles],
        i.e. tokens tagged PRON, ADP, CCONJ/SCONJ and DET, each divided by the
        token total (taken as 1 for an empty document)
    """
    doc = nlp(text)
    slot_for_tag = {'PRON': 0, 'ADP': 1, 'CCONJ': 2, 'SCONJ': 2, 'DET': 3}
    tallies = [0, 0, 0, 0]
    for token in doc:
        slot = slot_for_tag.get(token.pos_)
        if slot is not None:
            tallies[slot] += 1

    denominator = len(doc) or 1
    return [tally / denominator for tally in tallies]

def calculate_vocabulary_diversity(text):
    """Return the type/token ratio of ``text`` (distinct tokens / all tokens).

    :param text: paragraph text
    :return: ratio, or 0 when there are no tokens
    """
    tokens = word_tokenize(text)
    if len(tokens) == 0:
        return 0
    return len(set(tokens)) / len(tokens)

def calculate_sentence_length_variety(text):
    """Return the variance (pandas ``Series.var``) of per-sentence word counts.

    :param text: paragraph text
    :return: variance of the sentence lengths, or 0 with fewer than two sentences
    """
    word_counts = []
    for sentence in sent_tokenize(text):
        word_counts.append(len(sentence.split()))
    if len(word_counts) <= 1:
        return 0
    return pd.Series(word_counts).var()

def extract_features_and_labels_from_dataframe(data):
    """Build function-word / vocabulary / sentence-variety features and labels.

    Rows are grouped by ``(file_id, difficulty)``; within each group, in the
    order the rows appear, every paragraph after the first is paired with its
    predecessor. A pair is skipped when either text is missing or blank.

    :param data: DataFrame with columns ``file_id``, ``difficulty``,
        ``paragraph_index``, ``text`` and ``author_change``
    :return: ``(features_df, labels_df)``; ``features_df`` holds the three
        metrics for the paragraph and, prefixed with ``prev_``, for its
        predecessor
    """
    metric_functions = {
        'function_word_frequencies': calculate_function_word_frequencies,
        'vocabulary_diversity': calculate_vocabulary_diversity,
        'sentence_length_variety': calculate_sentence_length_variety,
    }

    def _usable(text):
        # An empty string written to CSV is read back as NaN (a float), which
        # has no .strip(); treat every non-string as "no text".
        return isinstance(text, str) and bool(text.strip())

    features = []
    labels = []

    for _, group in data.groupby(['file_id', 'difficulty']):
        texts = group['text'].tolist()
        computed = {}  # position in group -> metrics, so each paragraph is analysed once

        def _metrics(position):
            if position not in computed:
                computed[position] = {
                    name: fn(texts[position]) for name, fn in metric_functions.items()
                }
            return computed[position]

        for i in range(1, len(texts)):
            if not (_usable(texts[i]) and _usable(texts[i - 1])):
                continue

            identity = {
                'file_id': group['file_id'].iloc[i],
                'paragraph_index': group['paragraph_index'].iloc[i],
                'difficulty': group['difficulty'].iloc[i],
            }

            row = dict(identity)
            row.update(_metrics(i))
            row.update({f'prev_{name}': value for name, value in _metrics(i - 1).items()})
            features.append(row)

            labels.append({**identity, 'author_change': group['author_change'].iloc[i]})

    feature_columns = [
        'file_id', 'paragraph_index', 'difficulty', 'function_word_frequencies',
        'vocabulary_diversity', 'sentence_length_variety', 'prev_function_word_frequencies', 'prev_vocabulary_diversity', 'prev_sentence_length_variety'
    ]

    label_columns = ['file_id', 'paragraph_index', 'difficulty', 'author_change']

    return pd.DataFrame(features, columns=feature_columns), pd.DataFrame(labels, columns=label_columns)

# Load the merged DataFrame
merged_data = pd.read_csv('lp/merged_train_data.csv')

# Extract features and labels from the merged DataFrame
features_df, labels_df = extract_features_and_labels_from_dataframe(merged_data)

# Save only the features; labels_df is not written by this cell.
features_df.to_csv('combined_train_features2.csv', index=False)

print("Features saved")





2.2 feature extraction for validation datasets

In [None]:
import pandas as pd
import re
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import pos_tag

def val_extract_features_and_labels_from_dataframe(data):
    """Build base stylometric features and labels for a validation split.

    Same pairing as the training extractor, but rows are grouped by
    ``file_id`` only and no ``difficulty`` column is read or written. Within
    each file, in the order the rows appear, every paragraph after the first
    is paired with its predecessor; a pair is skipped when either text is
    missing or blank.

    :param data: DataFrame with columns ``file_id``, ``paragraph_index``,
        ``text`` and ``author_change``
    :return: ``(features_df, labels_df)``
    """
    metric_functions = {
        'clause_density': calculate_clause_density,
        'punctuation_density': calculate_punctuation_density,
        'syllables_per_word': calculate_syllables_per_word,
        'sentence_length': calculate_sentence_length,
        'noun_complexity': calculate_noun_complexity,
    }

    def _usable(text):
        # An empty string written to CSV is read back as NaN (a float), which
        # has no .strip(); treat every non-string as "no text".
        return isinstance(text, str) and bool(text.strip())

    features = []
    labels = []

    # Group by the plain column name: a one-element list changes the key type
    # across pandas versions and triggers a warning on some of them.
    for _, group in data.groupby('file_id'):
        texts = group['text'].tolist()
        computed = {}  # position in group -> metrics, so each paragraph is analysed once

        def _metrics(position):
            if position not in computed:
                computed[position] = {
                    name: fn(texts[position]) for name, fn in metric_functions.items()
                }
            return computed[position]

        for i in range(1, len(texts)):
            if not (_usable(texts[i]) and _usable(texts[i - 1])):
                continue

            identity = {
                'file_id': group['file_id'].iloc[i],
                'paragraph_index': group['paragraph_index'].iloc[i],
            }

            row = dict(identity)
            row.update(_metrics(i))
            row.update({f'prev_{name}': value for name, value in _metrics(i - 1).items()})
            features.append(row)

            labels.append({**identity, 'author_change': group['author_change'].iloc[i]})

    feature_columns = [
        'file_id', 'paragraph_index', 'clause_density', 'punctuation_density', 'syllables_per_word',
        'sentence_length', 'noun_complexity', 'prev_clause_density', 'prev_punctuation_density',
        'prev_syllables_per_word', 'prev_sentence_length', 'prev_noun_complexity'
    ]

    label_columns = ['file_id', 'paragraph_index', 'author_change']

    return pd.DataFrame(features, columns=feature_columns), pd.DataFrame(labels, columns=label_columns)

# Load the paragraph-level DataFrame of the "easy" validation split
merged_data = pd.read_csv('./pan24-multi-author-analysis/easy/easy_validation_dataframe.csv')

# Extract base features and labels (validation variant: grouped by file_id only)
features_df, labels_df = val_extract_features_and_labels_from_dataframe(merged_data)

# Save the features and labels to separate CSV files
features_df.to_csv('./pan24-multi-author-analysis/validation/easy_validation_features.csv', index=False)
labels_df.to_csv('./pan24-multi-author-analysis/validation/easy_validation_labels.csv', index=False)

print("Features saved")
print("Labels saved")


In [None]:
# Build the base features and labels for the medium and hard validation splits.
#
# This must use the validation extractor, exactly like the easy split: by this
# point in the notebook `extract_features_and_labels_from_dataframe` has been
# redefined several times and its latest version groups by 'difficulty' and
# emits function-word features, not the clause/punctuation/syllable/sentence
# columns that the later standardization step expects in these files.
for split_name in ('medium', 'hard'):
    # Load the paragraph-level DataFrame of this split
    merged_data = pd.read_csv(
        f'./pan24-multi-author-analysis/{split_name}/{split_name}_validation_dataframe.csv'
    )

    # Extract features and labels
    features_df, labels_df = val_extract_features_and_labels_from_dataframe(merged_data)

    # Save the features and labels to separate CSV files
    features_df.to_csv(
        f'./pan24-multi-author-analysis/validation/{split_name}_validation_features.csv', index=False
    )
    labels_df.to_csv(
        f'./pan24-multi-author-analysis/validation/{split_name}_validation_labels.csv', index=False
    )

    print("Features saved")
    print("Labels saved")

In [None]:
import pandas as pd
import spacy

# Load spaCy model (rebinds the module-level `nlp` used by the functions below)
nlp = spacy.load('en_core_web_sm')

# Define the feature extraction function
def calculate_pos_density(text):
    """Return the share of NOUN, VERB, ADJ and ADV tokens in ``text`` as a list.

    :param text: paragraph text, parsed with the module-level spaCy ``nlp``
    :return: [noun, verb, adj, adv] densities, each count / token total
        (the total is taken as 1 for an empty document)
    """
    doc = nlp(text)
    slot_of = {'NOUN': 0, 'VERB': 1, 'ADJ': 2, 'ADV': 3}
    tallies = [0, 0, 0, 0]
    for token in doc:
        slot = slot_of.get(token.pos_)
        if slot is not None:
            tallies[slot] += 1

    # An empty document would otherwise divide by zero.
    denominator = len(doc) or 1
    return [tally / denominator for tally in tallies]

def add_pos_features(features_file, merged_data_file):
    """Append POS-density columns to an existing features CSV, in place.

    For every consecutive paragraph pair in ``merged_data_file`` (grouped by
    ``file_id``, in row order) the POS densities of the paragraph and of its
    predecessor are computed and stored in the row of ``features_file`` with
    the same ``(file_id, paragraph_index)``. Rows that receive no value keep
    ``None``. The features file is overwritten.

    :param features_file: CSV with ``file_id`` and ``paragraph_index`` columns
    :param merged_data_file: CSV with ``file_id``, ``paragraph_index``, ``text``
    """
    features_df = pd.read_csv(features_file)
    merged_data = pd.read_csv(merged_data_file)

    # (file_id, paragraph_index) -> row position, built once instead of
    # scanning the whole frame for every paragraph. First occurrence wins.
    row_of = {}
    for position, key in enumerate(zip(features_df['file_id'], features_df['paragraph_index'])):
        row_of.setdefault(key, position)

    def _usable(text):
        # An empty string written to CSV is read back as NaN (a float).
        return isinstance(text, str) and bool(text.strip())

    pos_density_list = [None] * len(features_df)
    prev_pos_density_list = [None] * len(features_df)
    unmatched = 0

    for _, group in merged_data.groupby('file_id'):
        texts = group['text'].tolist()
        file_ids = group['file_id'].tolist()
        paragraph_ids = group['paragraph_index'].tolist()
        computed = {}  # position in group -> density vector, so each paragraph is parsed once

        def _density(position):
            if position not in computed:
                computed[position] = calculate_pos_density(texts[position])
            return computed[position]

        for i in range(1, len(texts)):
            if not (_usable(texts[i]) and _usable(texts[i - 1])):
                continue

            row = row_of.get((file_ids[i], paragraph_ids[i]))
            if row is None:
                # No such row in the features file: report below instead of crashing.
                unmatched += 1
                continue

            pos_density_list[row] = _density(i)
            prev_pos_density_list[row] = _density(i - 1)

    if unmatched:
        print(f"Warning: {unmatched} paragraph pair(s) had no matching row in {features_file}")

    # Add the new features to the existing DataFrame
    features_df['pos_density'] = pos_density_list
    features_df['prev_pos_density'] = prev_pos_density_list

    # Save the updated features DataFrame
    features_df.to_csv(features_file, index=False)

    print(f"Features updated and saved in {features_file}")

# Add POS-density columns to the "easy" validation features (the features CSV is rewritten in place)
features_file = './pan24-multi-author-analysis/validation/easy_validation_features.csv'
merged_data_file = './pan24-multi-author-analysis/easy/easy_validation_dataframe.csv'
add_pos_features(features_file, merged_data_file)


In [None]:
# Add POS-density columns to the "medium" validation features (rewritten in place)
features_file2 = './pan24-multi-author-analysis/validation/medium_validation_features.csv'
merged_data_file2 = './pan24-multi-author-analysis/medium/medium_validation_dataframe.csv'
add_pos_features(features_file2, merged_data_file2)

# Same for the "hard" validation features
features_file3 = './pan24-multi-author-analysis/validation/hard_validation_features.csv'
merged_data_file3 = './pan24-multi-author-analysis/hard/hard_validation_dataframe.csv'
add_pos_features(features_file3, merged_data_file3)


In [None]:
import pandas as pd
import spacy
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk

# Download punkt tokenizer for nltk
nltk.download('punkt')

# Load spaCy model (rebinds the module-level `nlp`; same model name as the earlier cells)
nlp = spacy.load('en_core_web_sm')



def add_new_features(features_file, merged_data_file):
    """Append function-word, vocabulary and sentence-variety columns, in place.

    For every consecutive paragraph pair in ``merged_data_file`` (grouped by
    ``file_id``, in row order) the three metrics are computed for the
    paragraph and for its predecessor and stored in the row of
    ``features_file`` with the same ``(file_id, paragraph_index)``. Rows that
    receive no value keep ``None``. The features file is overwritten.

    :param features_file: CSV with ``file_id`` and ``paragraph_index`` columns
    :param merged_data_file: CSV with ``file_id``, ``paragraph_index``, ``text``
    """
    features_df = pd.read_csv(features_file)
    merged_data = pd.read_csv(merged_data_file)

    metric_functions = {
        'function_word_frequencies': calculate_function_word_frequencies,
        'vocabulary_diversity': calculate_vocabulary_diversity,
        'sentence_length_variety': calculate_sentence_length_variety,
    }

    # (file_id, paragraph_index) -> row position, built once instead of
    # scanning the whole frame for every paragraph. First occurrence wins.
    row_of = {}
    for position, key in enumerate(zip(features_df['file_id'], features_df['paragraph_index'])):
        row_of.setdefault(key, position)

    def _usable(text):
        # An empty string written to CSV is read back as NaN (a float).
        return isinstance(text, str) and bool(text.strip())

    # One output list per new column, current paragraph first, then 'prev_' variants.
    new_columns = {name: [None] * len(features_df) for name in metric_functions}
    new_columns.update({f'prev_{name}': [None] * len(features_df) for name in metric_functions})
    unmatched = 0

    for _, group in merged_data.groupby('file_id'):
        texts = group['text'].tolist()
        file_ids = group['file_id'].tolist()
        paragraph_ids = group['paragraph_index'].tolist()
        computed = {}  # position in group -> metrics, so each paragraph is analysed once

        def _metrics(position):
            if position not in computed:
                computed[position] = {
                    name: fn(texts[position]) for name, fn in metric_functions.items()
                }
            return computed[position]

        for i in range(1, len(texts)):
            if not (_usable(texts[i]) and _usable(texts[i - 1])):
                continue

            row = row_of.get((file_ids[i], paragraph_ids[i]))
            if row is None:
                # No such row in the features file: report below instead of crashing.
                unmatched += 1
                continue

            current, previous = _metrics(i), _metrics(i - 1)
            for name in metric_functions:
                new_columns[name][row] = current[name]
                new_columns[f'prev_{name}'][row] = previous[name]

    if unmatched:
        print(f"Warning: {unmatched} paragraph pair(s) had no matching row in {features_file}")

    # Add the new features to the existing DataFrame
    for column, values in new_columns.items():
        features_df[column] = values

    # Save the updated features DataFrame
    features_df.to_csv(features_file, index=False)

    print(f"Features updated and saved in {features_file}")

# Add the function-word / vocabulary / sentence-variety columns to the "easy" validation features (rewritten in place)
features_file = './pan24-multi-author-analysis/validation/easy_validation_features.csv'
merged_data_file = './pan24-multi-author-analysis/easy/easy_validation_dataframe.csv'
add_new_features(features_file, merged_data_file)


In [None]:
# Add the same columns to the "medium" validation features (rewritten in place)
features_file2 = './pan24-multi-author-analysis/validation/medium_validation_features.csv'
merged_data_file2 = './pan24-multi-author-analysis/medium/medium_validation_dataframe.csv'
add_new_features(features_file2, merged_data_file2)

# ... and to the "hard" validation features
features_file3 = './pan24-multi-author-analysis/validation/hard_validation_features.csv'
merged_data_file3 = './pan24-multi-author-analysis/hard/hard_validation_dataframe.csv'
add_new_features(features_file3, merged_data_file3)

In [None]:
import pandas as pd
import spacy
from nltk.tokenize import word_tokenize, sent_tokenize
import nltk

# Download punkt tokenizer for nltk
nltk.download('punkt')

# Load spaCy model (rebinds the module-level `nlp`; same model name as the earlier cells)
nlp = spacy.load('en_core_web_sm')



def add_dependency_features(features_file, merged_data_file):
    """Append dependency-label columns to an existing features CSV, in place.

    For every consecutive paragraph pair in ``merged_data_file`` (grouped by
    ``file_id``, in row order) the dependency vectors of the paragraph and of
    its predecessor are computed and stored in the row of ``features_file``
    with the same ``(file_id, paragraph_index)``. Rows that receive no value
    keep ``None``. The features file is overwritten.

    :param features_file: CSV with ``file_id`` and ``paragraph_index`` columns
    :param merged_data_file: CSV with ``file_id``, ``paragraph_index``, ``text``
    """
    features_df = pd.read_csv(features_file)
    merged_data = pd.read_csv(merged_data_file)

    # (file_id, paragraph_index) -> row position, built once instead of
    # scanning the whole frame for every paragraph. First occurrence wins.
    row_of = {}
    for position, key in enumerate(zip(features_df['file_id'], features_df['paragraph_index'])):
        row_of.setdefault(key, position)

    def _usable(text):
        # An empty string written to CSV is read back as NaN (a float).
        return isinstance(text, str) and bool(text.strip())

    dependency_list = [None] * len(features_df)
    prev_dependency_list = [None] * len(features_df)
    unmatched = 0

    for _, group in merged_data.groupby('file_id'):
        texts = group['text'].tolist()
        file_ids = group['file_id'].tolist()
        paragraph_ids = group['paragraph_index'].tolist()
        computed = {}  # position in group -> dependency vector, so each paragraph is parsed once

        def _dependency(position):
            if position not in computed:
                computed[position] = calculate_dependency_parsing_features(texts[position])
            return computed[position]

        for i in range(1, len(texts)):
            if not (_usable(texts[i]) and _usable(texts[i - 1])):
                continue

            row = row_of.get((file_ids[i], paragraph_ids[i]))
            if row is None:
                # No such row in the features file: report below instead of crashing.
                unmatched += 1
                continue

            dependency_list[row] = _dependency(i)
            prev_dependency_list[row] = _dependency(i - 1)

    if unmatched:
        print(f"Warning: {unmatched} paragraph pair(s) had no matching row in {features_file}")

    # Add the new features to the existing DataFrame
    features_df['dependency'] = dependency_list
    features_df['prev_dependency'] = prev_dependency_list

    # Save the updated features DataFrame
    features_df.to_csv(features_file, index=False)

    print(f"Features updated and saved in {features_file}")

# Add dependency-label columns to the "easy" validation features (rewritten in place)
features_file = './pan24-multi-author-analysis/validation/easy_validation_features.csv'
merged_data_file = './pan24-multi-author-analysis/easy/easy_validation_dataframe.csv'
add_dependency_features(features_file, merged_data_file)


In [None]:
# Add dependency-label columns to the "medium" validation features (rewritten in place)
features_file2 = './pan24-multi-author-analysis/validation/medium_validation_features.csv'
merged_data_file2 = './pan24-multi-author-analysis/medium/medium_validation_dataframe.csv'
add_dependency_features(features_file2, merged_data_file2)

# ... and to the "hard" validation features
features_file3 = './pan24-multi-author-analysis/validation/hard_validation_features.csv'
merged_data_file3 = './pan24-multi-author-analysis/hard/hard_validation_dataframe.csv'
add_dependency_features(features_file3, merged_data_file3)

2.3 Standardization

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler

def standardize_features(file_path, features_to_standardize):
    """
    Standardize the selected feature columns of a CSV file and overwrite the file.

    A new StandardScaler is fitted on the rows of this file only, so each file
    passed to this function is scaled independently of any other file.

    :param file_path: path of the CSV file that is read and then written back
    :param features_to_standardize: list of column names to standardize
    """
    frame = pd.read_csv(file_path)

    # Fit on the selected columns and swap in the scaled values;
    # every other column is left untouched.
    selected = frame[features_to_standardize]
    frame[features_to_standardize] = StandardScaler().fit_transform(selected)

    # Write back to the same path, without the pandas index column.
    frame.to_csv(file_path, index=False)

    print(f"Features standardized and saved in {file_path}")

# Columns to standardize: four features plus their `prev_`-prefixed counterparts.
features_to_standardize = [
    'clause_density', 'syllables_per_word', 'sentence_length', 'sentence_length_variety',
    'prev_clause_density', 'prev_syllables_per_word', 'prev_sentence_length', 'prev_sentence_length_variety'
]

# Validation feature files, one per difficulty level.
file_path1 = './pan24-multi-author-analysis/validation/easy_validation_features.csv'
file_path2 = './pan24-multi-author-analysis/validation/medium_validation_features.csv'
file_path3 = './pan24-multi-author-analysis/validation/hard_validation_features.csv'

# Each file is standardized in place, in easy -> medium -> hard order.
for validation_csv in (file_path1, file_path2, file_path3):
    standardize_features(validation_csv, features_to_standardize)
    print("finish!")

2.4 Calculate the absolute differences between the same features of consecutive paragraphs, to use as the training features

training dataset

In [None]:
import pandas as pd

# Load the ultimate merged features DataFrame
import ast  # imported here because this cell otherwise only imports pandas

merged_df = pd.read_csv('train_merged_features.csv')

# The list-valued feature columns come back from the CSV as strings.
# Parse them with ast.literal_eval instead of eval: it accepts only Python
# literals, so the contents of a CSV cell can never execute code.
for list_column in [
    'pos_features', 'prev_pos_features',
    'function_word_frequencies', 'prev_function_word_frequencies',
    'dependency', 'prev_dependency',
]:
    merged_df[list_column] = merged_df[list_column].apply(ast.literal_eval)

# Absolute current-vs-previous differences for every feature (training variant)
def calculate_absolute_differences(df):
    """
    Build a frame of absolute differences between each feature and its
    ``prev_``-prefixed counterpart.

    Args:
    df (pd.DataFrame): Must contain ``file_id``, ``paragraph_index``, ``difficulty``,
        the scalar feature columns with their ``prev_`` versions, and the
        list-valued columns ``function_word_frequencies``, ``dependency`` and
        ``pos_features`` with their ``prev_`` versions.

    Returns:
    pd.DataFrame: The three identifier columns followed by one ``*_diff`` column
        per feature. List-valued features yield a list of element-wise differences.
    """
    def scalar_gap(name):
        # |current - previous| for a numeric column
        return (df[name] - df[f'prev_{name}']).abs()

    def listwise_gap(current_col, previous_col):
        # Element-wise |current - previous| per row; zip stops at the shorter list.
        return df.apply(
            lambda row: [abs(cur - prev) for cur, prev in zip(row[current_col], row[previous_col])],
            axis=1,
        )

    differences = pd.DataFrame()
    for key in ('file_id', 'paragraph_index', 'difficulty'):
        differences[key] = df[key]

    # Column order below is kept exactly, since downstream cells consume the columns as-is.
    for feature in ('clause_density', 'punctuation_density', 'syllables_per_word', 'sentence_length', 'noun_complexity'):
        differences[f'{feature}_diff'] = scalar_gap(feature)

    differences['function_word_diff'] = listwise_gap('function_word_frequencies', 'prev_function_word_frequencies')

    for feature in ('vocabulary_diversity', 'sentence_length_variety'):
        differences[f'{feature}_diff'] = scalar_gap(feature)

    differences['dependency_diff'] = listwise_gap('dependency', 'prev_dependency')
    differences['pos_features_diff'] = listwise_gap('pos_features', 'prev_pos_features')

    return differences

# Compute the absolute-difference features from the merged training frame
absolute_differences_df = calculate_absolute_differences(merged_df)

# Write the result to the current working directory, without the pandas index.
# NOTE(review): the model-training cells read 'training_features_absolute.csv' from
# 'trainset/' and from a Google Drive folder — confirm the file is copied there.
absolute_differences_df.to_csv('training_features_absolute.csv', index=False)

print("Training features saved")


validation dataset

In [None]:
import pandas as pd

def calculate_absolute_differences(df):
    """
    Compute absolute current-vs-previous differences for the feature pairs in ``df``.

    Args:
    df (pd.DataFrame): Frame holding ``file_id``, ``paragraph_index`` and, per feature,
        a column plus its ``prev_``-prefixed counterpart.

    Returns:
    pd.DataFrame: ``file_id``, ``paragraph_index`` and one ``*_diff`` column per feature.
        The five scalar features of the first loop are required (a missing column
        raises KeyError); every other feature is emitted only when both of its
        columns are present. List-valued features yield element-wise difference lists.
    """
    def both_present(current_col, previous_col):
        return current_col in df.columns and previous_col in df.columns

    def scalar_gap(current_col, previous_col):
        return (df[current_col] - df[previous_col]).abs()

    def listwise_gap(current_col, previous_col):
        # Element-wise |current - previous| per row; zip stops at the shorter list.
        return df.apply(
            lambda row: [abs(cur - prev) for cur, prev in zip(row[current_col], row[previous_col])],
            axis=1,
        )

    differences = pd.DataFrame()
    for key in ('file_id', 'paragraph_index'):
        differences[key] = df[key]

    # Required scalar features (no presence check, as in the rest of the pipeline)
    for feature in ('clause_density', 'punctuation_density', 'syllables_per_word', 'sentence_length', 'noun_complexity'):
        differences[f'{feature}_diff'] = scalar_gap(feature, f'prev_{feature}')

    # Optional features, emitted in a fixed order so the output column order is stable
    if both_present('function_word_frequencies', 'prev_function_word_frequencies'):
        differences['function_word_diff'] = listwise_gap('function_word_frequencies', 'prev_function_word_frequencies')

    for feature in ('vocabulary_diversity', 'sentence_length_variety'):
        if both_present(feature, f'prev_{feature}'):
            differences[f'{feature}_diff'] = scalar_gap(feature, f'prev_{feature}')

    for feature in ('dependency', 'pos_features'):
        if both_present(feature, f'prev_{feature}'):
            differences[f'{feature}_diff'] = listwise_gap(feature, f'prev_{feature}')

    return differences

def process_file(input_file, output_file):
    """
    Load a features CSV, compute the absolute-difference features and save them.

    Any exception raised while reading, parsing, computing or writing is caught
    and reported with a printed message; nothing is re-raised, so a failing file
    does not stop a loop over several files.

    Args:
    input_file (str): Path to the input CSV file.
    output_file (str): Path to the output CSV file.
    """
    import ast  # function-scope import: the enclosing cell only imports pandas

    try:
        # Load the merged features DataFrame
        df = pd.read_csv(input_file)

        # Accept the alternative 'pos_density' column names; labels that are
        # absent are ignored by rename.
        df = df.rename(columns={
            'pos_density': 'pos_features',
            'prev_pos_density': 'prev_pos_features',
        })

        # List-valued columns are stored as strings in the CSV. Parse them with
        # ast.literal_eval rather than eval so that file contents can only ever
        # produce Python literals, never run code. A non-string cell (e.g. NaN
        # from an empty field) makes literal_eval raise, which is reported below.
        for col in ['pos_features', 'prev_pos_features', 'function_word_frequencies', 'prev_function_word_frequencies', 'dependency', 'prev_dependency']:
            if col in df.columns:
                df[col] = df[col].apply(ast.literal_eval)

        # Calculate the absolute differences
        differences_df = calculate_absolute_differences(df)

        # Save the new DataFrame to CSV
        differences_df.to_csv(output_file, index=False)
        print(f"Training features saved to {output_file}")
    except Exception as e:
        print(f"Error processing file {input_file}: {e}")



# (input features CSV, output difference CSV) pairs for the three validation splits
files_to_process = [
    ('./pan24-multi-author-analysis/validation/easy_validation_features.csv', './pan24-multi-author-analysis/validation/easy_validation_features_difference.csv'),
    ('./pan24-multi-author-analysis/validation/medium_validation_features.csv', './pan24-multi-author-analysis/validation/medium_validation_features_difference.csv'),
    ('./pan24-multi-author-analysis/validation/hard_validation_features.csv', './pan24-multi-author-analysis/validation/hard_validation_features_difference.csv')
    # Add more (input, output) pairs here as needed
]

# process_file reports its own errors, so one failing file does not stop the loop
for input_file, output_file in files_to_process:
    process_file(input_file, output_file)


3. Model Training

3.1 Logistic regression model

In [None]:
import ast
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix

# Define file paths for training set
train_features_path = 'trainset/training_features_absolute.csv'
train_labels_path = 'trainset/combined_train_labels.csv'

# Define file paths for validation sets
# NOTE(review): this cell later parses 'function_word_diff', 'dependency_diff' and
# 'pos_features_diff' and drops 'noun_complexity_diff' from these files. In this
# notebook those columns are written to '*_validation_features_difference.csv',
# not '*_validation_features.csv' — confirm the files under 'valset/' hold the
# difference features.
val_easy_features_path = 'valset/easy_validation_features.csv'
val_easy_labels_path = 'valset/easy_validation_labels.csv'

val_medium_features_path = 'valset/medium_validation_features.csv'
val_medium_labels_path = 'valset/medium_validation_labels.csv'

val_hard_features_path = 'valset/hard_validation_features.csv'
val_hard_labels_path = 'valset/hard_validation_labels.csv'

# Parse list columns that were serialized as strings back into Python lists
def convert_string_lists(df, columns):
    """
    Replace each named column with the literal-evaluated value of its cells.

    The frame is modified in place and also returned.

    Args:
    df (pd.DataFrame): Frame whose listed columns hold string literals such as "[1, 2]".
    columns (list): Names of the columns to parse.

    Returns:
    pd.DataFrame: The same ``df`` object, with the listed columns parsed.
    """
    for name in columns:
        df[name] = df[name].map(ast.literal_eval)
    return df

# Expand every list column into one scalar column per list position
def flatten_columns(df, columns):
    """
    Replace each list-valued column with columns named ``<column>_<position>``.

    The input frame is not modified. Expanded columns are appended after the
    remaining columns, in the order given by ``columns``.

    Args:
    df (pd.DataFrame): Frame containing the list-valued columns.
    columns (list): Names of the list-valued columns to expand.

    Returns:
    pd.DataFrame: A new frame without the original list columns.
    """
    for list_col in columns:
        expanded = pd.DataFrame(df[list_col].tolist(), index=df.index).add_prefix(f'{list_col}_')
        df = pd.concat([df.drop(columns=[list_col]), expanded], axis=1)
    return df

# Print the classification metrics of a model on one validation split
def evaluate_model_performance(model, X_val, y_val, dataset_name):
    """
    Predict on one validation split and print its classification metrics.

    Args:
    model: Estimator exposing ``predict``.
    X_val: Features passed to ``model.predict``.
    y_val: True labels the predictions are scored against.
    dataset_name (str): Label inserted into the printed header.

    Prints F1, accuracy, precision, recall and the confusion matrix; returns nothing.
    """
    predictions = model.predict(X_val)

    # Score every metric against the same set of predictions.
    f1, accuracy, precision, recall = (
        metric(y_val, predictions)
        for metric in (f1_score, accuracy_score, precision_score, recall_score)
    )
    conf_matrix = confusion_matrix(y_val, predictions)

    print(
        f'\nPerformance on {dataset_name} Validation Set:',
        f'F1 Score: {f1}',
        f'Accuracy: {accuracy}',
        f'Precision: {precision}',
        f'Recall: {recall}',
        f'Confusion Matrix:\n{conf_matrix}',
        sep='\n',
    )

# Columns containing lists stored as strings
list_columns = ['function_word_diff', 'dependency_diff', 'pos_features_diff']

# Read the training features, parse the list columns and expand them to scalar columns
train_features_df = flatten_columns(
    convert_string_lists(pd.read_csv(train_features_path), list_columns),
    list_columns,
)
train_labels_df = pd.read_csv(train_labels_path)

# Identifier columns and 'noun_complexity_diff' are not used as model inputs
X_train = train_features_df.drop(columns=['file_id', 'paragraph_index', 'difficulty', 'noun_complexity_diff'])
y_train = train_labels_df['author_change']

# Train Logistic Regression Model
log_reg = LogisticRegression(max_iter=1000, random_state=42).fit(X_train, y_train)

def _load_lr_validation_split(features_path, labels_path):
    """Read one validation split; return (features_df, labels_df, X, y) prepared like the training set."""
    features_df = pd.read_csv(features_path)
    labels_df = pd.read_csv(labels_path)
    features_df = flatten_columns(convert_string_lists(features_df, list_columns), list_columns)
    X = features_df.drop(columns=['file_id', 'paragraph_index', 'noun_complexity_diff'])
    return features_df, labels_df, X, labels_df['author_change']

# Easy validation set
val_easy_features_df, val_easy_labels_df, X_val_easy, y_val_easy = _load_lr_validation_split(
    val_easy_features_path, val_easy_labels_path
)
evaluate_model_performance(log_reg, X_val_easy, y_val_easy, "Easy (Logistic Regression)")

# Medium validation set
val_medium_features_df, val_medium_labels_df, X_val_medium, y_val_medium = _load_lr_validation_split(
    val_medium_features_path, val_medium_labels_path
)
evaluate_model_performance(log_reg, X_val_medium, y_val_medium, "Medium (Logistic Regression)")

# Hard validation set
val_hard_features_df, val_hard_labels_df, X_val_hard, y_val_hard = _load_lr_validation_split(
    val_hard_features_path, val_hard_labels_path
)
evaluate_model_performance(log_reg, X_val_hard, y_val_hard, "Hard (Logistic Regression)")

# Rank the features by the magnitude of their logistic-regression coefficient
feature_names = X_train.columns
coefficients = log_reg.coef_[0]
feature_importance = (
    pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})
    .assign(Abs_Coefficient=lambda table: table['Coefficient'].abs())
    .sort_values(by='Abs_Coefficient', ascending=False)
)

# Display feature importance
print(feature_importance)


3.2 random forest model


3.2.1 baseline random forest model

In [None]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix
from tqdm import tqdm
from google.colab import drive


# Define file paths for training set
# NOTE(review): all paths below assume Google Drive is mounted at /content/drive;
# `drive` is imported in this cell but no mount call is visible here — confirm.
train_features_path = '/content/drive/MyDrive/lp/dataset/training/training_features_absolute.csv'
train_labels_path = '/content/drive/MyDrive/lp/dataset/training/combined_train_labels.csv'

# Define file paths for validation sets (the '*_features_difference.csv' files)
val_easy_features_path = '/content/drive/MyDrive/lp/dataset/validation/easy_validation_features_difference.csv'
val_easy_labels_path = '/content/drive/MyDrive/lp/dataset/validation/easy_validation_labels.csv'

val_medium_features_path = '/content/drive/MyDrive/lp/dataset/validation/medium_validation_features_difference.csv'
val_medium_labels_path = '/content/drive/MyDrive/lp/dataset/validation/medium_validation_labels.csv'

val_hard_features_path = '/content/drive/MyDrive/lp/dataset/validation/hard_validation_features_difference.csv'
val_hard_labels_path = '/content/drive/MyDrive/lp/dataset/validation/hard_validation_labels.csv'




In [None]:
import numpy as np
import ast
# Parse list columns that were serialized as strings into numpy arrays
def convert_string_to_numpy(df, columns):
    """
    Replace each named column with numpy arrays built from its literal-evaluated cells.

    The frame is modified in place and also returned.

    Args:
    df (pd.DataFrame): Frame whose listed columns hold string literals such as "[1, 2]".
    columns (list): Names of the columns to parse.

    Returns:
    pd.DataFrame: The same ``df`` object, with one numpy array per cell in the listed columns.
    """
    def _to_array(cell):
        return np.array(ast.literal_eval(cell))

    for name in columns:
        df[name] = df[name].map(_to_array)
    return df

# Collapse every vector column to a single scalar: the mean of the vector
def combine_vector_columns(df, columns):
    """
    Replace each vector-valued cell in the named columns by the mean of its entries.

    Empty vectors become NaN. The frame is modified in place and also returned.

    Args:
    df (pd.DataFrame): Frame whose listed columns hold sized sequences (e.g. numpy arrays).
    columns (list): Names of the columns to collapse.

    Returns:
    pd.DataFrame: The same ``df`` object, with scalar values in the listed columns.
    """
    def _mean_or_nan(vector):
        if len(vector) > 0:
            return np.mean(vector)
        return np.nan

    for name in columns:
        df[name] = df[name].map(_mean_or_nan)
    return df

# Print the classification metrics of a model on one validation split
def evaluate_model_performance(model, X_val, y_val, dataset_name):
    """
    Predict on one validation split and print its classification metrics.

    Args:
    model: Estimator exposing ``predict``.
    X_val: Features passed to ``model.predict``.
    y_val: True labels the predictions are scored against.
    dataset_name (str): Label inserted into the printed header.

    Prints F1, accuracy, precision, recall and the confusion matrix; returns nothing.
    """
    predictions = model.predict(X_val)

    # Score every metric against the same set of predictions.
    f1, accuracy, precision, recall = (
        metric(y_val, predictions)
        for metric in (f1_score, accuracy_score, precision_score, recall_score)
    )
    conf_matrix = confusion_matrix(y_val, predictions)

    print(
        f'\nPerformance on {dataset_name} Validation Set:',
        f'F1 Score: {f1}',
        f'Accuracy: {accuracy}',
        f'Precision: {precision}',
        f'Recall: {recall}',
        f'Confusion Matrix:\n{conf_matrix}',
        sep='\n',
    )

# Columns containing lists stored as strings
list_columns = ['function_word_diff', 'dependency_diff', 'pos_features_diff']

# Read the training features, parse the list columns into arrays and
# collapse each array to its mean
train_features_df = combine_vector_columns(
    convert_string_to_numpy(pd.read_csv(train_features_path), list_columns),
    list_columns,
)
train_labels_df = pd.read_csv(train_labels_path)

# Identifier columns and 'noun_complexity_diff' are not used as model inputs
X_train = train_features_df.drop(columns=['file_id', 'paragraph_index', 'difficulty', 'noun_complexity_diff'])
y_train = train_labels_df['author_change']

# Persist the model-ready training matrices
X_train.to_csv('/content/drive/MyDrive/lp/dataset/training/X_train.csv', index=False)
y_train.to_csv('/content/drive/MyDrive/lp/dataset/training/y_train.csv', index=False)



def _prepare_rf_validation_split(features_path, labels_path, x_out_path, y_out_path):
    """Read one validation split, prepare it like the training set, save X/y as CSV and return (features_df, labels_df, X, y)."""
    features_df = pd.read_csv(features_path)
    labels_df = pd.read_csv(labels_path)
    features_df = combine_vector_columns(convert_string_to_numpy(features_df, list_columns), list_columns)
    X = features_df.drop(columns=['file_id', 'paragraph_index', 'noun_complexity_diff'])
    y = labels_df['author_change']
    X.to_csv(x_out_path, index=False)
    y.to_csv(y_out_path, index=False)
    return features_df, labels_df, X, y

# Prepare and save the easy validation set (evaluation happens in a later cell)
val_easy_features_df, val_easy_labels_df, X_val_easy, y_val_easy = _prepare_rf_validation_split(
    val_easy_features_path,
    val_easy_labels_path,
    '/content/drive/MyDrive/lp/dataset/validation/X_val_easy.csv',
    '/content/drive/MyDrive/lp/dataset/validation/y_val_easy.csv',
)

# Prepare and save the medium validation set
val_medium_features_df, val_medium_labels_df, X_val_medium, y_val_medium = _prepare_rf_validation_split(
    val_medium_features_path,
    val_medium_labels_path,
    '/content/drive/MyDrive/lp/dataset/validation/X_val_medium.csv',
    '/content/drive/MyDrive/lp/dataset/validation/y_val_medium.csv',
)

# Prepare and save the hard validation set
val_hard_features_df, val_hard_labels_df, X_val_hard, y_val_hard = _prepare_rf_validation_split(
    val_hard_features_path,
    val_hard_labels_path,
    '/content/drive/MyDrive/lp/dataset/validation/X_val_hard.csv',
    '/content/drive/MyDrive/lp/dataset/validation/y_val_hard.csv',
)



In [None]:

# Baseline random forest: 100 trees, fixed seed
rf_model = RandomForestClassifier(n_estimators=100, random_state=42).fit(X_train, y_train)

# Report the metrics on each validation split, easy -> medium -> hard
for split_label, X_split, y_split in (
    ("Easy (Random Forest)", X_val_easy, y_val_easy),
    ("Medium (Random Forest)", X_val_medium, y_val_medium),
    ("Hard (Random Forest)", X_val_hard, y_val_hard),
):
    evaluate_model_performance(rf_model, X_split, y_split, split_label)

# Feature importances of the fitted forest, most important first
importances = rf_model.feature_importances_
feature_names = X_train.columns
importance_table = pd.DataFrame({'feature': feature_names, 'importance': importances})
feature_importances = importance_table.sort_values(by='importance', ascending=False)

print('\nFeature Importances:')
print(feature_importances)


Bayesian optimization of the random forest hyperparameters

In [None]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score, precision_score, recall_score, confusion_matrix
from sklearn.model_selection import cross_val_score
from bayes_opt import BayesianOptimization

# Running total of model fits performed by the objective below
training_counter = 0

# Objective function for the optimizer: mean cross-validated F1 of a random forest
def rf_cv(n_estimators, max_depth, min_samples_split, min_samples_leaf, max_features):
    """
    Score one hyperparameter combination with 3-fold cross-validation on the
    notebook-level ``X_train`` / ``y_train``.

    The integer-valued hyperparameters are truncated with ``int``; ``max_features``
    is passed through unchanged. Each call adds 3 (the number of folds) to the
    global ``training_counter``.

    Returns:
    The mean F1 score over the folds.
    """
    global training_counter
    n_folds = 3

    forest = RandomForestClassifier(
        n_estimators=int(n_estimators),
        max_depth=int(max_depth),
        min_samples_split=int(min_samples_split),
        min_samples_leaf=int(min_samples_leaf),
        max_features=max_features,
        random_state=42
    )
    fold_scores = cross_val_score(forest, X_train, y_train, cv=n_folds, scoring='f1')
    training_counter += n_folds  # one fit per fold
    return np.mean(fold_scores)

# Search bounds for each hyperparameter of rf_cv
optimizer = BayesianOptimization(
    f=rf_cv,
    pbounds={
        'n_estimators': (10, 200),
        'max_depth': (1, 50),
        'min_samples_split': (2, 10),
        'min_samples_leaf': (1, 10),
        'max_features': (0.1, 0.999)
    },
    random_state=42,
    verbose=2
)

# Run the search: 10 initial points followed by 20 optimization iterations
optimizer.maximize(init_points=10, n_iter=20)

print(f"Total number of model trainings: {training_counter}")

# Retrieve the best parameters; the optimizer works on floats, so the
# integer-valued hyperparameters are truncated the same way rf_cv does
best_params = optimizer.max['params']
for int_param in ('n_estimators', 'max_depth', 'min_samples_split', 'min_samples_leaf'):
    best_params[int_param] = int(best_params[int_param])

print("Best Parameters:")
print(best_params)

# Train Random Forest with best parameters
best_rf_model = RandomForestClassifier(**best_params, random_state=42)
best_rf_model.fit(X_train, y_train)

# Print the classification metrics of a model on one validation split
def evaluate_model_performance(model, X_val, y_val, dataset_name):
    """
    Predict on one validation split and print its classification metrics.

    Args:
    model: Estimator exposing ``predict``.
    X_val: Features passed to ``model.predict``.
    y_val: True labels the predictions are scored against.
    dataset_name (str): Label inserted into the printed header.

    Prints F1, accuracy, precision, recall and the confusion matrix; returns nothing.
    """
    predictions = model.predict(X_val)

    # Score every metric against the same set of predictions.
    f1, accuracy, precision, recall = (
        metric(y_val, predictions)
        for metric in (f1_score, accuracy_score, precision_score, recall_score)
    )
    conf_matrix = confusion_matrix(y_val, predictions)

    print(
        f'\nPerformance on {dataset_name} Validation Set:',
        f'F1 Score: {f1}',
        f'Accuracy: {accuracy}',
        f'Precision: {precision}',
        f'Recall: {recall}',
        f'Confusion Matrix:\n{conf_matrix}',
        sep='\n',
    )

# Report the optimized forest on each validation split, easy -> medium -> hard
for split_label, X_split, y_split in (
    ("Easy (Optimized Random Forest)", X_val_easy, y_val_easy),
    ("Medium (Optimized Random Forest)", X_val_medium, y_val_medium),
    ("Hard (Optimized Random Forest)", X_val_hard, y_val_hard),
):
    evaluate_model_performance(best_rf_model, X_split, y_split, split_label)


In [None]:
# Train Random Forest with best parameters.
# This repeats the final fit of the optimization cell and reads `best_params`,
# `X_train` and `y_train` from the notebook state, so that cell must have run first.
best_rf_model = RandomForestClassifier(**best_params, random_state=42)
best_rf_model.fit(X_train, y_train)

# Print the classification metrics of a model on one validation split
def evaluate_model_performance(model, X_val, y_val, dataset_name):
    """
    Predict on one validation split and print its classification metrics.

    Args:
    model: Estimator exposing ``predict``.
    X_val: Features passed to ``model.predict``.
    y_val: True labels the predictions are scored against.
    dataset_name (str): Label inserted into the printed header.

    Prints F1, accuracy, precision, recall and the confusion matrix; returns nothing.
    """
    predictions = model.predict(X_val)

    # Score every metric against the same set of predictions.
    f1, accuracy, precision, recall = (
        metric(y_val, predictions)
        for metric in (f1_score, accuracy_score, precision_score, recall_score)
    )
    conf_matrix = confusion_matrix(y_val, predictions)

    print(
        f'\nPerformance on {dataset_name} Validation Set:',
        f'F1 Score: {f1}',
        f'Accuracy: {accuracy}',
        f'Precision: {precision}',
        f'Recall: {recall}',
        f'Confusion Matrix:\n{conf_matrix}',
        sep='\n',
    )

# Report the optimized forest on each validation split, easy -> medium -> hard
for split_label, X_split, y_split in (
    ("Easy (Optimized Random Forest)", X_val_easy, y_val_easy),
    ("Medium (Optimized Random Forest)", X_val_medium, y_val_medium),
    ("Hard (Optimized Random Forest)", X_val_hard, y_val_hard),
):
    evaluate_model_performance(best_rf_model, X_split, y_split, split_label)

In [None]:
# Feature importances of the optimized forest, most important first
importances = best_rf_model.feature_importances_
feature_names = X_train.columns
importance_table = pd.DataFrame({'feature': feature_names, 'importance': importances})
feature_importances = importance_table.sort_values(by='importance', ascending=False)

print('\nFeature Importances:')
print(feature_importances)