## **Step 4: AI vs Human**

## Human vs AI (Mistral)

In [None]:
import os
os.environ["TRANSFORMERS_CACHE"] = "/XXXX/local_cache"
os.environ["HF_HOME"] = "/XXXX/local_cache"

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from huggingface_hub import login
from collections import Counter
import pandas as pd
import gc
import json
import re
import string

# Log in to the Hugging Face Hub with an access token.
# NOTE(review): the token is hard-coded in the notebook; consider reading it
# from an environment variable instead of committing it.
login(token="XXXX")

# Hub identifier of the model evaluated in this cell.
MODEL_NAME = "mistralai/Mistral-7B-Instruct-v0.1"

# Quantization settings handed to from_pretrained: 4-bit NF4 weights with
# float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

# JSON file read and written by load_checkpoint / save_checkpoint.
CHECKPOINT_FILE = "mistral25_checkpoint.json"
# CSV path for this model's predictions.
OUTPUT_FILE = "2025_AI_vs_Human_Mistral.csv"

def save_checkpoint(current_index, results):
    """Persist progress so an interrupted run can be resumed.

    Args:
        current_index: Position of the last processed paragraph; stored under
            the "last_processed" key.
        results: JSON-serialisable list of result records collected so far;
            stored under the "results" key.

    The payload is first written to a temporary file and then moved over
    CHECKPOINT_FILE with os.replace. A crash in the middle of a write can
    therefore never leave a truncated checkpoint behind, which
    load_checkpoint would treat as "no checkpoint" and restart from scratch.
    """
    tmp_path = f"{CHECKPOINT_FILE}.tmp"
    with open(tmp_path, "w") as f:
        json.dump({"last_processed": current_index, "results": results}, f)
    # Atomic swap: readers see either the old or the new complete file.
    os.replace(tmp_path, CHECKPOINT_FILE)

def load_checkpoint():
    """Return the saved checkpoint, or a fresh one if none is usable.

    Returns:
        The dict stored in CHECKPOINT_FILE. When the file does not exist or
        does not contain valid JSON, a new state with "last_processed" set to
        -1 and an empty "results" list is returned instead.
    """
    fresh_state = {"last_processed": -1, "results": []}
    try:
        with open(CHECKPOINT_FILE, "r") as handle:
            saved_state = json.load(handle)
    except (FileNotFoundError, json.JSONDecodeError):
        return fresh_state
    return saved_state

def load_model():
    """Load the quantized model and its tokenizer once.

    Returns:
        A ``(model, tokenizer)`` tuple for MODEL_NAME, loaded with bnb_config
        and ``device_map="auto"``.
    """
    # Release whatever an earlier run left behind before loading the weights.
    gc.collect()
    torch.cuda.empty_cache()

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        quantization_config=bnb_config,
        device_map="auto"
    )

    if tokenizer.pad_token is None:
        if tokenizer.eos_token is not None:
            # Reuse EOS as the padding token. Adding a brand-new token would
            # require resizing the embeddings of the quantized model and
            # append an untrained row to them, for a token that is never fed
            # to the model (prompts are tokenized one at a time, unpadded).
            tokenizer.pad_token = tokenizer.eos_token
        else:
            # No EOS to fall back on: grow the vocabulary as a last resort.
            tokenizer.add_special_tokens({'pad_token': '[PAD]'})
            model.resize_token_embeddings(len(tokenizer))
    return model, tokenizer

def remove_punctuation(text):
    """Return *text* with every character of string.punctuation removed."""
    punctuation_chars = set(string.punctuation)
    return "".join(ch for ch in text if ch not in punctuation_chars)

def query_model(paragraphs, model, tokenizer):
    """Classify each paragraph as "AI-generated" or "Human-written".

    Args:
        paragraphs: Iterable of paragraph strings to classify.
        model: Causal language model; called with ``input_ids`` and expected
            to return an object exposing ``.logits``.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        A list with one label per paragraph, in input order.

    Each label is scored by the mean log-probability of its tokens when the
    label is appended to the prompt, so every label token is conditioned on
    the prompt plus the label tokens before it. Reading all label tokens off
    the single next-token distribution after the prompt would only be
    meaningful for the first token of each label.
    """
    labels = ["AI-generated", "Human-written"]
    label_token_ids = [tokenizer.encode(label, add_special_tokens=False) for label in labels]

    # Follow the model's own placement instead of assuming a CUDA device.
    device = model.device

    predicted_labels = []

    for paragraph in paragraphs:
        prompt = f"""You are a text classifier. Given a paragraph, classify whether it is \"AI-generated\" or \"Human-written\".
        Respond ONLY with one of these two labels.
            For each criterion below, assign a score:
     **+1 for AI-like** if the text strongly matches the AI trait.
     **+1 for Human-like** if the text strongly matches the human trait.
     **+0 if neutral/uncertain**.

    #### **Evaluation Steps**:
    1. **Perplexity & Creativity**:
     AI-like: Predictable word choices, clichés, or overly fluent phrasing.
     Human-like: Unusual phrasing, creative metaphors, or minor errors.
     Score: AI-like (+1) / Human-like (+1) / Neutral (0).

    2. **Burstiness**:
     AI-like: Uniform sentence length/structure (e.g., all medium-length).
     Human-like: Varied rhythm (mix of short/long sentences, interruptions).
     Score: AI-like (+1) / Human-like (+1) / Neutral (0).

    3. **Specificity & Personalization**:
     AI-like: Lacks concrete details (no names, anecdotes, or emotions).
     Human-like: Uses "I/we," personal stories, opinions, or informal language.
     Score: AI-like (+1) / Human-like (+1) / Neutral (0).

    4. **Logical Flow**:
     AI-like: Abrupt topic shifts or overly rigid structure.
     Human-like: Natural digressions, conversational flow.
     Score: AI-like (+1) / Human-like (+1) / Neutral (0).

    5. **Errors & Imperfections**:
     AI-like: Grammatically flawless, no typos/colloquialisms.
     Human-like: Minor errors, idiosyncratic punctuation, or slang.
     Score: AI-like (+1) / Human-like (+1) / Neutral (0).

    #### **Decision Rule**:
     If **total AI-like >= 3** then "AI-generated."
     Else "Human-written." (Default to Human-written if uncertain.)

        Paragraph: {paragraph}\nAnswer:"""
        inputs = tokenizer(prompt, return_tensors="pt", return_attention_mask=True)
        prompt_ids = inputs["input_ids"].to(device)
        prompt_len = prompt_ids.shape[1]

        label_scores = []
        for token_ids in label_token_ids:
            label_ids = torch.tensor([token_ids], dtype=prompt_ids.dtype, device=device)
            full_ids = torch.cat([prompt_ids, label_ids], dim=1)

            with torch.no_grad():
                logits = model(input_ids=full_ids).logits

            # Logits at position t predict token t + 1, so the label tokens
            # are predicted by positions prompt_len - 1 .. end - 1.
            step_logits = logits[0, prompt_len - 1:-1, :].float()
            log_probs = torch.log_softmax(step_logits, dim=-1)
            token_log_probs = log_probs.gather(1, label_ids[0].unsqueeze(1)).squeeze(1)

            # Length-normalise so labels with more tokens are not penalised.
            label_scores.append(token_log_probs.mean().item())

        predicted_label = labels[label_scores.index(max(label_scores))]
        predicted_labels.append(predicted_label)

    return predicted_labels

def process_dataset(input_csv, output_csv, batch_size=2):
    """Classify the 'negative' paragraphs of a CSV, resuming from the checkpoint.

    Args:
        input_csv: CSV with at least 'Majority_Label' and 'Paragraph' columns.
        output_csv: Path the accumulated results are written to after every
            batch.
        batch_size: Number of paragraphs handed to query_model per call.

    Returns:
        The list of result records (checkpointed ones plus the new ones).
    """
    state = load_checkpoint()
    first_pending = state["last_processed"] + 1
    results = state["results"]

    frame = pd.read_csv(input_csv)
    negatives = frame[frame['Majority_Label'] == 'negative']
    paragraphs = negatives['Paragraph'].tolist()
    # Row labels of the input frame, kept so results can be mapped back.
    original_indices = negatives.index.tolist()

    model, tokenizer = load_model()

    try:
        for start in range(first_pending, len(paragraphs), batch_size):
            stop = start + batch_size
            chunk = paragraphs[start:stop]
            chunk_indices = original_indices[start:stop]

            chunk_labels = query_model(chunk, model, tokenizer)

            results.extend(
                {"Original_Index": idx, "Paragraph": text, "Mistral-7B": label}
                for idx, text, label in zip(chunk_indices, chunk, chunk_labels)
            )

            # Persist after every batch so at most one batch is ever lost.
            last_done = start + len(chunk) - 1
            save_checkpoint(last_done, results)
            pd.DataFrame(results).to_csv(output_csv, index=False)
            print(f"Processed up to index {last_done}")

    finally:
        # Always free the model, even when a batch fails.
        del model, tokenizer
        gc.collect()
        torch.cuda.empty_cache()

    print("Processing complete!")
    return results

if __name__ == "__main__":
    # Use the OUTPUT_FILE constant declared with the other settings so the
    # destination path is defined in a single place.
    process_dataset("2025_Majority_Labeled.csv", OUTPUT_FILE)

## Human vs AI (DeepSeek)

In [None]:
import os
os.environ["TRANSFORMERS_CACHE"] = "/XXXX/local_cache"
os.environ["HF_HOME"] = "/XXXX/local_cache"

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from huggingface_hub import login
from collections import Counter
import pandas as pd
import gc
import json
import re
import string

# Hub identifier of the model evaluated in this cell.
MODEL_NAME = "deepseek-ai/deepseek-llm-7b-chat"

# Quantization settings handed to from_pretrained: 4-bit NF4 weights with
# float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

# JSON file read and written by load_checkpoint / save_checkpoint.
CHECKPOINT_FILE = "deepseek25_checkpoint.json"
# CSV path for this model's predictions.
OUTPUT_FILE = "2025_AI_vs_Human_deepseek.csv"

def save_checkpoint(current_index, results):
    """Persist progress so an interrupted run can be resumed.

    Args:
        current_index: Position of the last processed paragraph; stored under
            the "last_processed" key.
        results: JSON-serialisable list of result records collected so far;
            stored under the "results" key.

    The payload is first written to a temporary file and then moved over
    CHECKPOINT_FILE with os.replace. A crash in the middle of a write can
    therefore never leave a truncated checkpoint behind, which
    load_checkpoint would treat as "no checkpoint" and restart from scratch.
    """
    tmp_path = f"{CHECKPOINT_FILE}.tmp"
    with open(tmp_path, "w") as f:
        json.dump({"last_processed": current_index, "results": results}, f)
    # Atomic swap: readers see either the old or the new complete file.
    os.replace(tmp_path, CHECKPOINT_FILE)

def load_checkpoint():
    """Return the saved checkpoint, or a fresh one if none is usable.

    Returns:
        The dict stored in CHECKPOINT_FILE. When the file does not exist or
        does not contain valid JSON, a new state with "last_processed" set to
        -1 and an empty "results" list is returned instead.
    """
    fresh_state = {"last_processed": -1, "results": []}
    try:
        with open(CHECKPOINT_FILE, "r") as handle:
            saved_state = json.load(handle)
    except (FileNotFoundError, json.JSONDecodeError):
        return fresh_state
    return saved_state

def load_model():
    """Load the quantized model and its tokenizer once.

    Returns:
        A ``(model, tokenizer)`` tuple for MODEL_NAME, loaded from the local
        cache directory with bnb_config and ``device_map="auto"``.
    """
    # Release whatever an earlier run left behind before loading the weights.
    gc.collect()
    torch.cuda.empty_cache()

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir="/XXXX/local_cache",)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        cache_dir="/XXXX/local_cache",
        quantization_config=bnb_config,
        device_map="auto"
    )

    if tokenizer.pad_token is None:
        if tokenizer.eos_token is not None:
            # Reuse EOS as the padding token. Adding a brand-new token would
            # require resizing the embeddings of the quantized model and
            # append an untrained row to them, for a token that is never fed
            # to the model (prompts are tokenized one at a time, unpadded).
            tokenizer.pad_token = tokenizer.eos_token
        else:
            # No EOS to fall back on: grow the vocabulary as a last resort.
            tokenizer.add_special_tokens({'pad_token': '[PAD]'})
            model.resize_token_embeddings(len(tokenizer))
    return model, tokenizer

def remove_punctuation(text):
    """Return *text* with every character of string.punctuation removed."""
    punctuation_chars = set(string.punctuation)
    return "".join(ch for ch in text if ch not in punctuation_chars)

def query_model(paragraphs, model, tokenizer):
    """Classify each paragraph as "AI-generated" or "Human-written".

    Args:
        paragraphs: Iterable of paragraph strings to classify.
        model: Causal language model; called with ``input_ids`` and expected
            to return an object exposing ``.logits``.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        A list with one label per paragraph, in input order.

    Each label is scored by the mean log-probability of its tokens when the
    label is appended to the prompt, so every label token is conditioned on
    the prompt plus the label tokens before it. Reading all label tokens off
    the single next-token distribution after the prompt would only be
    meaningful for the first token of each label.
    """
    labels = ["AI-generated", "Human-written"]
    label_token_ids = [tokenizer.encode(label, add_special_tokens=False) for label in labels]

    # Follow the model's own placement instead of assuming a CUDA device.
    device = model.device

    predicted_labels = []

    for paragraph in paragraphs:
        prompt = f"""You are a text classifier. Given a paragraph, classify whether it is \"AI-generated\" or \"Human-written\".
        Respond ONLY with one of these two labels.
        For each criterion below, assign a score:
     **+1 for AI-like** if the text strongly matches the AI trait.
     **+1 for Human-like** if the text strongly matches the human trait.
     **+0 if neutral/uncertain**.

    #### **Evaluation Steps**:
    1. **Perplexity & Creativity**:
     AI-like: Predictable word choices, clichés, or overly fluent phrasing.
     Human-like: Unusual phrasing, creative metaphors, or minor errors.
     Score: AI-like (+1) / Human-like (+1) / Neutral (0).

    2. **Burstiness**:
     AI-like: Uniform sentence length/structure (e.g., all medium-length).
     Human-like: Varied rhythm (mix of short/long sentences, interruptions).
     Score: AI-like (+1) / Human-like (+1) / Neutral (0).

    3. **Specificity & Personalization**:
     AI-like: Lacks concrete details (no names, anecdotes, or emotions).
     Human-like: Uses "I/we," personal stories, opinions, or informal language.
     Score: AI-like (+1) / Human-like (+1) / Neutral (0).

    4. **Logical Flow**:
     AI-like: Abrupt topic shifts or overly rigid structure.
     Human-like: Natural digressions, conversational flow.
     Score: AI-like (+1) / Human-like (+1) / Neutral (0).

    5. **Errors & Imperfections**:
     AI-like: Grammatically flawless, no typos/colloquialisms.
     Human-like: Minor errors, idiosyncratic punctuation, or slang.
     Score: AI-like (+1) / Human-like (+1) / Neutral (0).

    #### **Decision Rule**:
     If **total AI-like >= 3** then "AI-generated."
     Else "Human-written." (Default to Human-written if uncertain.)

Paragraph: {paragraph}\nAnswer:"""

        inputs = tokenizer(prompt, return_tensors="pt", return_token_type_ids=False, return_attention_mask=True)
        prompt_ids = inputs["input_ids"].to(device)
        prompt_len = prompt_ids.shape[1]

        label_scores = []
        for token_ids in label_token_ids:
            label_ids = torch.tensor([token_ids], dtype=prompt_ids.dtype, device=device)
            full_ids = torch.cat([prompt_ids, label_ids], dim=1)

            with torch.no_grad():
                logits = model(input_ids=full_ids).logits

            # Logits at position t predict token t + 1, so the label tokens
            # are predicted by positions prompt_len - 1 .. end - 1.
            step_logits = logits[0, prompt_len - 1:-1, :].float()
            log_probs = torch.log_softmax(step_logits, dim=-1)
            token_log_probs = log_probs.gather(1, label_ids[0].unsqueeze(1)).squeeze(1)

            # Length-normalise so labels with more tokens are not penalised.
            label_scores.append(token_log_probs.mean().item())

        predicted_label = labels[label_scores.index(max(label_scores))]
        predicted_labels.append(predicted_label)

    return predicted_labels

def process_dataset(input_csv, output_csv, batch_size=1):
    """Classify the 'negative' paragraphs of a CSV, resuming from the checkpoint.

    Args:
        input_csv: CSV with at least 'Majority_Label' and 'Paragraph' columns.
        output_csv: Path the accumulated results are written to after every
            batch.
        batch_size: Number of paragraphs handed to query_model per call.

    Returns:
        The list of result records (checkpointed ones plus the new ones).
    """
    state = load_checkpoint()
    first_pending = state["last_processed"] + 1
    results = state["results"]

    frame = pd.read_csv(input_csv)
    negatives = frame[frame['Majority_Label'] == 'negative']
    paragraphs = negatives['Paragraph'].tolist()
    # Row labels of the input frame, kept so results can be mapped back.
    original_indices = negatives.index.tolist()

    model, tokenizer = load_model()

    try:
        for start in range(first_pending, len(paragraphs), batch_size):
            stop = start + batch_size
            chunk = paragraphs[start:stop]
            chunk_indices = original_indices[start:stop]

            chunk_labels = query_model(chunk, model, tokenizer)

            results.extend(
                {"Original_Index": idx, "Paragraph": text, "Deepseek": label}
                for idx, text, label in zip(chunk_indices, chunk, chunk_labels)
            )

            # Persist after every batch so at most one batch is ever lost.
            last_done = start + len(chunk) - 1
            save_checkpoint(last_done, results)
            pd.DataFrame(results).to_csv(output_csv, index=False)
            print(f"Processed up to index {last_done}")

    finally:
        # Always free the model, even when a batch fails.
        del model, tokenizer
        gc.collect()
        torch.cuda.empty_cache()

    print("Processing complete!")
    return results

if __name__ == "__main__":
    # Use the OUTPUT_FILE constant declared with the other settings so the
    # destination path is defined in a single place.
    process_dataset("2025_Majority_Labeled.csv", OUTPUT_FILE)

## Human vs AI (Llama)

In [None]:
import os
os.environ["TRANSFORMERS_CACHE"] = "/XXXX/local_cache"
os.environ["HF_HOME"] = "/XXXX/local_cache"

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from huggingface_hub import login
from collections import Counter
import pandas as pd
import gc
import json
import re
import string

# Log in to the Hugging Face Hub with an access token.
# NOTE(review): the token is hard-coded in the notebook; consider reading it
# from an environment variable instead of committing it.
login(token="XXXX")

# Hub identifier of the model evaluated in this cell.
MODEL_NAME = "meta-llama/Llama-2-7b-chat-hf"

# Quantization settings handed to from_pretrained: 4-bit NF4 weights with
# float16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.float16
)

# JSON file read and written by load_checkpoint / save_checkpoint.
CHECKPOINT_FILE = "llama25_checkpoint.json"
# CSV path for this model's predictions.
OUTPUT_FILE = "2025_AI_vs_Human_Llama.csv"

def save_checkpoint(current_index, results):
    """Persist progress so an interrupted run can be resumed.

    Args:
        current_index: Position of the last processed paragraph; stored under
            the "last_processed" key.
        results: JSON-serialisable list of result records collected so far;
            stored under the "results" key.

    The payload is first written to a temporary file and then moved over
    CHECKPOINT_FILE with os.replace. A crash in the middle of a write can
    therefore never leave a truncated checkpoint behind, which
    load_checkpoint would treat as "no checkpoint" and restart from scratch.
    """
    tmp_path = f"{CHECKPOINT_FILE}.tmp"
    with open(tmp_path, "w") as f:
        json.dump({"last_processed": current_index, "results": results}, f)
    # Atomic swap: readers see either the old or the new complete file.
    os.replace(tmp_path, CHECKPOINT_FILE)

def load_checkpoint():
    """Return the saved checkpoint, or a fresh one if none is usable.

    Returns:
        The dict stored in CHECKPOINT_FILE. When the file does not exist or
        does not contain valid JSON, a new state with "last_processed" set to
        -1 and an empty "results" list is returned instead.
    """
    fresh_state = {"last_processed": -1, "results": []}
    try:
        with open(CHECKPOINT_FILE, "r") as handle:
            saved_state = json.load(handle)
    except (FileNotFoundError, json.JSONDecodeError):
        return fresh_state
    return saved_state

def load_model():
    """Load the quantized model and its tokenizer once.

    Returns:
        A ``(model, tokenizer)`` tuple for MODEL_NAME, loaded from the local
        cache directory with bnb_config and ``device_map="auto"``.
    """
    # Release whatever an earlier run left behind before loading the weights.
    gc.collect()
    torch.cuda.empty_cache()

    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME, cache_dir="/XXXX/local_cache",)
    model = AutoModelForCausalLM.from_pretrained(
        MODEL_NAME,
        cache_dir="/XXXX/local_cache",
        quantization_config=bnb_config,
        device_map="auto"
    )

    if tokenizer.pad_token is None:
        if tokenizer.eos_token is not None:
            # Reuse EOS as the padding token. Adding a brand-new token would
            # require resizing the embeddings of the quantized model and
            # append an untrained row to them, for a token that is never fed
            # to the model (prompts are tokenized one at a time, unpadded).
            tokenizer.pad_token = tokenizer.eos_token
        else:
            # No EOS to fall back on: grow the vocabulary as a last resort.
            tokenizer.add_special_tokens({'pad_token': '[PAD]'})
            model.resize_token_embeddings(len(tokenizer))
    return model, tokenizer

def remove_punctuation(text):
    """Return *text* with every character of string.punctuation removed."""
    punctuation_chars = set(string.punctuation)
    return "".join(ch for ch in text if ch not in punctuation_chars)

def query_model(paragraphs, model, tokenizer):
    """Classify each paragraph as "AI-generated" or "Human-written".

    Args:
        paragraphs: Iterable of paragraph strings to classify.
        model: Causal language model; called with ``input_ids`` and expected
            to return an object exposing ``.logits``.
        tokenizer: Tokenizer matching ``model``.

    Returns:
        A list with one label per paragraph, in input order.

    Each label is scored by the mean log-probability of its tokens when the
    label is appended to the prompt, so every label token is conditioned on
    the prompt plus the label tokens before it. Reading all label tokens off
    the single next-token distribution after the prompt would only be
    meaningful for the first token of each label.
    """
    labels = ["AI-generated", "Human-written"]
    label_token_ids = [tokenizer.encode(label, add_special_tokens=False) for label in labels]

    # Follow the model's own placement instead of assuming a CUDA device.
    device = model.device

    predicted_labels = []

    for paragraph in paragraphs:
        prompt = f"""You are a text classifier. Given a paragraph, classify whether it is \"AI-generated\" or \"Human-written\".
        Respond ONLY with one of these two labels.
        For each criterion below, assign a score:
     **+1 for AI-like** if the text strongly matches the AI trait.
     **+1 for Human-like** if the text strongly matches the human trait.
     **+0 if neutral/uncertain**.

    #### **Evaluation Steps**:
    1. **Perplexity & Creativity**:
     AI-like: Predictable word choices, clichés, or overly fluent phrasing.
     Human-like: Unusual phrasing, creative metaphors, or minor errors.
     Score: AI-like (+1) / Human-like (+1) / Neutral (0).

    2. **Burstiness**:
     AI-like: Uniform sentence length/structure (e.g., all medium-length).
     Human-like: Varied rhythm (mix of short/long sentences, interruptions).
     Score: AI-like (+1) / Human-like (+1) / Neutral (0).

    3. **Specificity & Personalization**:
     AI-like: Lacks concrete details (no names, anecdotes, or emotions).
     Human-like: Uses "I/we," personal stories, opinions, or informal language.
     Score: AI-like (+1) / Human-like (+1) / Neutral (0).

    4. **Logical Flow**:
     AI-like: Abrupt topic shifts or overly rigid structure.
     Human-like: Natural digressions, conversational flow.
     Score: AI-like (+1) / Human-like (+1) / Neutral (0).

    5. **Errors & Imperfections**:
     AI-like: Grammatically flawless, no typos/colloquialisms.
     Human-like: Minor errors, idiosyncratic punctuation, or slang.
     Score: AI-like (+1) / Human-like (+1) / Neutral (0).

    #### **Decision Rule**:
     If **total AI-like >= 3** then "AI-generated."
     Else "Human-written." (Default to Human-written if uncertain.)
        Paragraph: {paragraph}\nAnswer:"""
        inputs = tokenizer(prompt, return_tensors="pt", return_token_type_ids=False, return_attention_mask=True)
        prompt_ids = inputs["input_ids"].to(device)
        prompt_len = prompt_ids.shape[1]

        label_scores = []
        for token_ids in label_token_ids:
            label_ids = torch.tensor([token_ids], dtype=prompt_ids.dtype, device=device)
            full_ids = torch.cat([prompt_ids, label_ids], dim=1)

            with torch.no_grad():
                logits = model(input_ids=full_ids).logits

            # Logits at position t predict token t + 1, so the label tokens
            # are predicted by positions prompt_len - 1 .. end - 1.
            step_logits = logits[0, prompt_len - 1:-1, :].float()
            log_probs = torch.log_softmax(step_logits, dim=-1)
            token_log_probs = log_probs.gather(1, label_ids[0].unsqueeze(1)).squeeze(1)

            # Length-normalise so labels with more tokens are not penalised.
            label_scores.append(token_log_probs.mean().item())

        predicted_label = labels[label_scores.index(max(label_scores))]
        predicted_labels.append(predicted_label)

    return predicted_labels

def process_dataset(input_csv, output_csv, batch_size=1):
    """Classify the 'negative' paragraphs of a CSV, resuming from the checkpoint.

    Args:
        input_csv: CSV with at least 'Majority_Label' and 'Paragraph' columns.
        output_csv: Path the accumulated results are written to after every
            batch.
        batch_size: Number of paragraphs handed to query_model per call.

    Returns:
        The list of result records (checkpointed ones plus the new ones).
    """
    state = load_checkpoint()
    first_pending = state["last_processed"] + 1
    results = state["results"]

    frame = pd.read_csv(input_csv)
    negatives = frame[frame['Majority_Label'] == 'negative']
    paragraphs = negatives['Paragraph'].tolist()
    # Row labels of the input frame, kept so results can be mapped back.
    original_indices = negatives.index.tolist()

    model, tokenizer = load_model()

    try:
        for start in range(first_pending, len(paragraphs), batch_size):
            stop = start + batch_size
            chunk = paragraphs[start:stop]
            chunk_indices = original_indices[start:stop]

            chunk_labels = query_model(chunk, model, tokenizer)

            results.extend(
                {"Original_Index": idx, "Paragraph": text, "Llama-7B": label}
                for idx, text, label in zip(chunk_indices, chunk, chunk_labels)
            )

            # Persist after every batch so at most one batch is ever lost.
            last_done = start + len(chunk) - 1
            save_checkpoint(last_done, results)
            pd.DataFrame(results).to_csv(output_csv, index=False)
            print(f"Processed up to index {last_done}")

    finally:
        # Always free the model, even when a batch fails.
        del model, tokenizer
        gc.collect()
        torch.cuda.empty_cache()

    print("Processing complete!")
    return results

if __name__ == "__main__":
    # Use the OUTPUT_FILE constant declared with the other settings so the
    # destination path is defined in a single place.
    process_dataset("2025_Majority_Labeled.csv", OUTPUT_FILE)

## 2023

In [None]:
# Load the 2023 prediction files, one CSV per model.
# NOTE(review): the files are decoded as ISO-8859-1; if they were written with
# pandas' default UTF-8 encoding, non-ASCII characters in "Paragraph" will be
# garbled -- confirm the encoding of the files on disk.
hum_ai_Mistral_2023 = pd.read_csv(r"/XXXX/ACL/CSVs/AI vs Human/Mistral Final/Logit based/2023_AI_vs_Human_Mistral.csv",encoding="ISO-8859-1")
hum_ai_Llama_2023 = pd.read_csv(r"/XXXX/ACL/CSVs/AI vs Human/Llama Final/Logit based/2023_AI_vs_Human_Llama.csv",encoding="ISO-8859-1")
hum_ai_DeepSeek_2023 = pd.read_csv(r"/XXXX/ACL/CSVs/AI vs Human/DeepSeek Final/Logit based/2023_AI_vs_Human_deepseek.csv",encoding="ISO-8859-1")

In [None]:
# Combine the three per-model tables on Original_Index. merge() defaults to an
# inner join, so only paragraphs present in all three files are kept.
temp_df = hum_ai_Mistral_2023.merge(hum_ai_Llama_2023, on='Original_Index')
merged_df = temp_df.merge(hum_ai_DeepSeek_2023, on='Original_Index')


In [None]:
merged_df_23=merged_df[["Original_Index","Paragraph","Mistral-7B","Llama-7B","Deepseek"]]

In [None]:
merged_df_23

Unnamed: 0,Original_Index,Paragraph,Mistral-7B,Llama-7B,Deepseek
0,31,Iterative Back-Translation (IBT) (Hoang et al....,AI-generated,Human-written,Human-written
1,66,We would like to thank all the LKLab lab mates...,Human-written,Human-written,Human-written
2,88,"DocRED and DocREDScratch. DocRED contains 56,3...",Human-written,Human-written,AI-generated
3,96,3 A1. Did you describe the limitations of your...,Human-written,AI-generated,AI-generated
4,110,We analyze the metric evaluation with respect ...,Human-written,Human-written,Human-written
...,...,...,...,...,...
2284,8665,"Following Cui et al. (2022), we comprehensivel...",Human-written,AI-generated,Human-written
2285,8668,We mentioned the necessity of developing multi...,Human-written,Human-written,Human-written
2286,8669,"As Table 3 suggests, MCWQ-R is easier than its...",AI-generated,AI-generated,Human-written
2287,8670,PLM comparison. mT5 fine-tuned on English fail...,Human-written,Human-written,Human-written


In [None]:
# Per-model label lists for 2023. Read them from merged_df_23 rather than the
# scratch name merged_df, which the 2024 section overwrites; re-running this
# cell later would otherwise silently pick up the 2024 predictions.
mis_23 = merged_df_23["Mistral-7B"].tolist()
lm_23 = merged_df_23["Llama-7B"].tolist()
dp_23 = merged_df_23["Deepseek"].tolist()

In [None]:
import numpy as np
from collections import Counter

# One row per model, one column per paragraph.
model_predictions = [mis_23, lm_23, dp_23]
predictions = np.array(model_predictions)

# Majority label per paragraph (each column of `predictions` is one paragraph).
final_predictions = [
    Counter(column).most_common(1)[0][0] for column in predictions.T
]

# Share of models that agree with the majority label, per paragraph.
consensus_scores = [
    np.mean(column == winner)
    for column, winner in zip(predictions.T, final_predictions)
]

print("Average Consensus:", f"{np.mean(consensus_scores):.0%}")

Average Consensus: 81%


In [None]:
f_23=merged_df_23[["Original_Index","Paragraph"]].copy()
f_23["AI_HUM"]=final_predictions

In [None]:
f_23

Unnamed: 0,Original_Index,Paragraph,AI_HUM
0,31,Iterative Back-Translation (IBT) (Hoang et al....,Human-written
1,66,We would like to thank all the LKLab lab mates...,Human-written
2,88,"DocRED and DocREDScratch. DocRED contains 56,3...",Human-written
3,96,3 A1. Did you describe the limitations of your...,AI-generated
4,110,We analyze the metric evaluation with respect ...,Human-written
...,...,...,...
2284,8665,"Following Cui et al. (2022), we comprehensivel...",Human-written
2285,8668,We mentioned the necessity of developing multi...,Human-written
2286,8669,"As Table 3 suggests, MCWQ-R is easier than its...",AI-generated
2287,8670,PLM comparison. mT5 fine-tuned on English fail...,Human-written


In [None]:
f_23=f_23[["Original_Index","AI_HUM"]]

In [None]:
f_23

Unnamed: 0,Original_Index,AI_HUM
0,31,Human-written
1,66,Human-written
2,88,Human-written
3,96,AI-generated
4,110,Human-written
...,...,...
2284,8665,Human-written
2285,8668,Human-written
2286,8669,AI-generated
2287,8670,Human-written


In [None]:
df_2023 = pd.read_csv(r"/XXXX/ACL/CSVs/After_Majority_Voting_Relevance/2023_Majority_Labeled.csv",encoding="ISO-8859-1")

In [None]:
df_2023=df_2023[["Title","Abstract","Paragraph","Majority_Label"]]

In [None]:
df_2023

Unnamed: 0,Title,Abstract,Paragraph,Majority_Label
0,A Critical Evaluation of Evaluations for Long-...,Long-form question answering (LFQA) enables an...,"Long-form question answering (Fan et al., 2019...",positive
1,A Critical Evaluation of Evaluations for Long-...,Long-form question answering (LFQA) enables an...,Fa ct ua lit y Does your body ab-\nsorb all bl...,positive
2,A Critical Evaluation of Evaluations for Long-...,Long-form question answering (LFQA) enables an...,We begin by reviewing the evaluation protocols...,positive
3,A Critical Evaluation of Evaluations for Long-...,Long-form question answering (LFQA) enables an...,Prior LFQA human evaluations use non-expert cr...,positive
4,A Critical Evaluation of Evaluations for Long-...,Long-form question answering (LFQA) enables an...,Hiring experts: We recruit domain experts on t...,positive
...,...,...,...,...
8686,On Evaluating Multilingual Compositional Gener...,Compositional generalization allows efficient ...,"linguistic phenomena, demographic groups repre...",positive
8687,On Evaluating Multilingual Compositional Gener...,Compositional generalization allows efficient ...,"(e.g., GPU hours), and computing infrastructur...",positive
8688,On Evaluating Multilingual Compositional Gener...,Compositional generalization allows efficient ...,"statistics from sets of experiments), and is i...",positive
8689,On Evaluating Multilingual Compositional Gener...,Compositional generalization allows efficient ...,"you report the implementation, model, and para...",positive


In [None]:
# Attach the ensemble label to the full 2023 table. concat(axis=1) aligns the
# two frames on their row index, so AI_HUM lands on the row whose index equals
# Original_Index; rows without a prediction get NaN.
# NOTE(review): assumes df_2023 still carries the same row index that
# Original_Index was taken from -- confirm.
Final_2023 = pd.concat([df_2023, f_23.set_index('Original_Index')], axis=1).reset_index()
# NOTE(review): pd.concat already returns a DataFrame, so this re-wrap looks
# redundant.
Final_2023 = pd.DataFrame(Final_2023)
Final_2023

Unnamed: 0,index,Title,Abstract,Paragraph,Majority_Label,AI_HUM
0,0,A Critical Evaluation of Evaluations for Long-...,Long-form question answering (LFQA) enables an...,"Long-form question answering (Fan et al., 2019...",positive,
1,1,A Critical Evaluation of Evaluations for Long-...,Long-form question answering (LFQA) enables an...,Fa ct ua lit y Does your body ab-\nsorb all bl...,positive,
2,2,A Critical Evaluation of Evaluations for Long-...,Long-form question answering (LFQA) enables an...,We begin by reviewing the evaluation protocols...,positive,
3,3,A Critical Evaluation of Evaluations for Long-...,Long-form question answering (LFQA) enables an...,Prior LFQA human evaluations use non-expert cr...,positive,
4,4,A Critical Evaluation of Evaluations for Long-...,Long-form question answering (LFQA) enables an...,Hiring experts: We recruit domain experts on t...,positive,
...,...,...,...,...,...,...
8686,8686,On Evaluating Multilingual Compositional Gener...,Compositional generalization allows efficient ...,"linguistic phenomena, demographic groups repre...",positive,
8687,8687,On Evaluating Multilingual Compositional Gener...,Compositional generalization allows efficient ...,"(e.g., GPU hours), and computing infrastructur...",positive,
8688,8688,On Evaluating Multilingual Compositional Gener...,Compositional generalization allows efficient ...,"statistics from sets of experiments), and is i...",positive,
8689,8689,On Evaluating Multilingual Compositional Gener...,Compositional generalization allows efficient ...,"you report the implementation, model, and para...",positive,


In [None]:
Final_2023=Final_2023[["Title",	"Abstract",	"Paragraph", "Majority_Label",	"AI_HUM"]]

In [None]:
# Mark paragraphs that were never classified with the string 'N/A'.
# Build a new frame with assign() instead of writing the column in place:
# Final_2023 was produced by column selection, and in-place assignment on such
# a frame is what triggers pandas' SettingWithCopyWarning.
# NOTE(review): pandas.read_csv parses the string 'N/A' as missing by default,
# so readers of the exported CSV need keep_default_na=False to see this marker.
Final_2023 = Final_2023.assign(AI_HUM=Final_2023['AI_HUM'].fillna('N/A'))


0       N/A
1       N/A
2       N/A
3       N/A
4       N/A
       ... 
8686    N/A
8687    N/A
8688    N/A
8689    N/A
8690    N/A
Name: AI_HUM, Length: 8691, dtype: object

In [None]:
Final_2023.loc[100:130]

Unnamed: 0,Title,Abstract,Paragraph,Majority_Label,AI_HUM
100,Revisiting the Gold Standard: Grounding Summar...,Human evaluation is the foundation upon which ...,Inspired by the Pyramid (Nenkova and Passonnea...,positive,
101,Revisiting the Gold Standard: Grounding Summar...,Human evaluation is the foundation upon which ...,"CNNDM Test 500 12 5.6k 6k CNNDM V lid 1,000 8 ...",positive,
102,Revisiting the Gold Standard: Grounding Summar...,Human evaluation is the foundation upon which ...,We collect ACU annotations on three summarizat...,positive,
103,Revisiting the Gold Standard: Grounding Summar...,Human evaluation is the foundation upon which ...,We analyze the statistical power of our collec...,positive,
104,Revisiting the Gold Standard: Grounding Summar...,Human evaluation is the foundation upon which ...,"As a case study, in Tab. 3 we analyze the summ...",positive,
105,Revisiting the Gold Standard: Grounding Summar...,Human evaluation is the foundation upon which ...,"Apart from ACU annotations, we collect human a...",positive,
106,Revisiting the Gold Standard: Grounding Summar...,Human evaluation is the foundation upon which ...,We collected three annotations per summary on ...,positive,
107,Revisiting the Gold Standard: Grounding Summar...,Human evaluation is the foundation upon which ...,We investigate both the summary-level and syst...,positive,
108,Revisiting the Gold Standard: Grounding Summar...,Human evaluation is the foundation upon which ...,We analyze several representative automatic me...,positive,
109,Revisiting the Gold Standard: Grounding Summar...,Human evaluation is the foundation upon which ...,We use the correlations between automatic metr...,positive,


In [None]:
Final_2023.to_csv("Final_2023.csv")

## 2024

In [None]:
# Load the 2024 prediction files, one CSV per model.
# NOTE(review): the files are decoded as ISO-8859-1; if they were written with
# pandas' default UTF-8 encoding, non-ASCII characters in "Paragraph" will be
# garbled -- confirm the encoding of the files on disk.
hum_ai_Mistral_2024 = pd.read_csv(r"/XXXX/ACL/CSVs/AI vs Human/Mistral Final/Logit based/2024_AI_vs_Human_Mistral.csv",encoding="ISO-8859-1")
hum_ai_Llama_2024 = pd.read_csv(r"/XXXX/ACL/CSVs/AI vs Human/Llama Final/Logit based/2024_AI_vs_Human_Llama.csv",encoding="ISO-8859-1")
hum_ai_DeepSeek_2024 = pd.read_csv(r"/XXXX/ACL/CSVs/AI vs Human/DeepSeek Final/Logit based/2024_AI_vs_Human_deepseek.csv",encoding="ISO-8859-1")

In [None]:
# Combine the three per-model tables on Original_Index. merge() defaults to an
# inner join, so only paragraphs present in all three files are kept.
temp_df = hum_ai_Mistral_2024.merge(hum_ai_Llama_2024, on='Original_Index')
merged_df = temp_df.merge(hum_ai_DeepSeek_2024, on='Original_Index')


In [None]:
merged_df_24=merged_df[["Original_Index","Paragraph","Mistral-7B","Llama-7B","Deepseek"]]

In [None]:
mis_24=merged_df_24["Mistral-7B"].tolist()
lm_24=merged_df_24["Llama-7B"].tolist()
dp_24=merged_df_24["Deepseek"].tolist()

In [None]:
import numpy as np
from collections import Counter

# One row per model, one column per paragraph.
model_predictions = [mis_24, lm_24, dp_24]
predictions = np.array(model_predictions)

# Majority label per paragraph (each column of `predictions` is one paragraph).
final_predictions = [
    Counter(column).most_common(1)[0][0] for column in predictions.T
]

# Share of models that agree with the majority label, per paragraph.
consensus_scores = [
    np.mean(column == winner)
    for column, winner in zip(predictions.T, final_predictions)
]

print("Average Consensus:", f"{np.mean(consensus_scores):.0%}")

Average Consensus: 81%


In [None]:
f_24=merged_df_24[["Original_Index","Paragraph"]].copy()
f_24["AI_HUM"]=final_predictions

In [None]:
f_24

Unnamed: 0,Original_Index,Paragraph,AI_HUM
0,0,Recent Large Language Models (LLMs) have made ...,Human-written
1,2,Significant strides have been made in long-for...,Human-written
2,4,An alternative framework for evaluating long-f...,Human-written
3,5,Meta-questions were manually raised by five ex...,AI-generated
4,23,We would like to express our profound gratitud...,Human-written
...,...,...,...
2127,8179,This work was supported by the National Natura...,Human-written
2128,8180,The statistical data for the MultiWOZ dataset ...,Human-written
2129,8181,"In this section, we provide a detailed overvie...",Human-written
2130,8182,To explore where we should apply DualLoRA in T...,Human-written


In [None]:
f_24=f_24[["Original_Index","AI_HUM"]]

In [None]:
f_24

Unnamed: 0,Original_Index,AI_HUM
0,0,Human-written
1,2,Human-written
2,4,Human-written
3,5,AI-generated
4,23,Human-written
...,...,...
2127,8179,Human-written
2128,8180,Human-written
2129,8181,Human-written
2130,8182,Human-written


In [None]:
df_2024 = pd.read_csv(r"/XXXX/ACL/CSVs/After_Majority_Voting_Relevance/2024_Majority_Labeled.csv",encoding="ISO-8859-1")

In [None]:
df_2024=df_2024[["Title","Abstract","Paragraph","Majority_Label"]]

In [None]:
# Attach the ensemble label to the full 2024 table. concat(axis=1) aligns the
# two frames on their row index, so AI_HUM lands on the row whose index equals
# Original_Index; rows without a prediction get NaN.
# NOTE(review): assumes df_2024 still carries the same row index that
# Original_Index was taken from -- confirm.
Final_2024 = pd.concat([df_2024, f_24.set_index('Original_Index')], axis=1).reset_index()
# NOTE(review): pd.concat already returns a DataFrame, so this re-wrap looks
# redundant.
Final_2024 = pd.DataFrame(Final_2024)
Final_2024

Unnamed: 0,index,Title,Abstract,Paragraph,Majority_Label,AI_HUM
0,0,PROXYQA: An Alternative Framework for Evaluati...,Large Language Models (LLMs) have succeeded re...,Recent Large Language Models (LLMs) have made ...,negative,Human-written
1,1,PROXYQA: An Alternative Framework for Evaluati...,Large Language Models (LLMs) have succeeded re...,spanning tens of thousands of tokens (Anthropi...,positive,
2,2,PROXYQA: An Alternative Framework for Evaluati...,Large Language Models (LLMs) have succeeded re...,Significant strides have been made in long-for...,negative,Human-written
3,3,PROXYQA: An Alternative Framework for Evaluati...,Large Language Models (LLMs) have succeeded re...,Automated metrics such as surface form matchin...,positive,
4,4,PROXYQA: An Alternative Framework for Evaluati...,Large Language Models (LLMs) have succeeded re...,An alternative framework for evaluating long-f...,negative,Human-written
...,...,...,...,...,...,...
8180,8180,Zero-Shot Cross-Domain Dialogue State Tracking...,Zero-shot dialogue state tracking (DST) seeks ...,The statistical data for the MultiWOZ dataset ...,negative,Human-written
8181,8181,Zero-Shot Cross-Domain Dialogue State Tracking...,Zero-shot dialogue state tracking (DST) seeks ...,"In this section, we provide a detailed overvie...",negative,Human-written
8182,8182,Zero-Shot Cross-Domain Dialogue State Tracking...,Zero-shot dialogue state tracking (DST) seeks ...,To explore where we should apply DualLoRA in T...,negative,Human-written
8183,8183,Zero-Shot Cross-Domain Dialogue State Tracking...,Zero-shot dialogue state tracking (DST) seeks ...,To delve into the slot accuracy performance of...,negative,Human-written


In [None]:
Final_2024=Final_2024[["Title",	"Abstract",	"Paragraph", "Majority_Label",	"AI_HUM"]]

In [None]:
# Mark paragraphs that were never classified with the string 'N/A'.
# Build a new frame with assign() instead of writing the column in place:
# Final_2024 was produced by column selection, and in-place assignment on such
# a frame is what triggers pandas' SettingWithCopyWarning.
# NOTE(review): pandas.read_csv parses the string 'N/A' as missing by default,
# so readers of the exported CSV need keep_default_na=False to see this marker.
Final_2024 = Final_2024.assign(AI_HUM=Final_2024['AI_HUM'].fillna('N/A'))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  Final_2024['AI_HUM']=Final_2024['AI_HUM'].fillna('N/A')


In [None]:
Final_2024.to_csv("Final_2024.csv")

In [None]:
Final_2024

Unnamed: 0,Title,Abstract,Paragraph,Majority_Label,AI_HUM
0,PROXYQA: An Alternative Framework for Evaluati...,Large Language Models (LLMs) have succeeded re...,Recent Large Language Models (LLMs) have made ...,negative,Human-written
1,PROXYQA: An Alternative Framework for Evaluati...,Large Language Models (LLMs) have succeeded re...,spanning tens of thousands of tokens (Anthropi...,positive,
2,PROXYQA: An Alternative Framework for Evaluati...,Large Language Models (LLMs) have succeeded re...,Significant strides have been made in long-for...,negative,Human-written
3,PROXYQA: An Alternative Framework for Evaluati...,Large Language Models (LLMs) have succeeded re...,Automated metrics such as surface form matchin...,positive,
4,PROXYQA: An Alternative Framework for Evaluati...,Large Language Models (LLMs) have succeeded re...,An alternative framework for evaluating long-f...,negative,Human-written
...,...,...,...,...,...
8180,Zero-Shot Cross-Domain Dialogue State Tracking...,Zero-shot dialogue state tracking (DST) seeks ...,The statistical data for the MultiWOZ dataset ...,negative,Human-written
8181,Zero-Shot Cross-Domain Dialogue State Tracking...,Zero-shot dialogue state tracking (DST) seeks ...,"In this section, we provide a detailed overvie...",negative,Human-written
8182,Zero-Shot Cross-Domain Dialogue State Tracking...,Zero-shot dialogue state tracking (DST) seeks ...,To explore where we should apply DualLoRA in T...,negative,Human-written
8183,Zero-Shot Cross-Domain Dialogue State Tracking...,Zero-shot dialogue state tracking (DST) seeks ...,To delve into the slot accuracy performance of...,negative,Human-written


## Consensus Score

In [None]:
import numpy as np
from collections import Counter

# Pool the 2023 and 2024 label lists of each model.
mis_23_24 = mis_23 + mis_24
lm_23_24 = lm_23 + lm_24
dp_23_24 = dp_23 + dp_24

# One row per model, one column per paragraph.
model_predictions = [mis_23_24, lm_23_24, dp_23_24]
predictions = np.array(model_predictions)

# Majority label per paragraph (each column of `predictions` is one paragraph).
final_predictions = [
    Counter(column).most_common(1)[0][0] for column in predictions.T
]

# Share of models that agree with the majority label, per paragraph.
consensus_scores = [
    np.mean(column == winner)
    for column, winner in zip(predictions.T, final_predictions)
]

print("Average Consensus:", f"{np.mean(consensus_scores):.0%}")

Average Consensus: 81%
