In [5]:
import pandas as pd

df = pd.read_csv("final_dataset.csv")

total_samples = 1600
categories = df['category'].unique()
n_categories = len(categories)
samples_per_category = total_samples // n_categories

balanced_dfs = []

for cat in categories:
    # Half AI, half Human for this category
    samples_per_label = samples_per_category // 2
    
    for lbl in df['label'].unique():
        subset = df[(df['category'] == cat) & (df['label'] == lbl)]
        if len(subset) > 0:
            if len(subset) >= samples_per_label:
                sampled = subset.sample(n=samples_per_label, random_state=42)
            else:
                # If not enough, oversample
                sampled = subset.sample(n=samples_per_label, replace=True, random_state=42)
            balanced_dfs.append(sampled)

# Combine and shuffle
balanced_df = pd.concat(balanced_dfs).sample(frac=1, random_state=42).reset_index(drop=True)

# Save
balanced_df.to_csv("balanced_1500_dataset.csv", index=False)

print("Final dataset shape:", balanced_df.shape)
print("\nCounts per category and label in balanced dataset:")
print(balanced_df.groupby(['category', 'label']).size())


Final dataset shape: (1600, 3)

Counts per category and label in balanced dataset:
category  label
essays    ai       200
          human    200
novels    ai       200
          human    200
poems     ai       200
          human    200
stories   ai       200
          human    200
dtype: int64


In [6]:
import pandas as pd

# Load dataset
df = pd.read_csv("balanced_1500_dataset.csv")

# Show first few rows to verify structure
print(df.head())

# Check column names
print("\nColumns:", df.columns)

# --- Assuming the dataset has columns: 'label' (AI/Human) and 'category' ---
# Replace with the actual column names if different

# 1. Count AI vs Human
label_counts = df['label'].value_counts()
print("\nTotal AI vs Human samples:")
print(label_counts)

# 2. AI categories count
ai_categories = df[df['label'] == 'AI']['category'].value_counts()
print("\nAI categories distribution:")
print(ai_categories)

# 3. Human categories count
human_categories = df[df['label'] == 'Human']['category'].value_counts()
print("\nHuman categories distribution:")
print(human_categories)


                                                text  label category
0  (“On the whole, there is very little disputing...  human   essays
1  রাতের নির্জন শান্তি ভেঙে দিয়েছিল দূরে থেকে কাঁ...     ai   novels
2  তাঁর গালে ছোট্ট করে চুমু দিল রানা। ‘গুডবাই, বড...  human   novels
3  দাঁড়ানোর ভঙ্গিটি জড়োসড়ো। যেন পড়া না পারার ...  human   novels
4  চলেছি। নির্জন পথে নিঃসঙ্গ চলার একটা আকর্ষণ আছে...  human  stories

Columns: Index(['text', 'label', 'category'], dtype='object')

Total AI vs Human samples:
label
human    800
ai       800
Name: count, dtype: int64

AI categories distribution:
Series([], Name: count, dtype: int64)

Human categories distribution:
Series([], Name: count, dtype: int64)
