# Dataset Analysis

In [16]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load your dataset
df = pd.read_csv('Human_AI.csv')  # or pd.read_excel(), pd.read_json()

# Basic info
print("📄 Basic Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appended a stray "None" line to the output.
df.info()

print("\n🔢 First 5 Rows:")
print(df.head())

print("\n🧭 Dataset Shape:")
print(f"Rows: {df.shape[0]}, Columns: {df.shape[1]}")

# Check for missing values (the null mask is computed once and reused)
missing_mask = df.isnull()
print("\n🕳️ Missing Values:")
print(missing_mask.sum())
print("\nPercentage of Missing Values:")
print((missing_mask.mean() * 100).round(2))

# Check for duplicates
print("\n🧬 Duplicated Rows:")
print(df.duplicated().sum())

# Check data types
print("\n📦 Data Types:")
print(df.dtypes)

# Check for constant columns
# dropna=False counts NaN as a value: a column holding one value plus missing
# entries is not reported as constant, while an all-NaN column is.
print("\n🧱 Constant Columns:")
print([col for col in df.columns if df[col].nunique(dropna=False) <= 1])

# Unique value counts
print("\n🔍 Unique Value Count per Column:")
print(df.nunique())

# Describe numeric columns
print("\n📊 Statistical Summary (Numeric):")
print(df.describe())

# Describe categorical columns
# describe(include='object') raises when the frame has no object columns,
# so the summary is only requested when at least one exists.
categorical_cols = df.select_dtypes(include='object').columns
print("\n🗂️ Summary (Categorical):")
if len(categorical_cols) > 0:
    print(df.describe(include='object'))
else:
    print("No categorical (object) columns.")

# Value counts for categorical columns (top 3 categories)
print("\n🧾 Top Value Counts (Categorical Columns):")
for col in categorical_cols:
    print(f"\n{col}:")
    print(df[col].value_counts().head(3))



📄 Basic Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18114 entries, 0 to 18113
Data columns (total 3 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   text      18114 non-null  object
 1   label     18114 non-null  object
 2   category  18114 non-null  object
dtypes: object(3)
memory usage: 424.7+ KB
None

🔢 First 5 Rows:
                                                text  label category
0  দেবযান – উপন্যাস – বিভূতিভূষণ বন্দ্যোপাধ্যায় ১...  human   novels
1  বরপক্ষের নিবাস কলকাতা, আজই বেলা তিনটের সময় মো...  human   novels
2  হবে, কলকাতা থেকে বরপক্ষ ভাল বাজি এনেচে। এসব পা...  human   novels
3  যতীন হেসে বল্লে–কেমন, বাজারের খাবার কিনত

# Model

In [19]:
import pandas as pd
import torch
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from transformers import DataCollatorWithPadding
from sklearn.metrics import accuracy_score, f1_score
from datasets import Dataset
import numpy as np

# Load and prepare dataset
df = pd.read_csv("Human_AI.csv")
# Keep only the two columns the model needs and the two expected classes.
# The explicit .copy() makes `df` an independent frame, so the label
# assignment below does not write into a slice of the loaded data (which can
# trigger pandas' SettingWithCopyWarning).
df = df.loc[df['label'].isin(['human', 'ai']), ['text', 'label']].copy()
df['label'] = df['label'].map({'human': 0, 'ai': 1})  # Encode labels

# Split (stratified so both splits keep the same human/ai ratio)
train_df, test_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df['label'],
    random_state=42,
)

# Convert to HuggingFace Datasets
# After filtering and shuffling the frames carry a non-default index;
# preserve_index=False keeps it from being stored as an extra
# `__index_level_0__` column in the datasets.
train_ds = Dataset.from_pandas(train_df, preserve_index=False)
test_ds = Dataset.from_pandas(test_df, preserve_index=False)

# Load tokenizer and model
model_name = "sagorsarker/bangla-bert-base"
tokenizer = AutoTokenizer.from_pretrained(model_name)

def tokenize_function(examples):
    """Tokenize a batch of examples for the classifier.

    Args:
        examples: A batch from ``Dataset.map(..., batched=True)``; its
            ``"text"`` entry is passed to the tokenizer.

    Returns:
        The tokenizer output for the batch, truncated to at most 512 tokens
        per example and left unpadded.
    """
    # No padding here on purpose: with batched map, padding=True pads every
    # example to the longest one in the whole map batch (up to max_length),
    # which is stored in the dataset and processed on every training step.
    # The DataCollatorWithPadding given to the Trainer pads each training
    # batch dynamically to its own longest sequence instead.
    return tokenizer(examples["text"], truncation=True, max_length=512)

# Tokenize both splits in batches (train first, then test)
train_ds, test_ds = (
    split.map(tokenize_function, batched=True)
    for split in (train_ds, test_ds)
)

# Define model: the pretrained checkpoint with a 2-label classification head
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=2,
)

# Metrics
def compute_metrics(eval_pred):
    """Compute accuracy and F1 for one evaluation pass.

    Args:
        eval_pred: A pair that unpacks into ``(logits, labels)``.

    Returns:
        dict with keys ``"accuracy"`` and ``"f1"``.
    """
    logits, labels = eval_pred
    # The predicted class is the index of the largest logit along the last axis.
    predicted = np.argmax(logits, axis=-1)

    scores = {}
    scores["accuracy"] = accuracy_score(labels, predicted)
    scores["f1"] = f1_score(labels, predicted)
    return scores

# TrainingArguments
# Evaluation and checkpointing both run once per epoch, which
# load_best_model_at_end needs in order to pick the best checkpoint by "f1".
training_args = TrainingArguments(
    output_dir="./banglabert-human-ai",
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_dir="./logs",
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,
    weight_decay=0.01,
    load_best_model_at_end=True,
    logging_steps=10,
    save_total_limit=2,
    metric_for_best_model="f1"
)

# Data collator (pads each batch to its longest sequence)
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)

# Trainer
# `processing_class` replaces the `tokenizer` keyword, which is deprecated on
# Trainer.__init__ and emitted a FutureWarning when this cell ran.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=test_ds,
    processing_class=tokenizer,
    data_collator=data_collator,
    compute_metrics=compute_metrics
)

# Train the model
trainer.train()

# Save the final model and tokenizer next to the checkpoints
# (reuses the configured output directory instead of repeating the literal).
model.save_pretrained(training_args.output_dir)
tokenizer.save_pretrained(training_args.output_dir)

The OrderedVocab you are attempting to save contains holes for indices [1015, 1016, 1017, 1018, 1053, 1054, 1055, 1056, 1057, 1060, 1061, 1062, 1064, 1065, 1066, 1067, 1068, 1069, 1070, 1071, 1072, 1073, 1074, 1075, 1076, 1077, 1079, 1080, 1081, 1082, 1083, 1084, 1085, 1086, 1087, 1088, 1089, 1090, 1091, 1092, 1093, 1094, 1095, 1099, 1101, 1112, 1113, 1556, 1557, 1568], your vocabulary could be corrupted !


Map:   0%|          | 0/14491 [00:00<?, ? examples/s]

The OrderedVocab you are attempting to save contains holes for indices [1015, 1016, 1017, 1018, 1053, 1054, 1055, 1056, 1057, 1060, 1061, 1062, 1064, 1065, 1066, 1067, 1068, 1069, 1070, 1071, 1072, 1073, 1074, 1075, 1076, 1077, 1079, 1080, 1081, 1082, 1083, 1084, 1085, 1086, 1087, 1088, 1089, 1090, 1091, 1092, 1093, 1094, 1095, 1099, 1101, 1112, 1113, 1556, 1557, 1568], your vocabulary could be corrupted !


Map:   0%|          | 0/3623 [00:00<?, ? examples/s]

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at sagorsarker/bangla-bert-base and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  trainer = Trainer(


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 