# Exploratory Data Analysis (EDA)

In [29]:
import pandas as pd
import numpy as np
from datasets import load_dataset
import matplotlib.pyplot as plt
import seaborn as sns
import os

# Reached only when every import above succeeded, so this line doubles as a
# quick environment check for the notebook.
print("✓ Setup complete!")

✓ Setup complete!


# Data Exploration

**Objective:** Understand the structure, quality, and characteristics of both datasets (the Chatbot Arena dataset and the Kaggle LLM comparison dataset) before cleaning.

In [30]:
# Chatbot Arena data: read the raw CSV and report its dimensions.
df_arena = pd.read_csv('../data/raw/chatbot_arena.csv')
print("{} rows × {} columns".format(*df_arena.shape))

# Kaggle LLM comparison data (adjust the filename if your copy is named differently).
df_kaggle = pd.read_csv('../data/raw/llm_comparison_dataset.csv')
print("{} rows × {} columns".format(*df_kaggle.shape))

57477 rows × 9 columns
200 rows × 15 columns


### Inspect Arena Dataset

In [31]:
# Show the Arena column names as a plain Python list, under a heading line.
print("Columns in Arena Dataset: ", df_arena.columns.tolist(), sep="\n")

Columns in Arena Dataset: 
['id', 'model_a', 'model_b', 'prompt', 'response_a', 'response_b', 'winner_model_a', 'winner_model_b', 'winner_tie']


In [32]:
# Preview the first five Arena rows (bare expression so the notebook renders it).
df_arena.head(n=5)

Unnamed: 0,id,model_a,model_b,prompt,response_a,response_b,winner_model_a,winner_model_b,winner_tie
0,30192,gpt-4-1106-preview,gpt-4-0613,"[""Is it morally right to try to have a certain...","[""The question of whether it is morally right ...","[""As an AI, I don't have personal beliefs or o...",1,0,0
1,53567,koala-13b,gpt-4-0613,"[""What is the difference between marriage lice...","[""A marriage license is a legal document that ...","[""A marriage license and a marriage certificat...",0,1,0
2,65089,gpt-3.5-turbo-0613,mistral-medium,"[""explain function calling. how would you call...","[""Function calling is the process of invoking ...","[""Function calling is the process of invoking ...",0,0,1
3,96401,llama-2-13b-chat,mistral-7b-instruct,"[""How can I create a test set for a very rare ...","[""Creating a test set for a very rare category...","[""When building a classifier for a very rare c...",1,0,0
4,198779,koala-13b,gpt-3.5-turbo-0314,"[""What is the best way to travel from Tel-Aviv...","[""The best way to travel from Tel Aviv to Jeru...","[""The best way to travel from Tel-Aviv to Jeru...",0,1,0


In [33]:
# Per-column dtype and non-null count for the Arena frame; used here to check
# for missing values before cleaning.
df_arena.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 57477 entries, 0 to 57476
Data columns (total 9 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              57477 non-null  int64 
 1   model_a         57477 non-null  object
 2   model_b         57477 non-null  object
 3   prompt          57477 non-null  object
 4   response_a      57477 non-null  object
 5   response_b      57477 non-null  object
 6   winner_model_a  57477 non-null  int64 
 7   winner_model_b  57477 non-null  int64 
 8   winner_tie      57477 non-null  int64 
dtypes: int64(4), object(5)
memory usage: 3.9+ MB


### Inspect Kaggle Dataset

In [34]:
# Show the Kaggle column names as a plain Python list, under a heading line.
print("Columns in Kaggle Dataset: ", df_kaggle.columns.tolist(), sep="\n")

Columns in Kaggle Dataset: 
['Model', 'Provider', 'Context Window', 'Speed (tokens/sec)', 'Latency (sec)', 'Benchmark (MMLU)', 'Benchmark (Chatbot Arena)', 'Open-Source', 'Price / Million Tokens', 'Training Dataset Size', 'Compute Power', 'Energy Efficiency', 'Quality Rating', 'Speed Rating', 'Price Rating']


In [35]:
# Preview the first five Kaggle rows (bare expression so the notebook renders it).
df_kaggle.head(n=5)

Unnamed: 0,Model,Provider,Context Window,Speed (tokens/sec),Latency (sec),Benchmark (MMLU),Benchmark (Chatbot Arena),Open-Source,Price / Million Tokens,Training Dataset Size,Compute Power,Energy Efficiency,Quality Rating,Speed Rating,Price Rating
0,DeepSeek-4,Deepseek,128000,95,2.74,85,1143,1,18.81,760952565,13,0.5,2,2,3
1,Llama-8,Meta AI,300000,284,3.21,71,1390,1,3.98,22891342,22,2.07,1,3,3
2,Llama-5,Meta AI,300000,225,2.95,85,1406,0,1.02,827422145,21,0.95,2,3,2
3,DeepSeek-3,Deepseek,2000000,242,12.89,72,1264,1,27.63,694305632,86,3.51,1,3,3
4,DeepSeek-8,Deepseek,1000000,71,3.8,77,1381,1,18.52,378552278,92,1.8,2,2,3


In [36]:
# Per-column dtype and non-null count for the Kaggle frame; used here to check
# for missing values before cleaning.
df_kaggle.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 15 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   Model                      200 non-null    object 
 1   Provider                   200 non-null    object 
 2   Context Window             200 non-null    int64  
 3   Speed (tokens/sec)         200 non-null    int64  
 4   Latency (sec)              200 non-null    float64
 5   Benchmark (MMLU)           200 non-null    int64  
 6   Benchmark (Chatbot Arena)  200 non-null    int64  
 7   Open-Source                200 non-null    int64  
 8   Price / Million Tokens     200 non-null    float64
 9   Training Dataset Size      200 non-null    int64  
 10  Compute Power              200 non-null    int64  
 11  Energy Efficiency          200 non-null    float64
 12  Quality Rating             200 non-null    int64  
 13  Speed Rating               200 non-null    int64  

### Summary Statistics

In [39]:
# Arena - Numerical
# With no arguments, describe() summarises only the numeric columns
# (count, mean, std, min, quartiles, max).
# NOTE(review): `id` looks like a row identifier, so its mean/std are
# presumably not meaningful; the winner_* columns are 0/1 flags, so their
# means read as the share of rows won by each side / tied.
df_arena.describe()

Unnamed: 0,id,winner_model_a,winner_model_b,winner_tie
count,57477.0,57477.0,57477.0,57477.0
mean,2142564000.0,0.349079,0.341911,0.309011
std,1238327000.0,0.476683,0.474354,0.46209
min,30192.0,0.0,0.0,0.0
25%,1071821000.0,0.0,0.0,0.0
50%,2133658000.0,0.0,0.0,0.0
75%,3211645000.0,1.0,1.0,1.0
max,4294947000.0,1.0,1.0,1.0


In [40]:

# Arena - Categorical
# Restricting describe() to object-dtype columns reports count, number of
# unique values, the most frequent value (top) and its frequency (freq).
df_arena.describe(include=['object'])

Unnamed: 0,model_a,model_b,prompt,response_a,response_b
count,57477,57477,57477,57477,57477
unique,64,64,51734,56566,56609
top,gpt-4-1106-preview,gpt-4-1106-preview,"[""Answer the following statements with \""Agree...","[""Hello! How can I assist you today?""]","[""Hello! How can I assist you today?""]"
freq,3678,3709,101,109,100
