# What is Pandas?
Pandas is the primary Python library for tabular data manipulation and analysis.

In Machine Learning and Medical Computer Vision, Pandas is used to:

- Load datasets (CSV, Excel, clinical metadata)
- Clean and validate data
- Explore labels and features
- Prepare data before NumPy / PyTorch / TensorFlow
- Perform statistical inspection

NumPy handles arrays — Pandas handles datasets.

## Series
A 1‑dimensional labeled array.

In [4]:
import pandas as pd
s = pd.Series([10, 20, 30], name='values')
s

0    10
1    20
2    30
Name: values, dtype: int64

## DataFrame
A 2‑dimensional labeled table (rows × columns).

In [17]:
# Build a small example table from a dict that maps column names to value lists.
# NOTE(review): "patinet_id" looks like a misspelling of "patient_id"; it is left
# as-is because later cells and the recorded outputs use this exact column name.
data = {
    "patinet_id":[1,2,3],
    "age":[45, 60,38],
    "diagnosis" : ["tumor", "normal", "tumor"]
}
df = pd.DataFrame(data)
# A notebook only renders a bare expression when it is the last statement of
# the cell, so this line displays nothing here.
df
# Save the table to CSV; index=False keeps the row index out of the file.
df.to_csv("data.csv", index=False)


In [21]:
# Loading Data
df = pd.read_csv('data.csv')
# Excel file
# df = pd.read_excel('data.xlsx')


In [23]:
# Inspect Dataset Structure
print(df.head())

   patinet_id  age diagnosis
0           1   45     tumor
1           2   60    normal
2           3   38     tumor


In [25]:
print(df.tail())

   patinet_id  age diagnosis
0           1   45     tumor
1           2   60    normal
2           3   38     tumor


In [27]:
print(df.shape)

(3, 3)


In [29]:
print(df.columns)

Index(['patinet_id', 'age', 'diagnosis'], dtype='object')


In [31]:
print(df.dtypes)

patinet_id     int64
age            int64
diagnosis     object
dtype: object


In [33]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3 entries, 0 to 2
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   patinet_id  3 non-null      int64 
 1   age         3 non-null      int64 
 2   diagnosis   3 non-null      object
dtypes: int64(2), object(1)
memory usage: 204.0+ bytes


In [35]:
df.describe()

Unnamed: 0,patinet_id,age
count,3.0,3.0
mean,2.0,47.666667
std,1.0,11.23981
min,1.0,38.0
25%,1.5,41.5
50%,2.0,45.0
75%,2.5,52.5
max,3.0,60.0


In [37]:
df['age']

0    45
1    60
2    38
Name: age, dtype: int64

In [39]:
df[['age', 'diagnosis']]

Unnamed: 0,age,diagnosis
0,45,tumor
1,60,normal
2,38,tumor


In [41]:
df.iloc[0]

patinet_id        1
age              45
diagnosis     tumor
Name: 0, dtype: object

In [45]:
df.loc[0]

patinet_id        1
age              45
diagnosis     tumor
Name: 0, dtype: object

In [47]:
df[df['age'] > 50]

Unnamed: 0,patinet_id,age,diagnosis
1,2,60,normal


In [49]:
# Drop rows that contain missing values.
# dropna() returns a new DataFrame and leaves df unchanged; assign the result
# (df = df.dropna()) to keep it. This example has no missing values, so every
# row is kept.
df.dropna()

Unnamed: 0,patinet_id,age,diagnosis
0,1,45,tumor
1,2,60,normal
2,3,38,tumor


In [51]:
# Fill missing values instead of dropping them.
# Both calls return new objects and leave df unchanged; assign the result to
# keep it. The notebook only displays the value of the last expression.

# Replace every missing value in the table with 0 (the result is discarded here).
df.fillna(0)
# Replace missing ages with the column mean (nothing changes here because no
# age is missing).
df['age'].fillna(df['age'].mean())

0    45
1    60
2    38
Name: age, dtype: int64

In [53]:
df["diagnosis"].value_counts()

diagnosis
tumor     2
normal    1
Name: count, dtype: int64

In [57]:
# Encode the text labels as integers in a new column.
# Series.map with a dict turns any value that is not a key of the dict into
# NaN, so every label that occurs in the data must be listed here.
df["diagnosis_encoded"] = df["diagnosis"].map({
    "normal": 0,
    "tumor": 1
})
df

Unnamed: 0,patinet_id,age,diagnosis,diagnosis_encoded
0,1,45,tumor,1
1,2,60,normal,0
2,3,38,tumor,1


In [63]:
# Feature selection: remove the identifier column so it is not used as a
# model input.
# NOTE: this cell reassigns df, so running it a second time raises KeyError
# because the column is already gone.
df = df.drop(columns=["patinet_id"])
df

Unnamed: 0,age,diagnosis,diagnosis_encoded
0,45,tumor,1
1,60,normal,0
2,38,tumor,1


In [71]:
# Split the table into features (x) and target (y).
# NOTE(review): x keeps every column except 'diagnosis_encoded', so it still
# contains the raw 'diagnosis' text column. That column is the target in text
# form (label leakage) and is not numeric; drop it as well before training.
x = df.drop(columns=['diagnosis_encoded'])
y = df["diagnosis_encoded"]
print(x)
# Visual separator between the two printouts.
print('_'*30)
print(y)

   age diagnosis
0   45     tumor
1   60    normal
2   38     tumor
______________________________
0    1
1    0
2    1
Name: diagnosis_encoded, dtype: int64


In [75]:
# Combine pandas with NumPy: convert the feature table and target to arrays.
import numpy as np
# x mixes an integer column with a text column, so to_numpy() falls back to a
# generic object array (see the quoted strings in the output). Keep only
# numeric columns in x to get a numeric array.
X_np = x.to_numpy()
y_np = y.to_numpy()
print(X_np)
# Visual separator between the two printouts.
print('_'*30)
print(y_np)

[[45 'tumor']
 [60 'normal']
 [38 'tumor']]
______________________________
[1 0 1]


# Exercise 1 — Load & Inspect Dataset
Task:
- Load data.csv
- Display the first 5 rows
- Print dataset shape
- Print column names
- Show dataset information

In [93]:
import pandas as pd
import numpy as np
# Generate data.csv
def create_dummy_csv(path='data.csv', n_rows=100, seed=None):
    """Write a synthetic image-metadata table to a CSV file.

    The table has one row per fake image with the columns ``image_id``,
    ``width``, ``height``, ``class_label`` and ``mean_intensity``. Any
    existing file at ``path`` is overwritten.

    Args:
        path: Destination CSV file. Defaults to ``'data.csv'``.
        n_rows: Number of rows to generate. Defaults to 100.
        seed: Seed for the random generator. Pass an integer to get the same
            file on every run; ``None`` (the default) gives fresh random
            values each time.

    Raises:
        ValueError: If ``n_rows`` is negative.
    """
    if n_rows < 0:
        raise ValueError(f'n_rows must be non-negative, got {n_rows}')

    # A local generator keeps the demo reproducible on request without
    # touching NumPy's global random state.
    rng = np.random.default_rng(seed)
    sizes = [256, 512, 1024]
    data = {
        'image_id': [f'img_{i:04d}' for i in range(n_rows)],
        'width': rng.choice(sizes, n_rows),
        'height': rng.choice(sizes, n_rows),
        'class_label': rng.choice(['cat', 'dog', 'car'], n_rows),
        'mean_intensity': rng.uniform(0, 1, n_rows).round(4),
    }
    df_dummy = pd.DataFrame(data)
    # index=False keeps the row index out of the file, so no "Unnamed: 0"
    # column appears when the file is read back.
    df_dummy.to_csv(path, index=False)
    print(f'Created "{path}" for demonstration')
    
# Write the demo file. This replaces any data.csv saved by an earlier cell.
create_dummy_csv()

# Load data.csv
df = pd.read_csv('data.csv')

# Display the first 5 rows (head() returns 5 rows by default)
print('--- first 5 rows ---')
print(df.head())

# Print the dataset shape as (rows, columns)
print("\n--- Dataset Shape ---")
print(f"Shape: {df.shape}")

# Print the column names as a plain Python list
print("\n--- Column Names ---")
print(df.columns.tolist())

# Show dataset information
# .info() prints a concise summary including index dtype, column dtypes,
# non-null values, and memory usage. It prints directly and returns None,
# so it is not wrapped in print().
print("\n--- Dataset Information ---")
df.info()

Created "data.csv" for demonstration
--- first 5 rows ---
   image_id  width  height class_label  mean_intensity
0  img_0000   1024     256         dog          0.0234
1  img_0001    512     512         car          0.3326
2  img_0002    512     256         dog          0.6105
3  img_0003    512    1024         car          0.5888
4  img_0004    512    1024         car          0.9638

--- Dataset Shape ---
Shape: (100, 5)

--- Column Names ---
['image_id', 'width', 'height', 'class_label', 'mean_intensity']

--- Dataset Information ---
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 5 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   image_id        100 non-null    object 
 1   width           100 non-null    int64  
 2   height          100 non-null    int64  
 3   class_label     100 non-null    object 
 4   mean_intensity  100 non-null    float64
dtypes: float64(1), int64(2), object(2)

# Exercise 2 — Remove Irrelevant Columns
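Medical datasets often contain useless index columns. They typically show up as `Unnamed: 0` (or similar) when a table was saved together with its row index, or when the CSV has a column with an empty header.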

Task:
- Identify columns starting with "Unnamed"
- Remove them from the dataset

In [104]:
import pandas as pd
import numpy as np
# Create a messy dataset
def create_messy_csv(path='medical_data_messy.csv'):
    """Write a small patient table with junk columns to a CSV file.

    The table has five rows and four columns: ``Patient_ID`` and ``Age``
    plus two junk columns named ``Unnamed: 0`` and ``Unnamed: 3``. Any
    existing file at ``path`` is overwritten.

    Args:
        path: Destination CSV file. Defaults to
            ``'medical_data_messy.csv'``.
    """
    messy = pd.DataFrame({
        'Unnamed: 0': [0, 1, 2, 3, 4],       # Junk index column
        'Patient_ID': [101, 102, 103, 104, 105],
        'Age': [25, 30, 45, 35, 50],
        'Unnamed: 3': [np.nan] * 5,          # Another junk column (all missing)
    })
    # index=False so the only junk columns are the two created above.
    messy.to_csv(path, index=False)
    print(f"Created '{path}' with junk columns.")

# Write the messy demo file to disk.
create_messy_csv()
# Load dataset
df = pd.read_csv('medical_data_messy.csv')

# Show the columns as loaded, junk columns included.
print("--- Columns BEFORE Cleaning ---")
print(df.columns.tolist())

# Identify columns whose name starts with 'Unnamed'
cols_to_drop = [col for col in df.columns if col.startswith('Unnamed')]
print(f"\nIdentified {len(cols_to_drop)} column(s) to remove: {cols_to_drop}")

# Remove them from the dataset
# inplace=True modifies df directly, so the result is not reassigned.
df.drop(columns=cols_to_drop, inplace=True)

# Verify the results
print("\n--- Columns AFTER Cleaning ---")
print(df.columns.tolist())

print("\n--- Cleaned Data Head ---")
print(df.head())

Created 'medical_data_messy.csv' with junk columns.
--- Columns BEFORE Cleaning ---
['Unnamed: 0', 'Patient_ID', 'Age', 'Unnamed: 3']

Identified 2 column(s) to remove: ['Unnamed: 0', 'Unnamed: 3']

--- Columns AFTER Cleaning ---
['Patient_ID', 'Age']

--- Cleaned Data Head ---
   Patient_ID  Age
0         101   25
1         102   30
2         103   45
3         104   35
4         105   50
