<a href="https://colab.research.google.com/github/Chowdhurynaseeh/ML_Batch-03/blob/main/pandas.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Step 1: Getting Started with Pandas**

In [1]:
!pip install pandas



In [2]:
import pandas as pd
import numpy as np   # often used together

## **Step 2: Series (1D Data)**

In [3]:
# A pandas Series is a one-dimensional array whose entries carry labels.

# Built from a plain list: pandas assigns the default 0..n-1 integer index.
s = pd.Series(data=[10, 20, 30, 40])
print(s)

# Same idea, but with explicit string labels supplied through `index`.
s2 = pd.Series(data=[10, 20, 30], index=list("abc"))
print(s2)

# Key difference from a bare NumPy array: every value has an index label.

0    10
1    20
2    30
3    40
dtype: int64
a    10
b    20
c    30
dtype: int64


## **Step 3: DataFrame (2D Table)**

In [4]:
# Think of a DataFrame as a spreadsheet or SQL table: labeled rows, named columns.
# Each dictionary key becomes a column; each list holds that column's values.
data = dict(
    Name=["Alice", "Bob", "Charlie"],
    Age=[25, 30, 35],
    City=["Dhaka", "Chittagong", "Sylhet"],
)

df = pd.DataFrame.from_dict(data)
print(df)

      Name  Age        City
0    Alice   25       Dhaka
1      Bob   30  Chittagong
2  Charlie   35      Sylhet


## **Step 4: Inspecting Data**

In [5]:
# Other quick looks worth trying on the same frame:
#   print(df.head())   -> first 5 rows
#   print(df.tail())   -> last 5 rows
#   print(df.shape)    -> (rows, cols) tuple
#   df.info()          -> column types & memory; info() prints by itself and
#                         returns None, so wrapping it in print() adds a stray "None"

# Summary statistics (count, mean, std, min, quartiles, max) for numeric columns.
numeric_summary = df.describe()
print(numeric_summary)

        Age
count   3.0
mean   30.0
std     5.0
min    25.0
25%    27.5
50%    30.0
75%    32.5
max    35.0


## **Step 5: Selecting Data**

In [6]:
# More ways to pick data out of the frame (uncomment to try):
#   print(df["Name"])            -> one column, returned as a Series
#   print(df[["Name", "City"]])  -> several columns at once
#   print(df.loc[0])             -> a row looked up by its index label
#   print(df.iloc[1])            -> a row looked up by position (the 2nd row)

# A single value: row label 0, column "Name".
first_name = df.loc[0, "Name"]
print(first_name)


Alice


## **Step 6: Filtering Data**

In [7]:
# Variation to try: print(df[df["Age"] > 28])  -> keep rows where Age > 28

# Build a boolean mask first, then use it to keep only the matching rows.
lives_in_dhaka = df["City"].eq("Dhaka")
print(df.loc[lives_in_dhaka])

    Name  Age   City
0  Alice   25  Dhaka


## **Step 7: Adding / Modifying Columns**

In [8]:
# Assigning to a new column label adds that column to df itself,
# so the cells below see these extra columns too.
df["Age+5"] = df["Age"].add(5)    # new numeric column: each age plus five
df["Senior"] = df["Age"].gt(30)   # new boolean column: True where Age is above 30
print(df)

      Name  Age        City  Age+5  Senior
0    Alice   25       Dhaka     30   False
1      Bob   30  Chittagong     35   False
2  Charlie   35      Sylhet     40    True


## **Step 8: Handling Missing Data**

In [9]:
# Small frame with gaps: the None entries show up as missing values.
df2 = pd.DataFrame(
    {
        "Name": ["Ali", "Sara", "John"],
        "Age": [25, None, 40],
        "City": ["Dhaka", "Khulna", None],
    }
)

# Also worth trying:
#   print(df2.isnull())   -> True/False mask marking the missing cells
#   print(df2.dropna())   -> drop every row that contains a NaN

# Replace every missing cell with the same placeholder text.
# Note that this also puts the string "Unknown" into the numeric Age column.
filled = df2.fillna(value="Unknown")
print(filled)

   Name      Age     City
0   Ali     25.0    Dhaka
1  Sara  Unknown   Khulna
2  John     40.0  Unknown


## **Step 9: Grouping & Aggregation**

In [10]:
# Split the rows by City, then average the Age values inside each group.
mean_age_by_city = df.groupby(by="City")["Age"].mean()
print(mean_age_by_city)

City
Chittagong    30.0
Dhaka         25.0
Sylhet        35.0
Name: Age, dtype: float64


## **Step 10: Importing & Exporting Data**

In [11]:
# Write the frame to a CSV file; index=False leaves the row labels out of it.
csv_path = "people.csv"
df.to_csv(csv_path, index=False)

# Read the same file back into a new DataFrame to check the round trip.
df_loaded = pd.read_csv(csv_path)
print(df_loaded)

      Name  Age        City  Age+5  Senior
0    Alice   25       Dhaka     30   False
1      Bob   30  Chittagong     35   False
2  Charlie   35      Sylhet     40    True


## **Step 11: Sorting**

In [12]:
# sort_values returns a sorted result; df itself is not reordered here.
print(df.sort_values(by="Age", ascending=True))    # Age, smallest first
print(df.sort_values(by="Age", ascending=False))   # Age, largest first
print(df.sort_values(by=["City", "Age"]))          # by City, then by Age within a City

      Name  Age        City  Age+5  Senior
0    Alice   25       Dhaka     30   False
1      Bob   30  Chittagong     35   False
2  Charlie   35      Sylhet     40    True
      Name  Age        City  Age+5  Senior
2  Charlie   35      Sylhet     40    True
1      Bob   30  Chittagong     35   False
0    Alice   25       Dhaka     30   False
      Name  Age        City  Age+5  Senior
1      Bob   30  Chittagong     35   False
0    Alice   25       Dhaka     30   False
2  Charlie   35      Sylhet     40    True


## **Step 12: Merging & Joining**

In [13]:
# Two small tables sharing an "ID" column: ID 3 appears only in the first,
# ID 4 only in the second.
data1 = dict(ID=[1, 2, 3], Name=["Alice", "Bob", "Charlie"])
data2 = dict(ID=[1, 2, 4], Score=[85, 90, 95])

# NOTE: this rebinds df2, replacing the frame used in the missing-data step.
df1, df2 = pd.DataFrame(data1), pd.DataFrame(data2)
print(df1, df2, sep="\n")

# Inner join on ID: keep only the IDs present in both tables.
merged = df1.merge(df2, on="ID", how="inner")
print("Inner Join:\n", merged)

# Outer join on ID: keep every ID from either table; cells with no match are NaN.
merged_outer = df1.merge(df2, on="ID", how="outer")
print("Outer Join:\n", merged_outer)

   ID     Name
0   1    Alice
1   2      Bob
2   3  Charlie
   ID  Score
0   1     85
1   2     90
2   4     95
Inner Join:
    ID   Name  Score
0   1  Alice     85
1   2    Bob     90
Outer Join:
    ID     Name  Score
0   1    Alice   85.0
1   2      Bob   90.0
2   3  Charlie    NaN
3   4      NaN   95.0


## **Step 13: Concatenation**

In [14]:
# Cut the frame into two pieces...
df_top = df.head(n=2)      # first two rows
df_bottom = df.tail(n=1)   # last row

# ...then stack them back together row-wise; the original row labels are kept.
df_concat = pd.concat(objs=[df_top, df_bottom], axis=0)
print(df_concat)

      Name  Age        City  Age+5  Senior
0    Alice   25       Dhaka     30   False
1      Bob   30  Chittagong     35   False
2  Charlie   35      Sylhet     40    True


## **Step 14: Pivot Tables (Excel-style)**

In [16]:
# Long-format sales records: one row per (City, Product) pair.
sales = pd.DataFrame(
    dict(
        City=["Dhaka", "Dhaka", "Sylhet", "Sylhet"],
        Product=["A", "B", "A", "B"],
        Sales=[100, 150, 200, 120],
    )
)
# print(sales)  # uncomment to inspect the raw rows

# Reshape to wide format: one row per City, one column per Product,
# each cell holding the summed Sales for that combination.
pivot = sales.pivot_table(values="Sales", index="City", columns="Product", aggfunc="sum")
print(pivot)

Product    A    B
City             
Dhaka    100  150
Sylhet   200  120


## **Step 15: Apply & Lambda Functions**

In [17]:
# apply() runs the given function on each value of the column;
# here a lambda doubles every age and the result is stored as a new column.
# (For plain arithmetic like this, df["Age"] * 2 yields the same values.)
df["Age_Double"] = df["Age"].apply(lambda age: 2 * age)
print(df)

      Name  Age        City  Age+5  Senior  Age_Double
0    Alice   25       Dhaka     30   False          50
1      Bob   30  Chittagong     35   False          60
2  Charlie   35      Sylhet     40    True          70


## **Step 16: Real Dataset Example (Titanic)**

In [30]:
# Load the Titanic example dataset as a DataFrame via seaborn.
# load_dataset downloads the data from seaborn's online example-data repository,
# so it needs an internet connection; the dataset is also available on Kaggle.
import seaborn as sns
titanic = sns.load_dataset("titanic")

# print(titanic.head())  # peek at the first rows

# Averaging the "survived" column within each sex gives the survival rate by sex.
survival_by_sex = titanic.groupby("sex")["survived"].mean()
print(survival_by_sex)

# Further breakdowns to try:
#   Survival rate by class
#   print(titanic.groupby("class")["survived"].mean())
#   Survival rate by sex & class
#   print(titanic.pivot_table(values="survived", index="sex", columns="class", aggfunc="mean"))

sex
female    0.742038
male      0.188908
Name: survived, dtype: float64
