In [1]:
import numpy as np
import pandas as pd

In [None]:
"""
What Is Pandas?

Pandas is like a powerful upgrade over NumPy. It lets you work with:
    Labels (like names for rows/columns)
    Missing data
    Structured data, like what you’d find in spreadsheets or databases
"""

"""
Pandas Has 3 Main Objects:
    Series - like a labeled 1D array
    DataFrame - like an Excel table (rows + columns)
    Index - handles the labels (for rows or columns)
"""

In [None]:
### Series – A Labeled 1D Array

import pandas as pd

# A Series built from a plain list gets a default integer index (0, 1, 2, ...).
data = pd.Series(
    [0.25, 0.5, 0.75, 1.0],
)

print(data)         # the whole Series: labels on the left, values on the right
print(data.values)  # the underlying NumPy array, without labels
print(data.index)   # the labels alone (a RangeIndex by default)

0    0.25
1    0.50
2    0.75
3    1.00
dtype: float64
[0.25 0.5  0.75 1.  ]
RangeIndex(start=0, stop=4, step=1)


In [None]:
# Give the Series explicit string labels instead of the default 0..n-1 index.
ind_access = pd.Series(
    data=[0.25, 0.5, 0.75, 1.0],
    index=list('abcd'),
)

ind_access['b']  # look a value up by its label

0.5

In [None]:
### DataFrame – A Labeled 2D Table

# A DataFrame is like multiple Series stacked together as columns.

# Creating a DataFrame from a dictionary of Series
population_dict = {'California': 39538223, 'Texas': 29145505}
population = pd.Series(population_dict)
population['Texas']  # label lookup -> 29145505 (not echoed: it is not the cell's last expression)

area = pd.Series({'California': 423967, 'Texas': 695662})

# Each dict key becomes a column name; both Series share the same row labels.
states = pd.DataFrame({'population': population, 'area': area})

print(states)          # DataFrame assembled from several Series
print(states.index)    # row labels
print(states.columns)  # column labels

# Access a column like a dictionary; the result is a Series
print(states['area'])


            population    area
California    39538223  423967
Texas         29145505  695662
Index(['California', 'Texas'], dtype='object')
Index(['population', 'area'], dtype='object')
California    423967
Texas         695662
Name: area, dtype: int64


In [16]:
# Constructing DataFrames in Different Ways
# Only the last expression of a cell is echoed, so constructions 1-3 run
# silently; wrap any of them in print() to inspect the result.
# numpy (np) is imported in the notebook's top import cell.

# 1) From a single Series
pd.DataFrame(population, columns=['population'])

# 2) From a list of dictionaries
data = [{'a': 1, 'b': 2}, {'b': 3, 'c': 4}]
pd.DataFrame(data)

# 3) From a NumPy 2D array, with explicit column and row labels
pd.DataFrame(np.random.rand(3, 2), columns=['foo', 'bar'], index=['a', 'b', 'c'])

# 4) From a structured NumPy array (the field names become the column names)
A = np.zeros(3, dtype=[('A', 'i8'), ('B', 'f8')])
pd.DataFrame(A)

Unnamed: 0,A,B
0,0,0.0
1,0,0.0
2,0,0.0


In [None]:
### 3. Index – Label Holder for Rows/Columns
# An Index is like an immutable array. It's used behind the scenes to hold labels.

ind = pd.Index([2, 3, 5, 7, 11])
ind[1]    # scalar lookup by position -> 3
ind[::2]  # slicing returns a new Index -> [2, 5, 11]

# Note: an Index is immutable, so item assignment such as `ind[1] = 0` raises a TypeError.


Index([2, 5, 11], dtype='int64')

In [21]:
# Index as a Set
# Index objects support set-style operations; each call returns a new Index.

indA = pd.Index([1, 3, 5, 7, 9])
indB = pd.Index([2, 3, 5, 7, 11])

indA.intersection(indB)          # labels present in both    -> [3, 5, 7]
indA.union(indB)                 # labels present in either  -> [1, 2, 3, 5, 7, 9, 11]
indA.symmetric_difference(indB)  # labels in exactly one     -> [1, 2, 9, 11]


Index([1, 2, 9, 11], dtype='int64')

In [28]:
### Indexing a Series (1D)

# A Pandas Series is like a cross between a NumPy array and a dictionary,
# so you can access elements both ways.

# 1) Series as a Dictionary
data = pd.Series([0.25, 0.5, 0.75, 1.0], index=['a', 'b', 'c', 'd'])

print(data['b'])           # look up a value by its label
print(data.keys())         # all the labels
print(list(data.items()))  # (label, value) pairs
data['e'] = 1.25           # assigning to a new label appends an element
print(data)


# 2) Series as an Array
# Slicing by label: the end label is INCLUDED
print(data['a':'c'])

# Slicing by position: the end position is EXCLUDED
print(data[0:2])  # positions 0 and 1 only

# Masking: keep the elements where the boolean condition holds
print(data[(data > 0.3) & (data < 0.8)])

# Fancy indexing: select several labels at once with a list
print(data[['a', 'e']])

0.5
Index(['a', 'b', 'c', 'd'], dtype='object')
[('a', 0.25), ('b', 0.5), ('c', 0.75), ('d', 1.0)]
a    0.25
b    0.50
c    0.75
d    1.00
e    1.25
dtype: float64
a    0.25
b    0.50
c    0.75
dtype: float64
a    0.25
b    0.50
dtype: float64
b    0.50
c    0.75
dtype: float64
a    0.25
e    1.25
dtype: float64


In [29]:
### loc vs iloc (Important Concept)

# loc[]  → index by label (explicit)
# iloc[] → index by position (implicit)
#
# The distinction matters most with an integer index, where the same number
# can be both a label and a position yet refer to different elements.

data = pd.Series(['a', 'b', 'c'], index=[1, 3, 5])
print(data.loc[1])   # label 1    -> 'a'
print(data.iloc[1])  # position 1 -> 'b'

a
b


In [None]:
## Indexing a DataFrame (2D)

# Think of a DataFrame in two ways:
#     like a dictionary of Series
#     like a 2D NumPy array

# Dictionary-style (column-wise)
print(states['area'])  # or
print(states.area)     # shortcut, but avoid for names like `pop` that are also methods

California    423967
Texas         695662
Name: area, dtype: int64
California    423967
Texas         695662
Name: area, dtype: int64


In [38]:
## DataFrame as 2D Array
print(states)

# Access raw data (NumPy-style)
states.values

# Transpose rows/columns
states.T

# Access using iloc (by position): the end of each slice is excluded
states.iloc[:1, :1]

# Access using loc (by label): the end of each slice is included
states.loc[:'Texas', :'population']


            population    area
California    39538223  423967
Texas         29145505  695662


Unnamed: 0,population
California,39538223
Texas,29145505


In [None]:
### Advanced Indexing Patterns

# Masking (row-wise filter) combined with an explicit column selection
states.loc[states.area > 500000, ['population', 'area']]

# Modify a value: row 0, column 1 is California's area.
# This changes `states` in place, so every later cell sees the new value.
states.iloc[0, 1] = 90
print(states)

            population    area
California    39538223      90
Texas         29145505  695662


In [None]:
### Extra Useful Indexing Tricks

# Row slicing using labels
states['California':'Texas']

# Row slicing using position
states[0:1]

# Row masking (the last expression of the cell, so its result is the cell's output)
states[states.area > 100]

# Note: plain [] normally selects columns, but slices and boolean masks passed
# to [] select rows instead, mirroring NumPy.
#
# | Operation        | Use It For                       | Syntax                          |
# | ---------------- | -------------------------------- | ------------------------------- |
# | Dictionary-style | Column access (use with caution) | `data['col']` or `data.col`     |
# | `.loc[]`         | Label-based selection            | `data.loc['label']`             |
# | `.iloc[]`        | Position-based selection         | `data.iloc[0]`                  |
# | Masking          | Filter rows by condition         | `data[data.col > value]`        |
# | Fancy indexing   | Select multiple rows/columns     | `data.loc[:, ['col1', 'col2']]` |

Unnamed: 0,population,area
Texas,29145505,695662


In [50]:
"""
What are Ufuncs?

    Ufuncs = Universal functions (from NumPy) like:
    +, -, *, /
    np.exp(), np.sin(), np.log() etc.

In Pandas, these preserve row/column labels and align data by index. That's the real magic!
"""

### Unary Operations (One input)
# A NumPy ufunc applied to a pandas object works element-wise and hands back
# a pandas object carrying the same labels.

ser = pd.Series([0, 7, 6, 4])
np.exp(ser)  # e**x for each value; the result keeps ser's index

df = pd.DataFrame(
    [[4, 8, 0, 6],
     [2, 0, 5, 9],
     [7, 7, 7, 7]],
    columns=['A', 'B', 'C', 'D'],
)
np.sin(df * np.pi / 4)  # element-wise; row and column labels are preserved

Unnamed: 0,A,B,C,D
0,1.224647e-16,-2.449294e-16,0.0,-1.0
1,1.0,0.0,-0.707107,0.707107
2,-0.7071068,-0.7071068,-0.707107,-0.707107


In [51]:
### Binary Operations (Two Inputs) with Index Alignment

# Note: Pandas matches the index labels, even if they're not in the same order
# or one is missing. The result covers the union of both indexes; a label
# present on only one side yields NaN.

area = pd.Series({'Alaska': 1723337, 'Texas': 695662, 'California': 423967})
population = pd.Series({'California': 39538223, 'Texas': 29145505, 'Florida': 21538187})
population / area

Alaska              NaN
California    93.257784
Florida             NaN
Texas         41.896072
dtype: float64

In [None]:
A = pd.Series([2, 4, 6], index=[0, 1, 2])
B = pd.Series([1, 3, 5], index=[1, 2, 3])

# Labels 0 and 3 exist on only one side, so plain addition yields NaN there.
print(A + B)

# With fill_value, a value missing on one side is treated as 0 before adding,
# so the unmatched labels keep the value from the side that has them.
print(A.add(B, fill_value=0))

0    NaN
1    5.0
2    9.0
3    NaN
dtype: float64
0    2.0
1    5.0
2    9.0
3    5.0
dtype: float64


In [None]:
"""
Python	   Pandas method
+	       .add()
-      	   .sub() or .subtract()
*	       .mul()
/	       .div() or .truediv()
//	       .floordiv()
%	       .mod()
**	       .pow()
"""

In [None]:
### DataFrame vs DataFrame Alignment

# Pandas aligns both:
#     row indices
#     column names
#
# Columns 'a' and 'b' are matched by name regardless of their order. Column 'c'
# and row 2 exist only in B, so those cells come out as NaN in the result.

A = pd.DataFrame([[10, 2], [16, 9]], columns=['a', 'b'])
B = pd.DataFrame([[5, 3, 1], [9, 7, 6], [4, 8, 5]], columns=['b', 'a', 'c'])
A + B  # kept as the cell's last expression so the aligned sum is what gets displayed

"Note --> It’ll match a and b columns, and ignore c where there's no match (→ NaN), aligning rows by index too."

In [63]:
### DataFrame and Series Operations
# This is like subtracting a row or a column from a table.

# 1) Row-wise subtraction (default behavior)
# The Series' labels are matched against df's columns, and the same row is
# subtracted from every row.

df = pd.DataFrame([[4, 4, 2, 0], [5, 8, 0, 8], [8, 2, 6, 1]], columns=['Q', 'R', 'S', 'T'])
print(df)
df - df.iloc[1]

# 2) Column-wise subtraction
# axis=0 matches the Series' labels against df's row index instead.
df.subtract(df['Q'], axis=0)

   Q  R  S  T
0  4  4  2  0
1  5  8  0  8
2  8  2  6  1


Unnamed: 0,Q,R,S,T
0,0,0,-2,-4
1,0,3,-5,3
2,0,-6,-2,-7


In [None]:
### Smart Alignment in Mixed Cases
# Even if you're subtracting just a partial row, Pandas will align columns properly.

halfrow = df.iloc[0, ::2]  # first row, every second column: only Q and S
df - halfrow               # R and T have no counterpart in halfrow, so they become NaN

Unnamed: 0,Q,R,S,T
0,0.0,,0.0,
1,1.0,,-2.0,
2,4.0,,4.0,


In [None]:
""" 
What Is Missing Data?
    Missing values are also called:
        Null
        NaN (Not a Number)
        None
        pd.NA (newer Pandas way)

Why Do We Care?
    Because:
    Missing values break calculations or give wrong answers
    We need to decide: Drop them or Fill them?

How Does Pandas Represent Missing Data?

| Type    | Sentinel Used  | Converted To      | Example Value      |
| ------- | -------------- | ----------------- | ------------------ |
| Float   | `np.nan`       | Float (no change) | `1.0, NaN`         |
| Int     | `np.nan`       | Cast to float     | `1, NaN` → float64 |
| Object  | `None` / `NaN` | No change         | `'hello', None`    |
| Boolean | `None` / `NaN` | Cast to object    | `True, None`       |

"""

In [72]:
### Pandas Handling Examples

# None in NumPy: forces dtype=object, which is slow, and aggregations such as
# sum() or min() fail.
vals1 = np.array([1, None, 2, 3])

# np.nan in NumPy: keeps a native float dtype, which is faster, and
# aggregations are supported (though NaN propagates into the result).
vals2 = np.array([1, np.nan, 3, 4])

# vals1.sum()  # raises TypeError: an int cannot be added to None
# vals2.sum()  # does not raise, but the result is nan

np.nansum(vals2)  # NaN-aware sum: skips the NaN and returns 8.0

# Pandas handles both None and NaN
pd.Series([1, np.nan, 2, None])  # NaN for both None and np.nan

# Nullable dtypes (new feature)
pd.Series([1, np.nan, 2, None, pd.NA], dtype="Int32")  # Output: 1, <NA>, 2, <NA>, <NA>


0       1
1    <NA>
2       2
3    <NA>
4    <NA>
dtype: Int32

In [None]:
""" 
isnull
    Generates a Boolean mask indicating missing values
notnull
    Opposite of isnull
dropna
    Returns a filtered version of the data
fillna
    Returns a copy of the data with missing values filled or imputed

"""

In [74]:
### Detecting Missing Values

# .isnull() and .notnull() build Boolean masks; .isnull() is an alias of .isna().

data = pd.Series([1, np.nan, 'hello', None])

print(data.isnull())  # True where the value is missing (NaN or None)
print(data.isna())    # same mask, under the other name

0    False
1     True
2    False
3     True
dtype: bool
0    False
1     True
2    False
3     True
dtype: bool


In [75]:
### Dropping Missing Data

# Series
data.dropna()

# DataFrame
df = pd.DataFrame([[1, np.nan, 2],
                   [2, 3, 5],
                   [np.nan, 4, 6]])
df.dropna()                # drops rows with ANY NA
df.dropna(axis='columns')  # drops columns with ANY NA
df.dropna(how='all')       # drops a row only if ALL of its values are NA
df.dropna(thresh=3)        # keeps only rows with at least 3 non-NA values

Unnamed: 0,0,1,2
1,2.0,3.0,5


In [None]:
### Filling Missing Data
# `fillna(method=...)` is deprecated in pandas; the dedicated .ffill() / .bfill()
# methods give the same result and are the supported spelling.

# Fill with a value
data.fillna(0)

# Forward fill (ffill)
data.ffill()  # fills with previous non-NA

# Backward fill (bfill)
data.bfill()  # fills with next non-NA

# DataFrame fill by row or column
df.ffill(axis=1)  # left to right fill

In [None]:
""" 
| Operation        | Use                                 |
| ---------------- | ----------------------------------- |
| `isnull()`       | Detect missing values (True/False)  |
| `notnull()`      | Detect non-missing values           |
| `dropna()`       | Remove rows/columns with NA         |
| `fillna(value)`  | Replace missing values              |
| `ffill()`        | Fill forward                        |
| `bfill()`        | Fill backward                       |
| `thresh=N`       | Keep rows/columns with ≥N non-NAs   |
| `pd.NA`          | New missing value marker (nullable) |

"""

In [None]:
### What is Hierarchical Indexing (MultiIndex)?
# Hierarchical indexing lets you:
#     - use multiple levels of keys in the index of a Series or DataFrame
#     - represent 3D or 4D data inside 1D/2D objects
#     - perform more flexible and powerful data manipulations


# The "Bad" Way: plain Python tuples as index labels
index = [
    ('California', 2010), ('California', 2020),
    ('New York', 2010), ('New York', 2020),
    ('Texas', 2010), ('Texas', 2020),
]
populations = [
    37253956, 39538223,
    19378102, 20201249,
    25145561, 29145505,
]

multi_index = pd.Series(populations, index=index)  # works, but filtering becomes messy


# The Better Way: MultiIndex
index = pd.MultiIndex.from_tuples(index)
indexing_multi = multi_index.reindex(index)  # same data, now with a hierarchical index

indexing_multi[:, 2020]  # every state, year 2020 only

California    39538223
New York      20201249
Texas         29145505
dtype: int64

In [None]:
### Convert Between Series ↔ DataFrame

# indexing_multi.unstack() → converts one index level into columns
# unstacking.stack()       → reverts back

print(indexing_multi)
unstacking = indexing_multi.unstack()  # one index level becomes the columns
stacking = unstacking.stack()          # the columns fold back into an index level

California  2010    37253956
            2020    39538223
New York    2010    19378102
            2020    20201249
Texas       2010    25145561
            2020    29145505
dtype: int64


California  2010    37253956
            2020    39538223
New York    2010    19378102
            2020    20201249
Texas       2010    25145561
            2020    29145505
dtype: int64

In [90]:
# Add More Data Dimensions

new_dimension = pd.DataFrame({
    "total": indexing_multi,
    'under18': [9284094, 8898092, 4318033, 4181528, 6879014, 7432474],
})
new_dimension

# Ratio of the under-18 count to the total, then reshaped with unstack().
# NOTE(review): `fitler_u18` looks like a typo for `filter_u18`; the name is
# left unchanged in case other cells reference it.
fitler_u18 = new_dimension['under18'] / new_dimension['total']
fitler_u18.unstack()

# Creating MultiIndex — Multiple Ways
#
# | Method            | Example                                                  |
# | ----------------- | -------------------------------------------------------- |
# | `from_arrays`     | `pd.MultiIndex.from_arrays([['a', 'a'], [1, 2]])`        |
# | `from_tuples`     | `pd.MultiIndex.from_tuples([('a', 1), ('a', 2)])`        |
# | `from_product`    | Cartesian product → `from_product([['a', 'b'], [1, 2]])` |
# | `levels + codes`  | Manually specify structure                               |
# | `from dictionary` | `pd.Series(data)` where keys are tuples                  |

# The four constructions below describe the same index; only the last one is echoed.
pd.MultiIndex.from_arrays([['a', 'a', 'b', 'b'], [1, 2, 1, 2]])
pd.MultiIndex.from_tuples([('a', 1), ('a', 2), ('b', 1), ('b', 2)])
pd.MultiIndex.from_product([['a', 'b'], [1, 2]])
pd.MultiIndex(levels=[['a', 'b'], [1, 2]], codes=[[0, 0, 1, 1], [0, 1, 0, 1]])

MultiIndex([('a', 1),
            ('a', 2),
            ('b', 1),
            ('b', 2)],
           )

In [94]:
### MultiIndex for Columns

# Hierarchical labels on both axes: (year, visit) for rows, (subject, type) for columns
index = pd.MultiIndex.from_product(
    [[2013, 2014], [1, 2]],
    names=['year', 'visit'],
)
columns = pd.MultiIndex.from_product(
    [['Bob', 'Guido', 'Sue'], ['HR', 'Temp']],
    names=['subject', 'type'],
)

# Mock up some readings: random noise rounded to one decimal, with every second
# column (the 'HR' ones) scaled by 10, then everything shifted by 37
data = np.round(np.random.randn(4, 6), 1)
data[:, ::2] *= 10
data += 37

health_data = pd.DataFrame(data, index=index, columns=columns)
health_data
health_data['Guido']        # every column under the top-level label 'Guido'
health_data['Guido', 'HR']  # a single column, selected with both levels


year  visit
2013  1        33.0
      2        39.0
2014  1        31.0
      2        38.0
Name: (Guido, HR), dtype: float64

In [102]:
# Name the index levels so the printout and later lookups are self-describing
indexing_multi.index.names = ['state', 'year']
print(indexing_multi)

# Indexing and Slicing MultiIndex

# 1) Full index: one label per level gives a single value
indexing_multi['California', 2010]

# 2) Partial index: only the outer label gives the sub-Series for that state
indexing_multi['California']

# 3) Cross-section by level: every state, year 2020 only
indexing_multi[:, 2020]

# 4) Boolean mask
indexing_multi[indexing_multi > 22000000]


state       year
California  2010    37253956
            2020    39538223
New York    2010    19378102
            2020    20201249
Texas       2010    25145561
            2020    29145505
dtype: int64


state       year
California  2010    37253956
            2020    39538223
Texas       2010    25145561
            2020    29145505
dtype: int64

In [None]:
### Sorted vs Unsorted Indices
# Some operations require the index to be sorted.

index = pd.MultiIndex.from_product([['a', 'c', 'b'], [1, 2]])
data = pd.Series(np.random.rand(6), index=index)
data.index.names = ['char', 'int']
print(data)

# Slicing before sorting fails with:
#   'Key length (1) was greater than MultiIndex lexsort depth (0)'
# data['a':'b']

data = data.sort_index()
data['a':'b']  # works once the index is sorted

char  int
a     1      0.031847
      2      0.052100
c     1      0.805116
      2      0.935519
b     1      0.409780
      2      0.217435
dtype: float64


char  int
a     1      0.031847
      2      0.052100
b     1      0.409780
      2      0.217435
dtype: float64

In [111]:
### Reshaping Data

print(data)

# 1) stack() and unstack()
print(data.unstack(level=0))   # 'char' values (a, b, c) become the columns
print(data.unstack(level=1))   # 'int' values (1, 2) become the columns
print(data.unstack().stack())  # round trip: back to the original layout

# 2) reset_index() → convert index levels to columns
flat = indexing_multi.reset_index(name='population')

# 3) set_index() → columns → index
flat.set_index(['state', 'year'])


char  int
a     1      0.031847
      2      0.052100
b     1      0.409780
      2      0.217435
c     1      0.805116
      2      0.935519
dtype: float64
char         a         b         c
int                               
1     0.031847  0.409780  0.805116
2     0.052100  0.217435  0.935519
int          1         2
char                    
a     0.031847  0.052100
b     0.409780  0.217435
c     0.805116  0.935519
char  int
a     1      0.031847
      2      0.052100
b     1      0.409780
      2      0.217435
c     1      0.805116
      2      0.935519
dtype: float64


Unnamed: 0_level_0,Unnamed: 1_level_0,population
state,year,Unnamed: 2_level_1
California,2010,37253956
California,2020,39538223
New York,2010,19378102
New York,2020,20201249
Texas,2010,25145561
Texas,2020,29145505
