In [3]:
import pandas as pd
import numpy as np
# Column-oriented source data: each key becomes a column, each list its values.
data = dict(
    Name=['Alice', 'Bob', 'Charlie', 'David'],
    Age=[22, 23, 22, 24],
    Grade=['A', 'B', 'A', 'C'],
)

# Create a DataFrame from the dictionary.
df = pd.DataFrame(data)

# Display the DataFrame as plain text.
print(df)


      Name  Age Grade
0    Alice   22     A
1      Bob   23     B
2  Charlie   22     A
3    David   24     C


In [4]:
# A bare expression on the last line of a cell is rendered as the cell's output.
df


Unnamed: 0,Name,Age,Grade
0,Alice,22,A
1,Bob,23,B
2,Charlie,22,A
3,David,24,C


In [6]:
# print() writes the plain-text rendering of the frame instead of the rich table.
print(df)

      Name  Age Grade
0    Alice   22     A
1      Bob   23     B
2  Charlie   22     A
3    David   24     C


In [18]:
# First rows of the frame; df only has 4 rows, so all of them are shown.
df.head()

Unnamed: 0,Name,Age,Grade
0,Alice,22,A
1,Bob,23,B
2,Charlie,22,A
3,David,24,C


In [22]:
# Last 2 rows of the frame.
df.tail(2)

Unnamed: 0,Name,Age,Grade
2,Charlie,22,A
3,David,24,C


In [24]:
# (number of rows, number of columns)
df.shape

(4, 3)

In [26]:
df.describe()
# describe() summarises the numeric columns only -- here that is just Age.
# Non-numeric columns such as Name and Grade are skipped by default.
# Use df.describe(include='all') to include all columns, including strings.

# std (sample standard deviation):
#   std = sqrt( sum((x_i - x_bar)^2) / (n - 1) )
#   Age: x_bar = 22.75, squared deviations sum to 2.75
#        std = sqrt(2.75 / 3) ≈ 0.957427

# The percentiles below use the sorted ages [22, 22, 23, 24] (n = 4),
# 0-based positions, and linear interpolation between neighbouring values.

# ### 🔹 25th Percentile (Q1)
# This is the value at 25% of the way through the sorted list.

# Step-by-step:

# Position = (n - 1) × 0.25
#          = (4 - 1) × 0.25
#          = 3 × 0.25
#          = 0.75

# So, look between the **0th and 1st** values in the sorted list:
# → 0th = 22, 1st = 22

# Interpolate:

# Q1 = 22 + 0.75 × (22 - 22)
#    = 22 + 0
#    = 22.0

# ✅ 25% = **22.0**

# ### 🔹 75th Percentile (Q3)

# Step-by-step:

# Position = (n - 1) × 0.75
#          = 3 × 0.75
#          = 2.25

# So, look between the **2nd and 3rd** values:
# → 2nd = 23, 3rd = 24

# Interpolate:

# Q3 = 23 + 0.25 × (24 - 23)
#    = 23 + 0.25
#    = 23.25

# ✅ 75% = **23.25**



Unnamed: 0,Age
count,4.0
mean,22.75
std,0.957427
min,22.0
25%,22.0
50%,22.5
75%,23.25
max,24.0


In [28]:
# Prints a structural summary: index range, per-column non-null counts and dtypes, memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4 entries, 0 to 3
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   Name    4 non-null      object
 1   Age     4 non-null      int64 
 2   Grade   4 non-null      object
dtypes: int64(1), object(2)
memory usage: 228.0+ bytes


In [5]:
print(df.dtypes)

# Explanation:
# Name → object
# Name is a column of strings, and pandas reports string data here as object dtype.

# Age → int64
# It's a column of integers, so pandas stores it as 64-bit integers.

# Grade → object
# The letter grades are strings as well, so this column is object too.

Name     object
Age       int64
Grade    object
dtype: object


In [81]:
df.size
# total number of elements = rows × columns (4 × 3 = 12)

12

In [83]:
 df.ndim  # number of dimensions (1-D, 2-D, ...); 2 here: rows and columns

2

In [30]:
# Boolean mask with the same shape as df: True where a value is missing (none yet).
df.isnull()

Unnamed: 0,Name,Age,Grade
0,False,False,False
1,False,False,False
2,False,False,False
3,False,False,False


In [75]:
# Summing the mask counts the True entries, i.e. missing values per column.
df.isnull().sum()

Name     0
Age      0
Grade    0
dtype: int64

In [77]:
# Opposite of isnull(): True where a value is present.
df.notnull()

Unnamed: 0,Name,Age,Grade
0,True,True,True
1,True,True,True
2,True,True,True
3,True,True,True


In [79]:
# Number of non-missing values in each column.
df.notnull().sum()

Name     4
Age      4
Grade    4
dtype: int64

In [32]:
# Work on a separate copy so the NaN experiments on frame1 leave df as it is.
frame1 = df.copy()

In [34]:
# Show the copy; at this point it has the same contents as df.
frame1

Unnamed: 0,Name,Age,Grade
0,Alice,22,A
1,Bob,23,B
2,Charlie,22,A
3,David,24,C


In [36]:
# NOTE: there are no call parentheses here, so this evaluates to the bound method
# object itself rather than the per-column counts -- write .sum() to actually call it.
frame1.isnull().sum

<bound method DataFrame.sum of     Name    Age  Grade
0  False  False  False
1  False  False  False
2  False  False  False
3  False  False  False>

In [38]:
# Calling sum() (with parentheses) returns the number of missing values per column.
frame1.isnull().sum()

Name     0
Age      0
Grade    0
dtype: int64

In [50]:
# Add NaN to some values so the missing-data methods have something to report.
# Age starts out as int64 and NaN is a float, so convert the column to float64
# explicitly first: recent pandas releases flag assignments that silently
# upcast a column's dtype, and the explicit cast gives the same result everywhere.
frame1['Age'] = frame1['Age'].astype('float64')
frame1.loc[1, 'Age'] = np.nan  # Add NaN to Bob's Age
frame1.loc[3, 'Grade'] = np.nan  # Add NaN to David's Grade
print("Updated DataFrame:")
print(frame1)

Updated DataFrame:
      Name   Age Grade
0    Alice  22.0     A
1      Bob   NaN     B
2  Charlie  22.0     A
3    David  24.0   NaN


In [52]:
# Tally the missing entries in every column of frame1.
nan_sum = frame1.isnull().sum()

# One print call: heading (after a blank line), then the per-column counts.
print("\nSum of all NaN values in each column:", nan_sum, sep="\n")



Sum of all NaN values in each column:
Name     0
Age      1
Grade    1
dtype: int64


In [None]:
# | Axis | Refers to   | Direction        | Example use case                                        |
# |------|-------------|------------------|---------------------------------------------------------|
# | 0    | **Rows**    | Top to bottom ↓  | Dropping rows; sum(axis=0) gives one result per column  |
# | 1    | **Columns** | Left to right →  | Dropping columns; sum(axis=1) gives one result per row  |

In [54]:
# axis=0 (the default) adds down the rows, giving one missing-value count per column.
nan_sum_by_column = frame1.isnull().sum(axis=0)
print("\nSum of NaN values by column:", nan_sum_by_column, sep="\n")



Sum of NaN values by column:
Name     0
Age      1
Grade    1
dtype: int64


In [56]:
# axis=1 adds across the columns, giving one missing-value count per row.
nan_sum_by_row = frame1.isnull().sum(axis=1)
print("\nSum of NaN values by row:", nan_sum_by_row, sep="\n")



Sum of NaN values by row:
0    0
1    1
2    0
3    1
dtype: int64


In [58]:
# Current state of frame1, now with a missing Age for Bob and a missing Grade for David.
frame1


Unnamed: 0,Name,Age,Grade
0,Alice,22.0,A
1,Bob,,B
2,Charlie,22.0,A
3,David,24.0,


In [64]:

# Boolean mask of frame1: True marks the two cells that were set to NaN.
print(frame1.isnull())

    Name    Age  Grade
0  False  False  False
1  False   True  False
2  False  False  False
3  False  False   True


In [68]:
# isna() performs the same missing-value check as isnull() and yields the same mask.
print(frame1.isna())

    Name    Age  Grade
0  False  False  False
1  False   True  False
2  False  False  False
3  False  False   True


In [70]:
# fillna builds a new frame with every NaN replaced by 0; frame1 keeps its NaNs.
# The same 0 is used for all columns, so it also lands in the string column Grade.
frame1_filled = frame1.fillna(value=0)
print(frame1_filled)


      Name   Age Grade
0    Alice  22.0     A
1      Bob   0.0     B
2  Charlie  22.0     A
3    David  24.0     0


In [72]:
# Keep only the rows in which no value is missing (the defaults, spelled out:
# axis=0 drops rows, how='any' drops a row if at least one value is NaN).
frame1_dropped = frame1.dropna(axis=0, how='any')
print(frame1_dropped)

# fillna and dropna are both pandas methods

      Name   Age Grade
0    Alice  22.0     A
2  Charlie  22.0     A
