## Data Manipulation and Analysis with Pandas
Data Manipulation and analysis are key task in any data science or data analysis project. Pandas provide a wide range of functions for data manipulation and analysis making it easier to clean, transform, and extract insight from data. In this lesson , we will cover various data manipulation and analysis technique using pandas

In [2]:
import pandas as pd
df=pd.read_csv("data.csv")
df.head(5)

Unnamed: 0,Date,Category,Value,Product,Sales,Region
0,2023-01-01,A,28.0,Product1,754.0,East
1,2023-01-02,B,39.0,Product3,110.0,North
2,2023-01-03,C,32.0,Product2,398.0,East
3,2023-01-04,B,8.0,Product1,522.0,East
4,2023-01-05,B,26.0,Product3,869.0,North


In [4]:
df.dtypes

Date         object
Category     object
Value       float64
Product      object
Sales       float64
Region       object
dtype: object

#### Handling Missing data

In [None]:
#Handling Missing data

df.isnull() # will give me true whereever there is false
#In pandas, df.isnull().any(axis=1) returns row-wise information because axis=1 so any operation is applied across columns
df.isnull().any(axis=1) # will give row which will have true
df.isnull().any() # will give row which will have true

Date        False
Category    False
Value        True
Product     False
Sales        True
Region      False
dtype: bool

In [None]:
df.isnull() # will give me true whereever there is false
#In pandas, df.sum().any(axis=1) returns row-wise information because axis=1 so any operation is applied across columns
df.isnull().sum(axis=1) # will give row which will have true
df.isnull().sum() # will give row which will have true

Date        0
Category    0
Value       3
Product     0
Sales       4
Region      0
dtype: int64

In [15]:
#Filling with zero
df_filled=df.fillna(0)
df_filled.head(5)

Unnamed: 0,Sales Date,Category,Value,Product,Sales,Region,Sales_mean,Value_new,New Value
0,2023-01-01,A,28.0,Product1,754.0,East,754.0,28,30.8
1,2023-01-02,B,39.0,Product3,110.0,North,110.0,39,42.9
2,2023-01-03,C,32.0,Product2,398.0,East,398.0,32,35.2
3,2023-01-04,B,8.0,Product1,522.0,East,522.0,8,8.8
4,2023-01-05,B,26.0,Product3,869.0,North,869.0,26,28.6


In [16]:
#Filling with mean
df_filled_mean=df
#We can apply fill na on purticular column here we are setting fill na on sales using its mean
df_filled_mean["Sales_mean"]=df["Sales"].fillna(df["Sales"].mean())
df_filled_mean.head(5)

Unnamed: 0,Sales Date,Category,Value,Product,Sales,Region,Sales_mean,Value_new,New Value
0,2023-01-01,A,28.0,Product1,754.0,East,754.0,28,30.8
1,2023-01-02,B,39.0,Product3,110.0,North,110.0,39,42.9
2,2023-01-03,C,32.0,Product2,398.0,East,398.0,32,35.2
3,2023-01-04,B,8.0,Product1,522.0,East,522.0,8,8.8
4,2023-01-05,B,26.0,Product3,869.0,North,869.0,26,28.6


#### Column Transformation

In [17]:
#Change the Column_name
df=df.rename(columns= {"Date":"Sales Date"})
df.head(5)

Unnamed: 0,Sales Date,Category,Value,Product,Sales,Region,Sales_mean,Value_new,New Value
0,2023-01-01,A,28.0,Product1,754.0,East,754.0,28,30.8
1,2023-01-02,B,39.0,Product3,110.0,North,110.0,39,42.9
2,2023-01-03,C,32.0,Product2,398.0,East,398.0,32,35.2
3,2023-01-04,B,8.0,Product1,522.0,East,522.0,8,8.8
4,2023-01-05,B,26.0,Product3,869.0,North,869.0,26,28.6


In [18]:
#Changing datatype
df["Value_new"]=df["Value"].fillna(df["Value"].mean()).astype(int)
df.head(5)

Unnamed: 0,Sales Date,Category,Value,Product,Sales,Region,Sales_mean,Value_new,New Value
0,2023-01-01,A,28.0,Product1,754.0,East,754.0,28,30.8
1,2023-01-02,B,39.0,Product3,110.0,North,110.0,39,42.9
2,2023-01-03,C,32.0,Product2,398.0,East,398.0,32,35.2
3,2023-01-04,B,8.0,Product1,522.0,East,522.0,8,8.8
4,2023-01-05,B,26.0,Product3,869.0,North,869.0,26,28.6


In [19]:
# Increase value by 10 percent
# You can apply custom function using lambda
df["New Value"]=df["Value"].apply(lambda m: m*1.1)
df.head(5)

Unnamed: 0,Sales Date,Category,Value,Product,Sales,Region,Sales_mean,Value_new,New Value
0,2023-01-01,A,28.0,Product1,754.0,East,754.0,28,30.8
1,2023-01-02,B,39.0,Product3,110.0,North,110.0,39,42.9
2,2023-01-03,C,32.0,Product2,398.0,East,398.0,32,35.2
3,2023-01-04,B,8.0,Product1,522.0,East,522.0,8,8.8
4,2023-01-05,B,26.0,Product3,869.0,North,869.0,26,28.6


#### Aggregation


In [None]:
#applying aggregation on 2 columns
grouped_df=df.groupby("Product")[["Value","Sales"]].mean()
grouped_df



Unnamed: 0_level_0,Value,Sales
Product,Unnamed: 1_level_1,Unnamed: 2_level_1
Product1,46.214286,574.866667
Product2,52.8,567.230769
Product3,55.166667,535.055556


In [26]:
#applying multiple aggregation on 2 columns
grouped_df1=df.groupby("Product")[["Value","Sales"]].agg(["mean","sum"])
grouped_df1


Unnamed: 0_level_0,Value,Value,Sales,Sales
Unnamed: 0_level_1,mean,sum,mean,sum
Product,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
Product1,46.214286,647.0,574.866667,8623.0
Product2,52.8,792.0,567.230769,7374.0
Product3,55.166667,993.0,535.055556,9631.0


In [29]:
#applying different aggregation on 2 columns
grouped_df2=df.groupby("Product").agg({"Value":"mean","Sales":"sum"})

grouped_df2

Unnamed: 0_level_0,Value,Sales
Product,Unnamed: 1_level_1,Unnamed: 2_level_1
Product1,46.214286,8623.0
Product2,52.8,7374.0
Product3,55.166667,9631.0


In [30]:
#applying different aggregation on 2 columns
grouped_df3=df.groupby("Product").agg({"Value":["mean","count"],"Sales":"sum"})

grouped_df3

Unnamed: 0_level_0,Value,Value,Sales
Unnamed: 0_level_1,mean,count,sum
Product,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
Product1,46.214286,14,8623.0
Product2,52.8,15,7374.0
Product3,55.166667,18,9631.0


#### Merging and join

In [None]:
df_orders = pd.DataFrame({
    "OrderID": [101, 102, 103, 104],
    "CustomerID": [1, 2, 2, 5],  # Note: CustomerID 5 doesn't exist in df_customers
    "Amount": [250, 150, 300, 400]
})
df_customers = pd.DataFrame({
    "CustomerID": [1, 2, 3, 4],
    "Name": ["Alice", "Bob", "Charlie", "David"],
    "City": ["Mumbai", "Delhi", "Bangalore", "Chennai"]
})


Unnamed: 0,CustomerID,Name,City
0,1,Alice,Mumbai
1,2,Bob,Delhi
2,3,Charlie,Bangalore
3,4,David,Chennai


In [35]:
#merge
pd.merge(df_customers,df_orders,on="CustomerID",how="inner")

Unnamed: 0,CustomerID,Name,City,OrderID,Amount
0,1,Alice,Mumbai,101,250
1,2,Bob,Delhi,102,150
2,2,Bob,Delhi,103,300
