In [2]:
import pandas as pd
import numpy as np

In [4]:
# Source 1: customer master data. One name and one age are deliberately missing
# so the cleaning steps below have something to act on.
customer_columns = {
    "customer_id": [1, 2, 3, 4, 5],
    "name": ["Ram", "Hari", "Mohan", "Gita", None],
    "age": [25, np.nan, 32, 45, 28],
}
customers = pd.DataFrame(customer_columns)

# Source 2: order records. Amounts arrive as strings, and one order has no
# customer_id while another has no amount.
order_columns = {
    "orders_id": [101, 102, 103, 104],
    "customer_id": [1, 2, 3, None],
    "amount": ["200", "350", None, "400"],
}
orders = pd.DataFrame(order_columns)

# Show the raw orders before any cleaning is applied.
print("\nOriginal orders data:", orders, sep="\n")


Original orders data:
   orders_id  customer_id amount
0        101          1.0    200
1        102          2.0    350
2        103          3.0   None
3        104          NaN    400


In [6]:
# Cleaning step: drop customers whose name is missing.
# The explicit .copy() makes customer_clean an independent frame, so later
# column assignments on it do not trigger SettingWithCopyWarning and can never
# write back into `customers`.
customer_clean = customers.dropna(subset=["name"]).copy()


In [15]:
# Replace missing ages with the mean of the known ages (.mean() skips NaN).
mean_age = customer_clean["age"].mean()
# Rebind to a new frame via .assign instead of writing through .loc:
# customer_clean was produced by dropna on another frame, and assigning into
# such a derived frame can emit SettingWithCopyWarning.
customer_clean = customer_clean.assign(age=customer_clean["age"].fillna(mean_age))
print(customer_clean)



   customer_id   name   age
0            1    Ram  25.0
1            2   Hari  34.0
2            3  Mohan  32.0
3            4   Gita  45.0


In [17]:
# Parse the string amounts as numbers. errors="coerce" turns anything
# unparseable into NaN, and every NaN (including the originally missing
# amount) is then replaced with 0.
amount_numeric = pd.to_numeric(orders["amount"], errors="coerce")
orders["amount"] = amount_numeric.fillna(0)

# Show both cleaned sources.
print("\n after cleaning:")
for cleaned_frame in (customer_clean, orders):
    print(cleaned_frame)


 after cleaning:
   customer_id   name   age
0            1    Ram  25.0
1            2   Hari  34.0
2            3  Mohan  32.0
3            4   Gita  45.0
   orders_id  customer_id  amount
0        101          1.0   200.0
1        102          2.0   350.0
2        103          3.0     0.0
3        104          NaN   400.0


In [19]:
# Data integration: attach order columns to each cleaned customer.
# A left join keeps every customer, so customers without an order get NaN in
# the order columns, and orders without a matching customer_id are not included.
merged_data = customer_clean.merge(orders, how="left", on="customer_id")
print("merged (integrated)dataset:")
print(merged_data)

merged (integrated)dataset:
   customer_id   name   age  orders_id  amount
0            1    Ram  25.0      101.0   200.0
1            2   Hari  34.0      102.0   350.0
2            3  Mohan  32.0      103.0     0.0
3            4   Gita  45.0        NaN     NaN


In [22]:
# Data transformation: derive a new feature, the order amount in thousands.
# Rows with no order (NaN amount) stay NaN.
merged_data["amount_k"] = merged_data["amount"].div(1000)

In [26]:
# Min-max scale age into the range 0 to 1.
age_min = merged_data["age"].min()
age_range = merged_data["age"].max() - age_min
if age_range == 0:
    # Every known age is identical: dividing by the zero range would give NaN
    # for all rows, so map known ages to 0.0 instead (missing ages stay NaN).
    merged_data["age_normalized"] = merged_data["age"] * 0.0
else:
    merged_data["age_normalized"] = (merged_data["age"] - age_min) / age_range

In [None]:
# Categorize ages into labelled groups.
# pd.cut uses right-closed intervals by default, so the groups are (0, 30] and
# (30, 60]; any age outside those intervals gets NaN as its group.
age_bin_edges = [0, 30, 60]
age_bin_labels = ["young", "adults"]
merged_data["age_group"] = pd.cut(
    merged_data["age"], bins=age_bin_edges, labels=age_bin_labels
)