In [1]:
import pandas as pd
import numpy as np


In [7]:
# Two toy tables sharing the 'ID' key; only IDs 3 and 4 appear in both.
data_1 = dict(
    ID=[1, 2, 3, 4],
    Name=['dhruvi', 'aadi', 'shubh', 'varsha'],
    Age=[21, 30, 35, 40],
)

data_2 = dict(
    ID=[3, 4, 5, 6],
    Gender=['M', 'F', 'F', 'F'],
    Salary=[70000, 80000, 90000, 50000],
)


In [8]:
# Build one DataFrame per raw dict (dict keys become the column names).
df1, df2 = (pd.DataFrame(table) for table in (data_1, data_2))

In [9]:
#data integration
#tight coupling: inner join on the shared 'ID' key, so only IDs present
#in both frames survive
tc = df1.merge(df2, on='ID', how='inner')
print("Tight coupling result:\n", tc)

Tight coupling result:
    ID    Name  Age Gender  Salary
0   3   shubh   35      M   70000
1   4  varsha   40      F   80000


In [11]:
#loose coupling: index both frames by 'ID' and place them side by side;
#the outer join keeps every ID from either frame and fills the gaps with NaN
lc = pd.concat(
    [frame.set_index('ID') for frame in (df1, df2)],
    axis=1,
    join='outer',
).reset_index()
print("loose coupling result:\n", lc)

loose coupling result:
    ID    Name   Age Gender   Salary
0   1  dhruvi  21.0    NaN      NaN
1   2    aadi  30.0    NaN      NaN
2   3   shubh  35.0      M  70000.0
3   4  varsha  40.0      F  80000.0
4   5     NaN   NaN      F  90000.0
5   6     NaN   NaN      F  50000.0


In [12]:
#data transformation
#smoothing: 2-row moving average of Age; min_periods=1 means a window with
#only one non-NaN value still yields that value as its mean
age_window = lc['Age'].rolling(window=2, min_periods=1)
lc['Smoothed_Age'] = age_window.mean()
print("\nSmoothing:\n", lc[['ID', 'Age', 'Smoothed_Age']])



Smoothing:
    ID   Age  Smoothed_Age
0   1  21.0          21.0
1   2  30.0          25.5
2   3  35.0          32.5
3   4  40.0          37.5
4   5   NaN          40.0
5   6   NaN           NaN


In [13]:
#aggregation: total Salary per Gender (rows whose Gender is NaN are not
#part of any group and are therefore left out)
salary_by_gender = lc.groupby('Gender')['Salary']
agg = salary_by_gender.agg('sum').reset_index()
print("\nAggregation:\n", agg)


Aggregation:
   Gender    Salary
0      F  220000.0
1      M   70000.0


In [14]:
#discretization: bucket Age into labelled, right-inclusive intervals
#(0,20], (20,30], (30,40], (40,50]; NaN ages stay NaN
bins = [0, 20, 30, 40, 50]
labels = ['Teen', 'Young adult', 'Adult', 'Senior']
lc['Age_Group'] = pd.cut(x=lc['Age'], bins=bins, labels=labels, right=True)
print("\nDiscretization\n", lc[['ID', 'Age', 'Age_Group']])


Discretization
    ID   Age    Age_Group
0   1  21.0  Young adult
1   2  30.0  Young adult
2   3  35.0        Adult
3   4  40.0        Adult
4   5   NaN          NaN
5   6   NaN          NaN


In [15]:
#attribute construction: derive a new column as Age divided by Salary;
#the ratio is NaN wherever either input is NaN
lc['Age_Salary_Ratio'] = lc['Age'].div(lc['Salary'])
print("\nAttribute construction\n", lc[['ID', 'Age', 'Salary', 'Age_Salary_Ratio']])


Attribute construction
    ID   Age   Salary  Age_Salary_Ratio
0   1  21.0      NaN               NaN
1   2  30.0      NaN               NaN
2   3  35.0  70000.0            0.0005
3   4  40.0  80000.0            0.0005
4   5   NaN  90000.0               NaN
5   6   NaN  50000.0               NaN


In [17]:
#generalization: replace exact salaries with coarse, right-inclusive bands
#(0,60000] -> Low, (60000,80000] -> Medium, (80000,100000] -> High
s_bins = [0, 60000, 80000, 100000]
s_labels = ['Low', 'Medium', 'High']
lc['Salary_Range'] = pd.cut(x=lc['Salary'], bins=s_bins, labels=s_labels, right=True)
print("\nGeneralization\n", lc[['ID', 'Salary', 'Salary_Range']])


Generalization
    ID   Salary Salary_Range
0   1      NaN          NaN
1   2      NaN          NaN
2   3  70000.0       Medium
3   4  80000.0       Medium
4   5  90000.0         High
5   6  50000.0          Low


In [20]:
#normalization
#min-max: rescale Age linearly so the smallest age maps to 0 and the largest to 1
age_min = lc['Age'].min()
age_span = lc['Age'].max() - age_min
lc['Age_minmax'] = (lc['Age'] - age_min) / age_span
print("\nminmax normalization:\n", lc[['ID', 'Age', 'Age_minmax']])


minmax normalization:
    ID   Age  Age_minmax
0   1  21.0    0.000000
1   2  30.0    0.473684
2   3  35.0    0.736842
3   4  40.0    1.000000
4   5   NaN         NaN
5   6   NaN         NaN


In [21]:
#z score normalization: centre Age on its mean and scale by its sample
#standard deviation (ddof=1, which is the pandas default)
age_mean = lc['Age'].mean()
age_std = lc['Age'].std(ddof=1)
lc['Age_zscore'] = lc['Age'].sub(age_mean).div(age_std)
print("\n z score normalization:\n", lc[['ID', 'Age', 'Age_zscore']])


 z score normalization:
    ID   Age  Age_zscore
0   1  21.0   -1.295737
1   2  30.0   -0.185105
2   3  35.0    0.431912
3   4  40.0    1.048930
4   5   NaN         NaN
5   6   NaN         NaN


In [23]:
#decimal scaling: divide Age by 10**j, where j is the smallest integer that
#makes max(|Age|) / 10**j strictly less than 1
max_abs_age = lc['Age'].abs().max()
if max_abs_age > 0:
    # floor(log10(m)) + 1 rather than ceil(log10(m)): the two differ only when
    # m is an exact power of ten (e.g. 100), where ceil would give a factor
    # equal to m and leave the largest scaled value at exactly 1.0.
    scaling_factor = 10.0 ** (np.floor(np.log10(max_abs_age)) + 1)
else:
    # All-zero or all-NaN column: log10 is undefined there, so leave the
    # values unscaled instead of dividing by 0 or NaN.
    scaling_factor = 1.0
lc['Age_ds'] = lc['Age'] / scaling_factor
print("\nDEcimal Scaling:\n", lc[['ID', 'Age', 'Age_ds']])


DEcimal Scaling:
    ID   Age  Age_ds
0   1  21.0    0.21
1   2  30.0    0.30
2   3  35.0    0.35
3   4  40.0    0.40
4   5   NaN     NaN
5   6   NaN     NaN
