**PRACTICAL 2**

In [2]:
import pandas as pd
import numpy as np
#Sample Datasets for DataIntegration
data_1={
    'ID':[1,2,3,4,5,6,7],
    'Name':['Alice','Bob','Charlie','David','Rahul','Modi','Lilly'],
    'Age':[24,30,40,28,55,35,60]
}

data_2={
    'ID':[1,2,4,6,3,5,8],
    'Gender':['F','M','M','F','M','M','F'],
    'Salary':[70000,80000,60000,90000,45000,50000,82000]
}
df1=pd.DataFrame(data_1)
df2=pd.DataFrame(data_2)

In [3]:
#data Integration
#Tight Coupling(Joins dataset on a common key)
tight_coupling=pd.merge(df1,df2,on='ID',how='inner')
print("Tight Coupling Result:\n", tight_coupling)


Tight Coupling Result:
    ID     Name  Age Gender  Salary
0   1    Alice   24      F   70000
1   2      Bob   30      M   80000
2   3  Charlie   40      M   45000
3   4    David   28      M   60000
4   5    Rahul   55      M   50000
5   6     Modi   35      F   90000


In [4]:
#loose Coupling (concatenate datasets)
loose_coupling=pd.concat([df1.set_index('ID'),df2.set_index('ID')],axis=1).reset_index()
print("Loose Coupling Result:\n", loose_coupling)

Loose Coupling Result:
    ID     Name   Age Gender   Salary
0   1    Alice  24.0      F  70000.0
1   2      Bob  30.0      M  80000.0
2   3  Charlie  40.0      M  45000.0
3   4    David  28.0      M  60000.0
4   5    Rahul  55.0      M  50000.0
5   6     Modi  35.0      F  90000.0
6   7    Lilly  60.0    NaN      NaN
7   8      NaN   NaN      F  82000.0


In [5]:
#Data Transformation
##smoothing (Moving Average for Age )
loose_coupling['Smoothed_Age']=loose_coupling['Age'].rolling(window=2,min_periods=1).mean()
print("\nSmoothing:\n", loose_coupling[['ID', 'Age', 'Smoothed_Age']])


Smoothing:
    ID   Age  Smoothed_Age
0   1  24.0          24.0
1   2  30.0          27.0
2   3  40.0          35.0
3   4  28.0          34.0
4   5  55.0          41.5
5   6  35.0          45.0
6   7  60.0          47.5
7   8   NaN          60.0


In [6]:
##Aggregation (summarizing salary by gender)
aggregation=loose_coupling.groupby('Gender')['Salary'].sum().reset_index()
print("\nAggregation:\n", aggregation)



Aggregation:
   Gender    Salary
0      F  242000.0
1      M  235000.0


In [7]:
##Discretization (Binning Age into categories)
bins=[0,20,30,40,50]
labels=['Teen','Young Adult','Adult','Senior']
loose_coupling['Age_Group'] = pd.cut(loose_coupling['Age'], bins=bins, labels=labels)
print("\nDiscretization:\n", loose_coupling[['ID', 'Age', 'Age_Group']])



Discretization:
    ID   Age    Age_Group
0   1  24.0  Young Adult
1   2  30.0  Young Adult
2   3  40.0        Adult
3   4  28.0  Young Adult
4   5  55.0          NaN
5   6  35.0        Adult
6   7  60.0          NaN
7   8   NaN          NaN


In [8]:
##Attribute Construction (Creating Age-salary Ratio)
loose_coupling['Age_Salary_Ratio'] = loose_coupling['Age'] / loose_coupling['Salary']
print("\nAttribute Construction:\n", loose_coupling[['ID', 'Age', 'Salary', 'Age_Salary_Ratio']])



Attribute Construction:
    ID   Age   Salary  Age_Salary_Ratio
0   1  24.0  70000.0          0.000343
1   2  30.0  80000.0          0.000375
2   3  40.0  45000.0          0.000889
3   4  28.0  60000.0          0.000467
4   5  55.0  50000.0          0.001100
5   6  35.0  90000.0          0.000389
6   7  60.0      NaN               NaN
7   8   NaN  82000.0               NaN


In [11]:
##Generalization 
salary_bins=[50000,70000,80000,90000]
salary_labels=['Low','Medium','High']
loose_coupling['Salary_Range'] = pd.cut(loose_coupling['Salary'], bins=salary_bins, labels=salary_labels)
print("\nGeneralization:\n", loose_coupling[['ID', 'Salary', 'Salary_Range']])



Generalization:
    ID   Salary Salary_Range
0   1  70000.0          Low
1   2  80000.0       Medium
2   3  45000.0          NaN
3   4  60000.0          Low
4   5  50000.0          NaN
5   6  90000.0         High
6   7      NaN          NaN
7   8  82000.0         High


In [12]:
##Normalization
###Min-Max NOrmalization
loose_coupling['Age_MinMax'] = (loose_coupling['Age'] - loose_coupling['Age'].min()) / (loose_coupling['Age'].max() - loose_coupling['Age'].min())
print("\nMin-Max Normalization:\n", loose_coupling[['ID', 'Age', 'Age_MinMax']])

     


Min-Max Normalization:
    ID   Age  Age_MinMax
0   1  24.0    0.000000
1   2  30.0    0.166667
2   3  40.0    0.444444
3   4  28.0    0.111111
4   5  55.0    0.861111
5   6  35.0    0.305556
6   7  60.0    1.000000
7   8   NaN         NaN


In [13]:
### Z-Score Normalization
loose_coupling['Age_ZScore'] = (loose_coupling['Age'] - loose_coupling['Age'].mean()) / loose_coupling['Age'].std()
print("\nZ-Score Normalization:\n", loose_coupling[['ID', 'Age', 'Age_ZScore']])



Z-Score Normalization:
    ID   Age  Age_ZScore
0   1  24.0   -1.077445
1   2  30.0   -0.642323
2   3  40.0    0.082880
3   4  28.0   -0.787364
4   5  55.0    1.170686
5   6  35.0   -0.279721
6   7  60.0    1.533288
7   8   NaN         NaN


In [14]:
###Decimal Sacaling
scaling_factor = 10 ** np.ceil(np.log10(loose_coupling['Age'].abs().max()))
loose_coupling['Age_DecimalScaling'] = loose_coupling['Age'] / scaling_factor
print("\nDecimal Scaling:\n", loose_coupling[['ID', 'Age', 'Age_DecimalScaling']])



Decimal Scaling:
    ID   Age  Age_DecimalScaling
0   1  24.0                0.24
1   2  30.0                0.30
2   3  40.0                0.40
3   4  28.0                0.28
4   5  55.0                0.55
5   6  35.0                0.35
6   7  60.0                0.60
7   8   NaN                 NaN
