In [4]:
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.preprocessing import StandardScaler

# Load the iris dataset
df = pd.read_csv('Iris.csv', 
                 names=['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'species'])

df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
Id,SepalLengthCm,SepalWidthCm,PetalLengthCm,PetalWidthCm,Species
1,5.1,3.5,1.4,0.2,Iris-setosa
2,4.9,3.0,1.4,0.2,Iris-setosa
3,4.7,3.2,1.3,0.2,Iris-setosa
4,4.6,3.1,1.5,0.2,Iris-setosa


In [5]:
# Normalize the variables
scaler = MinMaxScaler()
df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']] = scaler.fit_transform(df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']])

# Check the result
df.head()

ValueError: could not convert string to float: 'SepalLengthCm'

In [15]:
# Encode the species variable
le = LabelEncoder()
df['species'] = le.fit_transform(df['species'])

# Check the result
df.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,0.222222,0.625,0.067797,0.041667,0
1,0.166667,0.416667,0.067797,0.041667,0
2,0.111111,0.5,0.050847,0.041667,0
3,0.083333,0.458333,0.084746,0.041667,0
4,0.194444,0.666667,0.067797,0.041667,0


In [16]:
# Aggregate the data by species and get summary statistics
agg_df = df.groupby(['species']).agg({'sepal_length': 'mean', 
                                      'sepal_width': 'mean', 
                                      'petal_length': 'mean', 
                                      'petal_width': 'mean'})

# Check the result
print(agg_df)

         sepal_length  sepal_width  petal_length  petal_width
species                                                      
0            0.196111     0.590833      0.078644     0.060000
1            0.454444     0.320833      0.552542     0.510833
2            0.635556     0.405833      0.771525     0.802500


In [24]:
# Log-transform the sepal length and width
df['sepal_length'] = np.log(df['sepal_length'])
df['sepal_width'] = np.log(df['sepal_width'])

# Check the result
print(df.head(20))

    sepal_length  sepal_width  petal_length  petal_width      species
0       1.629241     1.252763           1.4          0.2  Iris-setosa
1       1.589235     1.098612           1.4          0.2  Iris-setosa
2       1.547563     1.163151           1.3          0.2  Iris-setosa
3       1.526056     1.131402           1.5          0.2  Iris-setosa
4       1.609438     1.280934           1.4          0.2  Iris-setosa
5       1.686399     1.360977           1.7          0.4  Iris-setosa
6       1.526056     1.223775           1.4          0.3  Iris-setosa
7       1.609438     1.223775           1.5          0.2  Iris-setosa
8       1.481605     1.064711           1.4          0.2  Iris-setosa
9       1.589235     1.131402           1.5          0.1  Iris-setosa
10      1.686399     1.308333           1.5          0.2  Iris-setosa
11      1.568616     1.223775           1.6          0.2  Iris-setosa
12      1.568616     1.098612           1.4          0.1  Iris-setosa
13      1.458615    

In [25]:
# Scale the variables
scaler = StandardScaler()
df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']] = scaler.fit_transform(df[['sepal_length', 'sepal_width', 'petal_length', 'petal_width']])

# Check the result
print(df.head())

   sepal_length  sepal_width  petal_length  petal_width      species
0     -0.896493     1.031400     -1.341272    -1.312977  Iris-setosa
1     -1.180788    -0.055193     -1.341272    -1.312977  Iris-setosa
2     -1.476932     0.399733     -1.398138    -1.312977  Iris-setosa
3     -1.629764     0.175939     -1.284407    -1.312977  Iris-setosa
4     -1.037219     1.229974     -1.341272    -1.312977  Iris-setosa


In [None]:
# Try how to perform data integration (data merging)
# Customer information dataset
customer_id  name         address       phone_number
1            John Doe     123 Main St   555-555-5555
2            Jane Doe     456 Oak Ave   555-555-5556
3            John Smith   789 Birch Rd  555-555-5557

# Purchase information dataset
purchase_id  customer_id  purchase_date  product
1            1            2022-01-01     T-Shirt
2            2            2022-02-01     Hat
3            1            2022-03-01     Shoes
4            3            2022-04-01     Pants
