In [14]:
import pandas as pd  # No need for numpy

# Small frame with one categorical column to demonstrate one-hot encoding
data = dict(
    ID=[1, 2, 3],
    color=['Red', 'Green', 'Blue'],
)
df = pd.DataFrame(data)

# Replace 'color' with one indicator column per distinct value
# (color_Blue, color_Green, color_Red); 'ID' is passed through unchanged.
columns_to_encode = ['color']
df_encoded = pd.get_dummies(data=df, prefix='color', columns=columns_to_encode)

print(df_encoded)


   ID  color_Blue  color_Green  color_Red
0   1       False        False       True
1   2       False         True      False
2   3        True        False      False


In [15]:
from datetime import datetime
import pandas as pd

def calculate_age(date_of_birth, as_of=None):
    """Return the age in completed calendar years for each date of birth.

    Dividing an elapsed-day count by 365 ignores leap days and reports a
    birthday up to about a week early, so the age is derived from the
    calendar instead: the difference in years, minus one when this year's
    birthday has not been reached yet.

    Parameters
    ----------
    date_of_birth : pandas.Series
        Series of datetime64 values; NaT entries are allowed.
    as_of : date-like, optional
        Reference date the age is measured at. Defaults to
        ``pd.Timestamp.now()``.

    Returns
    -------
    pandas.Series
        Nullable ``Int64`` ages, ``<NA>`` wherever the date of birth is NaT.
        Someone born on 29 February turns a year older on 1 March in
        non-leap years.
    """
    as_of = pd.Timestamp.now() if as_of is None else pd.Timestamp(as_of)

    year_difference = as_of.year - date_of_birth.dt.year

    # True where the birthday falls later in the reference year than `as_of`.
    # Comparisons against NaT evaluate to False, so missing dates stay missing
    # through `year_difference` rather than being shifted by one.
    birthday_pending = (
        (date_of_birth.dt.month > as_of.month)
        | ((date_of_birth.dt.month == as_of.month)
           & (date_of_birth.dt.day > as_of.day))
    )

    # Int64 keeps whole-number ages even when some entries are missing.
    return (year_difference - birthday_pending.astype(int)).astype('Int64')


data = {'ID': [1, 2, 3],
        'DateOfBirth': ['1990-05-15', '1985-10-20', '2000-03-08']}

df = pd.DataFrame(data)

# Convert 'DateOfBirth' to datetime; unparseable values become NaT
df['DateOfBirth'] = pd.to_datetime(df['DateOfBirth'], errors='coerce')

# Reference date the ages are measured against
current_date = pd.Timestamp.now()

# Calculate age in completed years
df['Age'] = calculate_age(df['DateOfBirth'], current_date)

print(df)


   ID DateOfBirth  Age
0   1  1990-05-15   34
1   2  1985-10-20   39
2   3  2000-03-08   24


# Aggregation

Aggregation summarizes data at a higher level (for example, total sales per category) to simplify analysis and reporting.

In [16]:
# Toy sales ledger: two categories with two sales each
data = dict(
    ID=[1, 2, 3, 4],
    Category=['A', 'B', 'A', 'B'],
    Sales=[100, 150, 170, 200],
)
df = pd.DataFrame(data)

# Total sales per category; as_index=False keeps 'Category' as a regular
# column so the result is a flat two-column frame.
df_aggregated = df.groupby('Category', as_index=False)['Sales'].sum()

print(df_aggregated)

  Category  Sales
0        A    270
1        B    350


# Dimensionality Reduction

Dimensionality reduction is a technique for reducing the number of features in a dataset while preserving its essential information.

# Principal Component Analysis

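Principal Component Analysis (PCA) reduces the number of dimensions in a large dataset by projecting it onto the directions (principal components) that capture the most variance.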


In [17]:
from sklearn.decomposition import PCA

# Sample frame with two features; here Feature2 is exactly 6 - Feature1,
# so all of the variance lies along a single direction.
data = dict(
    Feature1=[1, 2, 3, 4, 5],
    Feature2=[5, 4, 3, 2, 1],
)
df = pd.DataFrame(data)

# Fit PCA keeping both components and project the rows onto them
pca = PCA(n_components=2)
principalComponents = pca.fit_transform(df)

# Wrap the projected coordinates in a labelled DataFrame
df_pca = pd.DataFrame(principalComponents, columns=['PC1', 'PC2'])
print(df_pca)

        PC1           PC2
0 -2.828427 -2.941441e-16
1 -1.414214  9.804804e-17
2  0.000000 -0.000000e+00
3  1.414214 -9.804804e-17
4  2.828427 -1.960961e-16


In [18]:
import pandas as pd

# Sample customer records: one identifier, one categorical and two numeric columns
data = dict(
    Customer_ID=[101, 102, 103, 104, 105],
    Gender=['Male', 'Female', 'Female', 'Male', 'Female'],
    Age=[25, 34, 40, 29, 50],
    Spending_Score=[60, 75, 40, 55, 30],
)

df = pd.DataFrame(data)


In [19]:
from sklearn.decomposition import PCA

# Expand 'Gender' into one indicator column per value (Gender_Female, Gender_Male)
df_encoded = pd.get_dummies(df, columns=['Gender'])

# PCA is applied to the numeric features only
numeric_columns = ['Age', 'Spending_Score']
df_numerical = df_encoded[numeric_columns]

# Project the two numeric features onto a single principal component
pca = PCA(n_components=1)
pc_scores = pca.fit_transform(df_numerical)
df_pca = pd.DataFrame(pc_scores, columns=['PC1'])

# Attach PC1 to the encoded frame; join aligns the rows on the index
df_encoded = df_encoded.join(df_pca)

# Keep the identifier, one gender indicator, and the component score
final_columns = ['Customer_ID', 'Gender_Male', 'PC1']
df_final = df_encoded[final_columns]

print(df_final)


   Customer_ID  Gender_Male        PC1
0          101         True  11.753084
1          102        False  21.490639
2          103        False -12.731133
3          104         True   5.525541
4          105        False -26.038131
