In [63]:
print("🚀 Starting Data Transformation Pipeline")
print("=" * 50)

🚀 Starting Data Transformation Pipeline


# Data Transformation Pipeline for MLOps Using SageMaker Jupyter Notebooks
`This notebook demonstrates key data transformation techniques commonly used in machine learning pipelines. It follows MLOps best practices for data preprocessing and feature engineering using AWS SageMaker JupyterLab.`

## 1. Environment Setup and Configuration
### 📦 Step 1: Setup Environment

In [64]:
# 📦 Step 1: Setup Environment
import sagemaker
import boto3
import pandas as pd
import os
from sagemaker.processing import ProcessingInput, ProcessingOutput, ScriptProcessor
import json
import numpy as np
from datetime import datetime

sagemaker_session = sagemaker.Session()
role = sagemaker.get_execution_role()
bucket = sagemaker_session.default_bucket()
region = boto3.Session().region_name

print(f"SageMaker Role: {role}")
print(f"Default Bucket: {bucket}")

SageMaker Role: arn:aws:iam::533267207758:role/cfst-4286-d6d7d92a5be8e8ac67-SageMakerExecutionRole-iHlDx6QURhhp
Default Bucket: sagemaker-us-east-1-533267207758


### ⚙️ Step 2: Data Generation
Creating a realistic dataset that simulates common data quality challenges found in production environments.

In [65]:
import pandas as pd
import numpy as np
import json
import random
from datetime import datetime, timedelta
import os

# Set random seed for reproducibility
np.random.seed(42)

# Number of records
num_records = 50000

# Generate random data
data = {
    "id": np.arange(1, num_records + 1),
    "name": [f"Name_{i}" for i in np.random.randint(1, 1000, num_records)],
    "age": np.random.randint(18, 80, num_records),
    "salary": np.random.choice([50000, 60000, 70000, None], num_records),
    "hire_date": [
        (datetime.now() - timedelta(days=random.randint(0, 3650))).strftime("%Y-%m-%d")
        if random.random() > 0.1 else None
        for _ in range(num_records)
    ],
    "profile": [
        json.dumps({
            "address": f"Street {random.randint(1, 100)}, City {random.randint(1, 50)}",
            "phone": f"{random.randint(1000000000, 9999999999)}",
            "email": f"email_{random.randint(1, 1000)}@example.com"
        })
        if random.random() > 0.1 else None
        for _ in range(num_records)
    ],
    "department": np.random.choice(["HR", "IT", "Finance", "Marketing", None], num_records),
    "bonus": [None if random.random() > 0.9 else random.randint(1000, 10000) for _ in range(num_records)]
}

# Create DataFrame
df = pd.DataFrame(data)

# Introduce some NaN values randomly
df.loc[np.random.choice(df.index, size=int(num_records * 0.05), replace=False), "age"] = np.nan
df.loc[np.random.choice(df.index, size=int(num_records * 0.1), replace=False), "salary"] = np.nan

# Ensure 'data' folder exists
os.makedirs("data", exist_ok=True)

# Save to CSV
df.to_csv("data/mock_data.csv", index=False)
print("Dataset created and uploaded to data/mock_data.csv")

Dataset created and uploaded to data/mock_data.csv


### ⚙️ Step 3: Upload Source Data to S3
Upload the source CSV dataset to input location in S3 (default bucket)  

In [66]:
s3 = boto3.resource('s3')
s3.meta.client.upload_file('data/mock_data.csv', bucket, 'input/mock_data.csv')
print(f"Dataset 'mock_data.csv' uploaded to: s3://{bucket}/input/mock_data.csv")

Dataset 'mock_data.csv' uploaded to: s3://sagemaker-us-east-1-533267207758/input/mock_data.csv


## 2. Data Exploration  
Load the raw dataset and perform initial data profiling. 
This step is crucial for understanding data quality and structure. 

### Step 1: Load the CSV File from S3 into the DataFrame

In [67]:
try:
    df = pd.read_csv(f's3://{bucket}/input/mock_data.csv')
    print(f"✅ Dataset loaded successfully!")
    print(f"📏 Dataset shape: {df.shape}")
except FileNotFoundError:
    print("❌ Error: mock_data.csv not found. Please run create_dataset.py first.")
    exit()

✅ Dataset loaded successfully!
📏 Dataset shape: (50000, 8)


### Step 2: Analyse the Data  
Perform comprehensive data analysis to understand:
- Data types and memory usage
- Missing values pattern
- Statistical distribution
- Unique values and categories

In [68]:
# Display the first 5 rows from the loaded DataFrame
print("\n📋 First 5 rows:")
df.head()


📋 First 5 rows:


Unnamed: 0,id,name,age,salary,hire_date,profile,department,bonus
0,1,Name_103,,,2022-03-04,"{""address"": ""Street 54, City 10"", ""phone"": ""99...",Marketing,5578.0
1,2,Name_436,24.0,70000.0,2023-01-30,"{""address"": ""Street 87, City 10"", ""phone"": ""33...",HR,9661.0
2,3,Name_861,33.0,,2022-11-03,"{""address"": ""Street 71, City 38"", ""phone"": ""61...",Marketing,9186.0
3,4,Name_271,50.0,50000.0,2024-02-28,"{""address"": ""Street 24, City 28"", ""phone"": ""59...",Finance,7530.0
4,5,Name_107,50.0,70000.0,2018-07-11,"{""address"": ""Street 33, City 49"", ""phone"": ""97...",Marketing,6772.0


In [69]:
# Get the summary of the DataFrame
print("\n📊 Data Types & Non-Null Counts:\n")
df.info()


📊 Data Types & Non-Null Counts:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   id          50000 non-null  int64  
 1   name        50000 non-null  object 
 2   age         47500 non-null  float64
 3   salary      33848 non-null  float64
 4   hire_date   45069 non-null  object 
 5   profile     44979 non-null  object 
 6   department  39884 non-null  object 
 7   bonus       45060 non-null  float64
dtypes: float64(3), int64(1), object(4)
memory usage: 3.1+ MB


In [70]:
# Check for duplicate rows
duplicates = df.duplicated().sum()
print(f"\n🔄 Duplicate rows: {duplicates}")


🔄 Duplicate rows: 0


In [71]:
# Check unique values in the department column
df['department'].unique()

array(['Marketing', 'HR', 'Finance', nan, 'IT'], dtype=object)

In [72]:
# View statistical summary for numeric coloums
print("\n📈 Statistical Summary:")
df.describe(include='all')


📈 Statistical Summary:


Unnamed: 0,id,name,age,salary,hire_date,profile,department,bonus
count,50000.0,50000,47500.0,33848.0,45069,44979,39884,45060.0
unique,,999,,,3651,44979,4,
top,,Name_794,,,2017-08-06,"{""address"": ""Street 5, City 28"", ""phone"": ""360...",IT,
freq,,73,,,27,1,10074,
mean,25000.5,,48.5044,59940.616875,,,,5506.777253
std,14433.901067,,17.90915,8166.619236,,,,2609.355309
min,1.0,,18.0,50000.0,,,,1000.0
25%,12500.75,,33.0,50000.0,,,,3253.0
50%,25000.5,,48.0,60000.0,,,,5505.5
75%,37500.25,,64.0,70000.0,,,,7759.0


In [73]:
# Check for missing values
print("\n❓ Missing Values Analysis:\n")
df.isnull().sum()


❓ Missing Values Analysis:



id                0
name              0
age            2500
salary        16152
hire_date      4931
profile        5021
department    10116
bonus          4940
dtype: int64

## 🧹 3. Data Cleaning & Quality Improvement

### Step 1: Handle Missing values of age, and salary
Handle missing values in age and salary columns using appropriate strategies:
- For age: Use median (robust to outliers)
- For salary: Use median (robust to outliers)

In [74]:
# Analyze missing patterns
print("\n📊 Missing Value Patterns:")
print("Missing Age values:")
print(df[df['age'].isnull()][['age', 'salary', 'department']])


📊 Missing Value Patterns:
Missing Age values:
       age   salary department
0      NaN      NaN  Marketing
17     NaN  50000.0         IT
41     NaN      NaN         IT
51     NaN  50000.0         HR
58     NaN  60000.0         IT
...    ...      ...        ...
49890  NaN  70000.0  Marketing
49943  NaN  50000.0         HR
49986  NaN      NaN    Finance
49987  NaN  60000.0         IT
49996  NaN      NaN        NaN

[2500 rows x 3 columns]


In [75]:
print("Missing Salary values")
print(df[df['salary'].isnull()][['age', 'salary', 'department']])

Missing Salary values
        age  salary department
0       NaN     NaN  Marketing
2      33.0     NaN  Marketing
6      60.0     NaN        NaN
11     29.0     NaN         HR
13     48.0     NaN        NaN
...     ...     ...        ...
49981  72.0     NaN  Marketing
49983  67.0     NaN         HR
49986   NaN     NaN    Finance
49992  43.0     NaN         HR
49996   NaN     NaN        NaN

[16152 rows x 3 columns]


In [76]:
# Get the median values for age, and salary
age_median = df['age'].median()
salary_median = df['salary'].median()
print("Age Median", age_median)
print("Salary Median", salary_median)

Age Median 48.0
Salary Median 60000.0


In [77]:
# Fill missing values of age with age_median
df['age'] = df['age'].fillna(age_median)
# Fill missing values of salary with salary_median
df['salary'] = df['salary'].fillna(salary_median)

#### Age & Salary columns missing values are filled with the respective median

In [78]:
# Verify the Age & Salary data
df.head()
# Check for missing values
print("Missing values in each column")
df.isnull().sum()

Missing values in each column


id                0
name              0
age               0
salary            0
hire_date      4931
profile        5021
department    10116
bonus          4940
dtype: int64

### Step 2: Handle Missing values of Department
Handle missing values in categorical columns:
- For department: Use 'Unknown' category
- This preserves the information that the department was missing

In [79]:
print("Print the missing values for Department\n")
print("Missing Department Missing values")
print(df[df['department'].isnull()][['age', 'salary', 'department']])

Print the missing values for Department

Missing Department Missing values
        age   salary department
6      60.0  60000.0        NaN
13     48.0  60000.0        NaN
14     29.0  60000.0        NaN
15     27.0  70000.0        NaN
16     57.0  70000.0        NaN
...     ...      ...        ...
49985  35.0  50000.0        NaN
49988  25.0  50000.0        NaN
49989  33.0  50000.0        NaN
49996  48.0  60000.0        NaN
49998  61.0  50000.0        NaN

[10116 rows x 3 columns]


In [80]:
# Fill the missing values in department with 'Unknown'
df['department'] = df['department'].fillna('Unknown')

#### Department column missing values are filled with the respective median

In [81]:
# Verify the Age & Salary data
df.head()
# Check for missing values
print("Missing values in each column")
print(df.isnull().sum())
# Check unique values in the department column
df['department'].unique()

Missing values in each column
id               0
name             0
age              0
salary           0
hire_date     4931
profile       5021
department       0
bonus         4940
dtype: int64


array(['Marketing', 'HR', 'Finance', 'Unknown', 'IT'], dtype=object)

### Step 3: Parse and Extract Profile Information
Devide Profile Column into 3 different columns i.e., Address, Phone, Email   

Parse JSON profile data and extract structured information:
- Extract address, phone, and email into separate columns
- Handle malformed JSON gracefully
- Maintain data integrity during extraction

In [82]:
print("Top rows from profile column \n")
print(df['profile'].head())

# Find the first non-null value in the column
profile_first_value = df['profile'].dropna().iloc[0]
# Print its type
print("\nProfile column values current data type")
print(type(profile_first_value))

# If your 'profile' column already contains Python dictionaries, not JSON strings.
# You do not need to parse it with json.loads(). The data is ready to be used directly.

# Convert profile JSON strings into dictionaries
df['profile'] = df['profile'].apply(lambda x: json.loads(x) if pd.notnull(x) else {})

Top rows from profile column 

0    {"address": "Street 54, City 10", "phone": "99...
1    {"address": "Street 87, City 10", "phone": "33...
2    {"address": "Street 71, City 38", "phone": "61...
3    {"address": "Street 24, City 28", "phone": "59...
4    {"address": "Street 33, City 49", "phone": "97...
Name: profile, dtype: object

Profile column values current data type
<class 'str'>


In [83]:
# Extract Address Field
print("Extract Address Field....\n")
# Create new 'address' column by extracting from 'profile' dictionaries
df['address'] = df['profile'].apply(lambda x: x.get('address', None))  # Returns None if no address key

print("Top rows from profile column \n")
print(df['profile'].head())
print("\nTop rows from newly created address column \n")
print(df['address'].head())


Extract Address Field....

Top rows from profile column 

0    {'address': 'Street 54, City 10', 'phone': '99...
1    {'address': 'Street 87, City 10', 'phone': '33...
2    {'address': 'Street 71, City 38', 'phone': '61...
3    {'address': 'Street 24, City 28', 'phone': '59...
4    {'address': 'Street 33, City 49', 'phone': '97...
Name: profile, dtype: object

Top rows from newly created address column 

0    Street 54, City 10
1    Street 87, City 10
2    Street 71, City 38
3    Street 24, City 28
4    Street 33, City 49
Name: address, dtype: object


In [84]:
# Extract Phone Field
print("Extract Phone Field....\n")
# Create new 'phone' column by extracting from 'profile' dictionaries
df['phone'] = df['profile'].apply(lambda x: x.get('phone', None))  # Returns None if no address key

print("Top rows from profile column \n")
print(df['profile'].head())
print("\nTop rows from newly created phone column \n")
print(df['phone'].head())


Extract Phone Field....

Top rows from profile column 

0    {'address': 'Street 54, City 10', 'phone': '99...
1    {'address': 'Street 87, City 10', 'phone': '33...
2    {'address': 'Street 71, City 38', 'phone': '61...
3    {'address': 'Street 24, City 28', 'phone': '59...
4    {'address': 'Street 33, City 49', 'phone': '97...
Name: profile, dtype: object

Top rows from newly created phone column 

0    9988067504
1    3318847816
2    6174139872
3    5900220383
4    9755253027
Name: phone, dtype: object


In [85]:
# Extract Email Field
print("Extract Email Field....\n")
# Create new 'email' column by extracting from 'profile' dictionaries
df['email'] = df['profile'].apply(lambda x: x.get('email', None))  # Returns None if no address key

print("Top rows from profile column \n")
print(df['profile'].head())
print("\nTop rows from newly created email column \n")
print(df['email'].head())

print(f"\n✅ Profile fields extracted:")

Extract Email Field....

Top rows from profile column 

0    {'address': 'Street 54, City 10', 'phone': '99...
1    {'address': 'Street 87, City 10', 'phone': '33...
2    {'address': 'Street 71, City 38', 'phone': '61...
3    {'address': 'Street 24, City 28', 'phone': '59...
4    {'address': 'Street 33, City 49', 'phone': '97...
Name: profile, dtype: object

Top rows from newly created email column 

0    email_951@example.com
1    email_170@example.com
2    email_711@example.com
3    email_354@example.com
4    email_532@example.com
Name: email, dtype: object

✅ Profile fields extracted:


In [86]:
# Now drop the profile column
print("\nColumns before dropping profile:")
print(df.columns.tolist())

# Without inplace=True (df remains unchanged)
cleaned_df = df.drop(columns=['profile'])

# With inplace=True (df is modified directly)
#df.drop(columns=['profile'], inplace=True)

print("\nColumns in new DataFrame after dropping profile:")
# print(df.columns.tolist())
print(cleaned_df.columns.tolist())


Columns before dropping profile:
['id', 'name', 'age', 'salary', 'hire_date', 'profile', 'department', 'bonus', 'address', 'phone', 'email']

Columns in new DataFrame after dropping profile:
['id', 'name', 'age', 'salary', 'hire_date', 'department', 'bonus', 'address', 'phone', 'email']


### Step 4: Save cleaned data into new CSV and upload it to S3

In [87]:
print("\n💾 Saving cleaned data to: 'data/cleaned_data.csv' ...")
cleaned_df.to_csv("data/cleaned_data.csv", index=False)
print("✅ Cleaned data saved to: 'data/cleaned_data.csv'")

print(f"\nUploading dataset to s3 bucket: {bucket}")
s3.meta.client.upload_file('data/cleaned_data.csv', bucket, 'output/cleaned_data.csv')
print(f"Dataset 'mock_data.csv' uploaded to: s3://{bucket}/output/cleaned_data.csv")


💾 Saving cleaned data to: 'data/cleaned_data.csv' ...
✅ Cleaned data saved to: 'data/cleaned_data.csv'

Uploading dataset to s3 bucket: sagemaker-us-east-1-533267207758
Dataset 'mock_data.csv' uploaded to: s3://sagemaker-us-east-1-533267207758/output/cleaned_data.csv


## 4. Data Transformation & Feature Engineering

### Step 1: Load the cleaned dataset into new DataFrame

In [88]:
transform_df = pd.read_csv(f's3://{bucket}/output/cleaned_data.csv')
transform_df.head()

Unnamed: 0,id,name,age,salary,hire_date,department,bonus,address,phone,email
0,1,Name_103,48.0,60000.0,2022-03-04,Marketing,5578.0,"Street 54, City 10",9988068000.0,email_951@example.com
1,2,Name_436,24.0,70000.0,2023-01-30,HR,9661.0,"Street 87, City 10",3318848000.0,email_170@example.com
2,3,Name_861,33.0,60000.0,2022-11-03,Marketing,9186.0,"Street 71, City 38",6174140000.0,email_711@example.com
3,4,Name_271,50.0,50000.0,2024-02-28,Finance,7530.0,"Street 24, City 28",5900220000.0,email_354@example.com
4,5,Name_107,50.0,70000.0,2018-07-11,Marketing,6772.0,"Street 33, City 49",9755253000.0,email_532@example.com


### Step 2: Feature Engineering - Salary Categorization
Create salary categories for easier analysis and modeling.  
This converts continuous salary into ordinal categories.

In [89]:
print("\n🔧 Creating Salary Categories...")
# Define the bins and labels
bins = [0, 50000, 70000, 100000]
labels = ['low', 'medium', 'high']

# Create a new column 'salary_category'
transform_df['salary_category'] = pd.cut(df['salary'], bins=bins, labels=labels, include_lowest=True)

# Print sample data after adding the 'salary_category' column
print("Sample data after adding the 'salary_category' column: \n")
transform_df[['salary', 'salary_category']].head()


🔧 Creating Salary Categories...
Sample data after adding the 'salary_category' column: 



Unnamed: 0,salary,salary_category
0,60000.0,medium
1,70000.0,medium
2,60000.0,medium
3,50000.0,low
4,70000.0,medium


### Step 3: Feature Engineering - Age Groups  
Create age groups for demographic analysis.  
This helps in understanding age-based patterns in the data.  

In [90]:
print("\n🔧 Creating Age Groups...")
# Define age bins and labels
age_bins = [0, 25, 35, 45, 55, float('inf')]
age_labels = ['Young', 'Early Career', 'Mid Career', 'Senior', 'Experienced']

# Create a new column 'salary_category'
transform_df['age_group'] = pd.cut(df['age'], bins=age_bins, labels=age_labels, include_lowest=True)

# Age group distribution
print(f"Age group distribution:")
print(transform_df['age_group'].value_counts())

# Print sample data after adding the 'salary_category' column
print("\nSample data after adding the 'age_group' column: \n")
transform_df[['age', 'age_group']].head()



🔧 Creating Age Groups...
Age group distribution:
age_group
Experienced     18413
Senior          10091
Mid Career       7779
Early Career     7530
Young            6187
Name: count, dtype: int64

Sample data after adding the 'age_group' column: 



Unnamed: 0,age,age_group
0,48.0,Senior
1,24.0,Young
2,33.0,Early Career
3,50.0,Senior
4,50.0,Senior


### Step 4: Remove missing values in bonus (is the target)

In [91]:
# Check for missing values
print("\n❓ Missing Values Analysis before Removing Missing:\n")
print(transform_df.isnull().sum())

# Remove missing rows for bonus
transform_df = transform_df[transform_df['bonus'].notna()]
print("\n❓ Missing Values Analysis after Removing Missing:\n")
transform_df.isnull().sum()



❓ Missing Values Analysis before Removing Missing:

id                    0
name                  0
age                   0
salary                0
hire_date          4931
department            0
bonus              4940
address            5021
phone              5021
email              5021
salary_category       0
age_group             0
dtype: int64

❓ Missing Values Analysis after Removing Missing:



id                    0
name                  0
age                   0
salary                0
hire_date          4444
department            0
bonus                 0
address            4545
phone              4545
email              4545
salary_category       0
age_group             0
dtype: int64

### Step 5: Feature Engineering - One-Hot Encoding on department, age_group & salary_category

In [92]:
transform_df = pd.get_dummies(transform_df, columns=['department', 'age_group', 'salary_category'], prefix=['dept', 'age', 'salary'])
print("Top 5 rows with boolean values")
print(transform_df.head())

bool_cols = transform_df.select_dtypes(include='bool').columns
transform_df[bool_cols] = transform_df[bool_cols].astype(int)

print("\nTop 5 rows with numberic values\n")
print(transform_df.head())

Top 5 rows with boolean values
   id      name   age   salary   hire_date   bonus             address  \
0   1  Name_103  48.0  60000.0  2022-03-04  5578.0  Street 54, City 10   
1   2  Name_436  24.0  70000.0  2023-01-30  9661.0  Street 87, City 10   
2   3  Name_861  33.0  60000.0  2022-11-03  9186.0  Street 71, City 38   
3   4  Name_271  50.0  50000.0  2024-02-28  7530.0  Street 24, City 28   
4   5  Name_107  50.0  70000.0  2018-07-11  6772.0  Street 33, City 49   

          phone                  email  dept_Finance  ...  dept_Marketing  \
0  9.988068e+09  email_951@example.com         False  ...            True   
1  3.318848e+09  email_170@example.com         False  ...           False   
2  6.174140e+09  email_711@example.com         False  ...            True   
3  5.900220e+09  email_354@example.com          True  ...           False   
4  9.755253e+09  email_532@example.com         False  ...            True   

   dept_Unknown  age_Young  age_Early Career  age_Mid Career 

### Step 6: Feature Engineering | Calculate Tenure

In [93]:
print("Convert hire_date to datetime")
transform_df['hire_date'] = pd.to_datetime(transform_df['hire_date'], errors='coerce')
print(transform_df['hire_date'].head())
# print(transform_df.dtypes)

# non_date_rows = transform_df[transform_df['hire_date'].apply(lambda x: isinstance(x, str))]
# print("non date rows:")
# print(non_date_rows)

print("Calculate Tenure in Days....")
transform_df['tenure_days'] = (pd.Timestamp('now') - transform_df['hire_date']).dt.days

print("Calculated Tenure Days")
print(transform_df['tenure_days'])

print("Handle Missing values of tenure_days")
transform_df['tenure_days'] = transform_df['tenure_days'].fillna(transform_df['tenure_days'].median())

print("Tenure Days after handled missing days")
print(transform_df['tenure_days'])

Convert hire_date to datetime
0   2022-03-04
1   2023-01-30
2   2022-11-03
3   2024-02-28
4   2018-07-11
Name: hire_date, dtype: datetime64[ns]
Calculate Tenure in Days....
Calculated Tenure Days
0        1241.0
1         909.0
2         997.0
3         515.0
4        2573.0
          ...  
49994     806.0
49995    3311.0
49996     181.0
49997    2892.0
49998     358.0
Name: tenure_days, Length: 45060, dtype: float64
Handle Missing values of tenure_days
Tenure Days after handled missing days
0        1241.0
1         909.0
2         997.0
3         515.0
4        2573.0
          ...  
49994     806.0
49995    3311.0
49996     181.0
49997    2892.0
49998     358.0
Name: tenure_days, Length: 45060, dtype: float64


### Step 7: Feature Engineering | Removing Irrelevant or non-predictive columns


In [94]:
print("Dropping ID, Address, Phone, Name, Hire Date, and Email.....")
transform_df.drop(columns=['id', 'address', 'phone', 'email', 'name', 'hire_date'], inplace=True)
print("After dropping ID, Address, Phone, Name, Hire Date, and Email, dataset look like")
print(transform_df.head())

Dropping ID, Address, Phone, Name, Hire Date, and Email.....
After dropping ID, Address, Phone, Name, Hire Date, and Email, dataset look like
    age   salary   bonus  dept_Finance  dept_HR  dept_IT  dept_Marketing  \
0  48.0  60000.0  5578.0             0        0        0               1   
1  24.0  70000.0  9661.0             0        1        0               0   
2  33.0  60000.0  9186.0             0        0        0               1   
3  50.0  50000.0  7530.0             1        0        0               0   
4  50.0  70000.0  6772.0             0        0        0               1   

   dept_Unknown  age_Young  age_Early Career  age_Mid Career  age_Senior  \
0             0          0                 0               0           1   
1             0          1                 0               0           0   
2             0          0                 1               0           0   
3             0          0                 0               0           1   
4             0      

### Step 8: Save the transformed DataFrame to a new csv file

In [95]:
print("Saving Transformed data csv to: 'data/transformed_data.csv' ...")
transform_df.to_csv("data/transformed_data.csv", index=False)
print("\nTransformed data csv saved to: 'data/transformed_data.csv'")

s3.meta.client.upload_file('data/transformed_data.csv', bucket, 'output/transformed_data.csv')
print(f"Transformed data 'transformed_data.csv' uploaded to: s3://{bucket}/output/transformed_data.csv")

Saving Transformed data csv to: 'data/transformed_data.csv' ...

Transformed data csv saved to: 'data/transformed_data.csv'
Transformed data 'transformed_data.csv' uploaded to: s3://sagemaker-us-east-1-533267207758/output/transformed_data.csv
