Import necessary libraries

In [149]:
#importing relevaent libraries
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error,mean_absolute_error
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
import sagemaker
import boto3

Load the employee data

In [150]:
#loading dataset
data = pd.read_csv('employee_data.csv')
data

Unnamed: 0,ID,Gender,Experience (Years),Position,Salary
0,1,F,4,DevOps Engineer,109976
1,2,M,6,DevOps Engineer,120088
2,3,M,17,Web Developer,181301
3,4,M,7,Systems Administrator,77530
4,5,F,13,Systems Administrator,152397
...,...,...,...,...,...
395,396,F,19,Cloud Solutions Architect,236045
396,397,F,20,Web Developer,182770
397,398,F,9,Network Administrator,85550
398,399,M,18,Database Administrator (DBA),129996


In [151]:
# Rename the column
data.rename(columns={'Experience (Years)': 'Experience'}, inplace=True)

In [152]:
#changing datatype of gender from object to string for further manipulation
data["Gender"]=data["Gender"].astype('string')
data["Position"]=data["Position"].astype('string')
data.dtypes


ID                     int64
Gender        string[python]
Experience             int64
Position      string[python]
Salary                 int64
dtype: object

Split the data into training and test data sets (70% split)

In [153]:
# Apply one-hot encoding
data_encoded = pd.get_dummies(data,drop_first=True)

# Display the encoded DataFrame
data_encoded = data_encoded.astype(int)

print(data_encoded.shape)
print(data_encoded.columns)

(400, 14)
Index(['ID', 'Experience', 'Salary', 'Gender_M',
       'Position_Database Administrator (DBA)', 'Position_DevOps Engineer',
       'Position_IT Manager', 'Position_IT Security Analyst',
       'Position_IT Support Specialist', 'Position_Network Administrator',
       'Position_Software Engineer', 'Position_Systems Administrator',
       'Position_Systems Analyst', 'Position_Web Developer'],
      dtype='object')


In [154]:
X=data_encoded.drop(columns="Salary")
X_columns=X.columns
Y=data_encoded["Salary"]
# Combine the feature column names with the target column name
columns_names = list(X.columns) + ["Salary"]

print(columns_names)



['ID', 'Experience', 'Gender_M', 'Position_Database Administrator (DBA)', 'Position_DevOps Engineer', 'Position_IT Manager', 'Position_IT Security Analyst', 'Position_IT Support Specialist', 'Position_Network Administrator', 'Position_Software Engineer', 'Position_Systems Administrator', 'Position_Systems Analyst', 'Position_Web Developer', 'Salary']


In [155]:
X

Unnamed: 0,ID,Experience,Gender_M,Position_Database Administrator (DBA),Position_DevOps Engineer,Position_IT Manager,Position_IT Security Analyst,Position_IT Support Specialist,Position_Network Administrator,Position_Software Engineer,Position_Systems Administrator,Position_Systems Analyst,Position_Web Developer
0,1,4,0,0,1,0,0,0,0,0,0,0,0
1,2,6,1,0,1,0,0,0,0,0,0,0,0
2,3,17,1,0,0,0,0,0,0,0,0,0,1
3,4,7,1,0,0,0,0,0,0,0,1,0,0
4,5,13,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,396,19,0,0,0,0,0,0,0,0,0,0,0
396,397,20,0,0,0,0,0,0,0,0,0,0,1
397,398,9,0,0,0,0,0,0,1,0,0,0,0
398,399,18,1,1,0,0,0,0,0,0,0,0,0


In [156]:
#Splitting dataset in training and testing data

X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size=0.3, random_state=42)

In [157]:
#Shape of testing and trainig data

X_train.shape,Y_train.shape,X_test.shape,Y_test.shape

((280, 13), (280,), (120, 13), (120,))

In [158]:
trainX = pd.DataFrame(X_train)
trainX['Salary'] = Y_train

testX = pd.DataFrame(X_test)
testX['Salary'] = Y_test

In [159]:
testX

Unnamed: 0,ID,Experience,Gender_M,Position_Database Administrator (DBA),Position_DevOps Engineer,Position_IT Manager,Position_IT Security Analyst,Position_IT Support Specialist,Position_Network Administrator,Position_Software Engineer,Position_Systems Administrator,Position_Systems Analyst,Position_Web Developer,Salary
209,210,11,1,1,0,0,0,0,0,0,0,0,0,93165
280,281,5,1,0,0,0,0,0,1,0,0,0,0,91842
33,34,3,1,0,0,0,1,0,0,0,0,0,0,97240
210,211,4,0,0,0,0,0,0,0,0,0,0,1,68000
93,94,16,1,0,0,0,1,0,0,0,0,0,0,128620
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,61,7,1,1,0,0,0,0,0,0,0,0,0,105182
79,80,7,0,0,0,0,0,0,0,0,0,0,1,77058
285,286,11,1,0,1,0,0,0,0,0,0,0,0,180261
305,306,6,0,0,0,0,0,0,0,1,0,0,0,136900


In [160]:
#creating s3 bucket
boto3_sm = boto3.client("sagemaker", region_name="ap-south-1")
session = sagemaker.Session()
region = session.boto_session.region_name
bucket = "mlops-salary-predictor-app"
print("Using Bucket:", bucket)

Using Bucket: mlops-salary-predictor-app


In [161]:
# Convert the NumPy array to a DataFrame
X_train_df=pd.DataFrame(trainX,columns=columns_names)
X_train_df
# Convert the NumPy array to a DataFrame
X_test_df=pd.DataFrame(testX,columns=columns_names)
X_test_df
Y_train_df=pd.DataFrame(Y_train)
Y_train_df
Y_test_df=pd.DataFrame(Y_train)
Y_test_df

Unnamed: 0,Salary
157,69668
109,123370
17,188681
347,242819
24,71211
...,...
71,110321
106,135066
270,115263
348,115769


In [162]:
# Save the split datasets into new CSV files
X_test_df.to_csv('test_v1.csv', index=False)
X_train_df.to_csv('train_v1.csv', index=False)


In [163]:
#Standardising features for similar scale

scaler=StandardScaler()

X_train=scaler.fit_transform(X_train)

X_test=scaler.transform(X_test)


print(X_train)
print("\n")
print(f"shape of trainig data is {X_train.shape}")
print("\n")
print('*'*80)
print("\n")
print(X_test)
print("\n")
print(f"shape of testing data is {X_test.shape}")

[[-0.41085418 -0.32005212  0.98581488 ... -0.31994094 -0.29915575
  -0.34641016]
 [-0.84023834  0.65401954  0.98581488 ... -0.31994094 -0.29915575
  -0.34641016]
 [-1.66322463  0.97871009 -1.01438923 ... -0.31994094 -0.29915575
  -0.34641016]
 ...
 [ 0.59998768 -0.48239739 -1.01438923 ... -0.31994094 -0.29915575
  -0.34641016]
 [ 1.29773693 -0.15770684  0.98581488 ...  3.12557687 -0.29915575
  -0.34641016]
 [-0.90285686 -1.1317785  -1.01438923 ... -0.31994094 -0.29915575
  -0.34641016]]


shape of trainig data is (280, 13)


********************************************************************************


[[ 0.05431198  0.16698371  0.98581488 ... -0.31994094 -0.29915575
  -0.34641016]
 [ 0.68944271 -0.80708794  0.98581488 ... -0.31994094 -0.29915575
  -0.34641016]
 [-1.52009658 -1.1317785   0.98581488 ... -0.31994094 -0.29915575
  -0.34641016]
 ...
 [ 0.73417023  0.16698371  0.98581488 ... -0.31994094 -0.29915575
  -0.34641016]
 [ 0.91308029 -0.64474267 -1.01438923 ... -0.31994094 -0.

In [164]:
#Multiple linear regression
model=LinearRegression()

#Fitting the model
model.fit(X_train,Y_train)

#Prediciting bike rental duration
y_pred=model.predict(X_test)

In [165]:
X_test_df

Unnamed: 0,ID,Experience,Gender_M,Position_Database Administrator (DBA),Position_DevOps Engineer,Position_IT Manager,Position_IT Security Analyst,Position_IT Support Specialist,Position_Network Administrator,Position_Software Engineer,Position_Systems Administrator,Position_Systems Analyst,Position_Web Developer,Salary
209,210,11,1,1,0,0,0,0,0,0,0,0,0,93165
280,281,5,1,0,0,0,0,0,1,0,0,0,0,91842
33,34,3,1,0,0,0,1,0,0,0,0,0,0,97240
210,211,4,0,0,0,0,0,0,0,0,0,0,1,68000
93,94,16,1,0,0,0,1,0,0,0,0,0,0,128620
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
60,61,7,1,1,0,0,0,0,0,0,0,0,0,105182
79,80,7,0,0,0,0,0,0,0,0,0,0,1,77058
285,286,11,1,0,1,0,0,0,0,0,0,0,0,180261
305,306,6,0,0,0,0,0,0,0,1,0,0,0,136900


In [166]:
MAE=mean_absolute_error(Y_test, y_pred)
print(f'Mean Absolute Error of model is {round(MAE,2)}')

Mean Absolute Error of model is 21599.67


In [167]:
X

Unnamed: 0,ID,Experience,Gender_M,Position_Database Administrator (DBA),Position_DevOps Engineer,Position_IT Manager,Position_IT Security Analyst,Position_IT Support Specialist,Position_Network Administrator,Position_Software Engineer,Position_Systems Administrator,Position_Systems Analyst,Position_Web Developer
0,1,4,0,0,1,0,0,0,0,0,0,0,0
1,2,6,1,0,1,0,0,0,0,0,0,0,0
2,3,17,1,0,0,0,0,0,0,0,0,0,1
3,4,7,1,0,0,0,0,0,0,0,1,0,0
4,5,13,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
395,396,19,0,0,0,0,0,0,0,0,0,0,0
396,397,20,0,0,0,0,0,0,0,0,0,0,1
397,398,9,0,0,0,0,0,0,1,0,0,0,0
398,399,18,1,1,0,0,0,0,0,0,0,0,0


In [168]:
import numpy as np
new_data=X.sample(5)
new_data_scaled = scaler.transform(new_data)

# Make predictions using the model
predictions = model.predict(new_data_scaled)


#reversing transformations
original_data = scaler.inverse_transform(new_data_scaled)
original_ID=original_data[:,:1].astype(int)


# Convert array to DataFrame with a column name
df = pd.DataFrame({
    'ID': original_ID.flatten(),  # Flatten to match the (5,) shape
    'Predicted_Salary': predictions
})


# Print DataFrame
print(df)

    ID  Predicted_Salary
0  105     154644.082667
1  349     106827.112735
2   82      99187.417124
3  222      97923.324471
4   23      99720.141885


In [169]:
import joblib

# Save the model to a file
model_filename = 'model.joblib'
joblib.dump(model, model_filename)

print(f"Model saved as {model_filename}")


Model saved as model.joblib


In [170]:


# Specify your S3 bucket and model file
model_file_path = 'model.joblib'
s3_model_path = 'data/model.joblib'  # The path where you want to store the model in S3

# Initialize the S3 client
s3 = boto3.client('s3')

# Upload the model to S3
s3.upload_file(model_file_path, bucket, s3_model_path)

print(f"Model uploaded to s3://{bucket}/{s3_model_path}")



Model uploaded to s3://mlops-salary-predictor-app/data/model.joblib


In [171]:
# Sagemaker takes training data from S3 bucket, so uploading data to S3 bucket
train_path = session.upload_data(path="train_v1.csv", bucket=bucket)
test_path = session.upload_data(path="test_v1.csv", bucket=bucket)


In [172]:
test_path

's3://mlops-salary-predictor-app/data/test_v1.csv'

In [173]:
import os
os.environ['SM_MODEL_DIR'] = '/home/arshad/mlops-zoomcamp-project-cohort-2024/'
os.environ['SM_CHANNEL_TRAIN'] = '/home/arshad/mlops-zoomcamp-project-cohort-2024/'
os.environ['SM_CHANNEL_TEST'] = '/home/arshad/mlops-zoomcamp-project-cohort-2024/'


In [174]:
import argparse
import os
import sklearn
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error
import joblib
import pandas as pd
import logging


# Set up logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Log the environment variables
logger.info(f"SM_MODEL_DIR: {os.environ.get('SM_MODEL_DIR')}")
logger.info(f"SM_CHANNEL_TRAIN: {os.environ.get('SM_CHANNEL_TRAIN')}")
logger.info(f"SM_CHANNEL_TEST: {os.environ.get('SM_CHANNEL_TEST')}")

# Check if files exist
train_file = os.path.join(os.environ['SM_CHANNEL_TRAIN'], 'train_v1.csv')
test_file = os.path.join(os.environ['SM_CHANNEL_TEST'], 'test_v1.csv')

if not os.path.exists(train_file):
    logger.error(f"Training file not found at {train_file}")
    exit(1)

if not os.path.exists(test_file):
    logger.error(f"Test file not found at {test_file}")
    exit(1)

logger.info("Files found, proceeding with training.")

def model_fn(model_dir):
    clf = joblib.load(os.path.join(model_dir, "model.joblib"))
    return clf

if __name__ == '__main__':
    

    print("[INFO] Extracting arguments")
    parser = argparse.ArgumentParser()

    # input data and model directories
    parser.add_argument('--model-dir', type=str, default=os.environ['SM_MODEL_DIR'])
    parser.add_argument('--train', type=str, default=os.environ['SM_CHANNEL_TRAIN'])
    parser.add_argument('--test', type=str, default=os.environ['SM_CHANNEL_TEST'])
    parser.add_argument('--train_file', type=str, default='train_v1.csv')
    parser.add_argument('--test_file', type=str, default='test_v1.csv')

    args, _ = parser.parse_known_args()

    print("SKLearn Version: ", sklearn.__version__)
    print("Joblib Version: ", joblib.__version__)

    print("[INFO] Reading data")
    train_df = pd.read_csv(os.path.join(args.train, args.train_file))
    test_df = pd.read_csv(os.path.join(args.test, args.test_file))

  
    features = list(train_df.columns[:-1])
    label = train_df.columns[-1]

    print("Building training and testing datasets")
    X_train = train_df[features]
    X_test = test_df[features]
    y_train = train_df[label]
    y_test = test_df[label]

    print("Training Linear Regression Model...")
    model = LinearRegression()
    model.fit(X_train, y_train)


    model_path = os.path.join(args.model_dir, "model.joblib")
    joblib.dump(model, model_path)
    print("Model persisted at " + model_path)
    print()

    y_pred_test = model.predict(X_test)
    test_mae = mean_absolute_error(y_test, y_pred_test)

    print(f"Test Mean Absolute Error: {test_mae}")

INFO:__main__:SM_MODEL_DIR: /home/arshad/mlops-zoomcamp-project-cohort-2024/
INFO:__main__:SM_CHANNEL_TRAIN: /home/arshad/mlops-zoomcamp-project-cohort-2024/
INFO:__main__:SM_CHANNEL_TEST: /home/arshad/mlops-zoomcamp-project-cohort-2024/
INFO:__main__:Files found, proceeding with training.


[INFO] Extracting arguments
SKLearn Version:  1.5.2
Joblib Version:  1.4.2
[INFO] Reading data
Building training and testing datasets
Training Linear Regression Model...
Model persisted at /home/arshad/mlops-zoomcamp-project-cohort-2024/model.joblib

Test Mean Absolute Error: 21599.66943435516


In [175]:
from sagemaker.sklearn.estimator import SKLearn

FRAMEWORK_VERSION = "0.23-1"

sklearn_estimator = SKLearn(
    entry_point = "training-script.py",
    role="arn:aws:iam::802313481629:role/sagemaker-execution-role",
    instance_count=1,
    instance_type="ml.m5.large",
    framework_version=FRAMEWORK_VERSION,
    base_job_name="RF-custom-sklearn",
    use_spot_instances = True,
    max_wait = 7200,
    max_run = 3600
)



In [176]:
test_path

's3://mlops-salary-predictor-app/data/test_v1.csv'

In [177]:
sklearn_estimator.fit({"train": train_path, 
                        "test": test_path}, 
                       wait=True)



INFO:sagemaker:Creating training-job with name: RF-custom-sklearn-2024-09-15-14-16-01-693


2024-09-15 14:16:06 Starting - Starting the training job...
2024-09-15 14:16:20 Starting - Preparing the instances for training...
2024-09-15 14:17:09 Downloading - Downloading the training image.....2024-09-15 14:17:53,615 sagemaker-containers INFO     Imported framework sagemaker_sklearn_container.training
2024-09-15 14:17:53,618 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-09-15 14:17:53,654 sagemaker_sklearn_container.training INFO     Invoking user training script.
2024-09-15 14:17:53,804 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-09-15 14:17:53,816 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-09-15 14:17:53,830 sagemaker-training-toolkit INFO     No GPUs detected (normal if no gpus installed)
2024-09-15 14:17:53,839 sagemaker-training-toolkit INFO     Invoking user script
Training Env:
{
    "additional_framework_parameters": {},
    "channel_input_di

In [178]:
sklearn_estimator.latest_training_job.wait(logs="None")
artifact = boto3_sm.describe_training_job(
    TrainingJobName=sklearn_estimator.latest_training_job.name
)["ModelArtifacts"]["S3ModelArtifacts"]

print("Model artifact present at: ", artifact)


2024-09-15 14:18:13 Starting - Preparing the instances for training
2024-09-15 14:18:13 Downloading - Downloading the training image
2024-09-15 14:18:13 Training - Training image download completed. Training in progress.
2024-09-15 14:18:13 Uploading - Uploading generated training model
2024-09-15 14:18:13 Completed - Training job completed
Model artifact present at:  s3://sagemaker-ap-south-1-802313481629/RF-custom-sklearn-2024-09-15-14-16-01-693/output/model.tar.gz


In [179]:
artifact

's3://sagemaker-ap-south-1-802313481629/RF-custom-sklearn-2024-09-15-14-16-01-693/output/model.tar.gz'

In [180]:
# Making a copy of the built model for deployment
from sagemaker.sklearn.model import SKLearnModel
from time import gmtime, strftime

model_name = "Custom-sklearn-model" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
model = SKLearnModel(
    name=model_name,
    model_data=artifact,
    role="arn:aws:iam::802313481629:role/sagemaker-execution-role",
    entry_point="invoke-script.py",
    framework_version=FRAMEWORK_VERSION
)

In [181]:
model

<sagemaker.sklearn.model.SKLearnModel at 0x7f4b617f0f10>

In [182]:
# Endpoint Deployment
endpoint_name = "Custom-sklearn-model" + strftime("%Y-%m-%d-%H-%M-%S", gmtime())
print("EndpointName={}".format(endpoint_name))

predictor = model.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.2xlarge",
    endpoint_name=endpoint_name
)

EndpointName=Custom-sklearn-model2024-09-15-14-18-31


INFO:sagemaker:Creating model with name: Custom-sklearn-model2024-09-15-14-18-31
INFO:sagemaker:Creating endpoint-config with name Custom-sklearn-model2024-09-15-14-18-31
INFO:sagemaker:Creating endpoint with name Custom-sklearn-model2024-09-15-14-18-31


-----!

In [183]:
endpoint_name

'Custom-sklearn-model2024-09-15-14-18-31'

In [184]:
%store endpoint_name

Stored 'endpoint_name' (str)


  db[ 'autorestore/' + arg ] = obj


In [185]:
predictor

<sagemaker.sklearn.model.SKLearnPredictor at 0x7f4b61804910>

In [186]:
import pandas as pd
from sagemaker.sklearn.model import SKLearnPredictor
from sagemaker import Session
import numpy as np


# create a predictor 
sagemaker_session = Session()
predictor = SKLearnPredictor(endpoint_name=endpoint_name, sagemaker_session=sagemaker_session)



def predict_salary(csvfile):
    data = pd.read_csv(csvfile)

    # Rename the column
    data.rename(columns={'Experience (Years)': 'Experience'}, inplace=True)

    data["Gender"]=data["Gender"].astype('string')
    data["Position"]=data["Position"].astype('string')

    # Apply one-hot encoding
    data_encoded = pd.get_dummies(data,drop_first=True)

    # Display the encoded DataFrame
    data_encoded = data_encoded.astype(int)
    
    #removing target column
    X=data_encoded.drop(columns="Salary")

    #transforming dataset into desired format
    new_data=X
    new_data_scaled = scaler.transform(new_data)

 

    # Make predictions using the model
    predictions =predictor.predict(new_data_scaled)


    #reversing transformations
    original_data = scaler.inverse_transform(new_data_scaled)
    original_ID=original_data[:,:1].astype(int)


    # Convert array to DataFrame with a column name
    df = pd.DataFrame({
    'ID': original_ID.flatten(),  # Flatten to match the (5,) shape
    'Predicted_Salary': predictions
    })


    # Print DataFrame
    return df

In [187]:
predict_salary("./employee_data.csv")

Unnamed: 0,ID,Predicted_Salary
0,1,129517.135534
1,2,140551.354184
2,3,148572.633508
3,4,100786.610896
4,5,126356.682999
...,...,...
395,396,202164.221938
396,397,156860.835993
397,398,99010.934798
398,399,163879.286341


In [188]:
# Deleting the endpoint to avoid the costs
# boto3_sm.delete_endpoint(EndpointName=endpoint_name)

