# PyCaret AutoML


In [1]:
from pycaret.classification import *
import pandas as pd

# Load the pre-processed train/test splits.
# The column listing printed further down in this notebook shows that both
# files already expose exactly 'processed_text' and 'Emotion', so the old
# (commented-out) column selection / 'sentiment' -> 'Emotion' rename is not
# needed and has been removed.
train_df = pd.read_csv('../data/processed_train.csv')
test_df = pd.read_csv('../data/processed_test.csv')

# Replace missing text with an empty string.
# Assign the result back instead of calling fillna(inplace=True) on a column
# selection: the in-place form operates on an intermediate object, is
# deprecated in pandas 2.x, and does not modify the frame under Copy-on-Write.
train_df['processed_text'] = train_df['processed_text'].fillna('')
test_df['processed_text'] = test_df['processed_text'].fillna('')

In [2]:
# Guard: both frames must carry the same column names in the same order
assert train_df.columns.tolist() == test_df.columns.tolist(), "Columns do not match!"


In [3]:
# Show the column index of each frame so the schema match is visible
column_report = (
    ("Training Dataset Columns:", train_df.columns),
    ("Test Dataset Columns:", test_df.columns),
)
for heading, columns in column_report:
    print(heading, columns)

Training Dataset Columns: Index(['processed_text', 'Emotion'], dtype='object')
Test Dataset Columns: Index(['processed_text', 'Emotion'], dtype='object')


In [4]:
# Configure the PyCaret classification experiment: 'Emotion' is the target,
# 'processed_text' is declared as a text feature, and the session id is fixed.
clf = setup(
    data=train_df,
    target='Emotion',
    text_features=['processed_text'],
    session_id=123,
)


Unnamed: 0,Description,Value
0,Session id,123
1,Target,Emotion
2,Target type,Multiclass
3,Target mapping,"anger: 0, fear: 1, joy: 2, love: 3, sadness: 4, surprise: 5"
4,Original data shape,"(21459, 2)"
5,Transformed data shape,"(21459, 14110)"
6,Transformed train set shape,"(15021, 14110)"
7,Transformed test set shape,"(6438, 14110)"
8,Text features,1
9,Preprocess,True


In [5]:
# Compare models
# Runs PyCaret's model comparison on the experiment configured by setup() and
# renders the leaderboard shown below this cell. The returned object is kept as
# `best_model` for the following cells (presumably the top-ranked estimator,
# Extra Trees in the recorded output; confirm against the PyCaret docs).
best_model = compare_models()


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.8914,0.9859,0.8914,0.8908,0.8904,0.8573,0.8575,21.824
svm,SVM - Linear Kernel,0.8906,0.0,0.8906,0.8903,0.8891,0.8558,0.8563,5.079
ridge,Ridge Classifier,0.8864,0.0,0.8864,0.8861,0.8851,0.8503,0.8508,22.437
rf,Random Forest Classifier,0.8777,0.9865,0.8777,0.8773,0.8766,0.8392,0.8395,9.988
dt,Decision Tree Classifier,0.8634,0.9139,0.8634,0.8644,0.8636,0.8218,0.8219,11.166
lr,Logistic Regression,0.8549,0.0,0.8549,0.8591,0.8492,0.8061,0.809,15.59
lightgbm,Light Gradient Boosting Machine,0.8444,0.9788,0.8444,0.8446,0.8434,0.7954,0.796,3.533
gbc,Gradient Boosting Classifier,0.821,0.0,0.821,0.844,0.8216,0.7616,0.7701,149.381
knn,K Neighbors Classifier,0.7804,0.9316,0.7804,0.7818,0.7757,0.7087,0.7102,5.997
lda,Linear Discriminant Analysis,0.4802,0.0,0.4802,0.4823,0.4808,0.414,0.4142,361.962


In [6]:
# Train the best model
# NOTE: this rebinds `best_model` to whatever create_model() returns; the
# recorded output below lists per-fold metrics for folds 0-9.
best_model = create_model(best_model)

# Evaluate the best model
# In the recorded run this renders an interactive plot-type selector widget,
# so the output depends on a live kernel with widget support.
evaluate_model(best_model)


Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,0.8809,0.9841,0.8809,0.8804,0.8803,0.8439,0.844
1,0.8915,0.9828,0.8915,0.8909,0.8906,0.8574,0.8577
2,0.8988,0.9866,0.8988,0.8987,0.8978,0.8669,0.8673
3,0.9035,0.9894,0.9035,0.9031,0.9029,0.8733,0.8735
4,0.8888,0.9841,0.8888,0.888,0.8879,0.8539,0.8541
5,0.8881,0.9885,0.8881,0.8872,0.8865,0.8527,0.8531
6,0.8915,0.9869,0.8915,0.8917,0.8908,0.8575,0.8577
7,0.8921,0.9837,0.8921,0.8914,0.8915,0.8585,0.8586
8,0.8975,0.9883,0.8975,0.8967,0.8965,0.8653,0.8655
9,0.8808,0.9845,0.8808,0.8799,0.8796,0.8434,0.8437


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Pipeline Plot', 'pipelin…

In [7]:
# Select the test columns in the same order as the training frame
test_df = test_df[train_df.columns]

# Score the text feature only: the target column is removed before prediction
test_features = test_df.drop(columns=['Emotion'])
predictions = predict_model(best_model, data=test_features)
print(predictions.head())


                                      processed_text prediction_label  \
0  tiffanylue know listenin bad habit earlier sta...          sadness   
1             layin n bed headache ughhhhwaitin call            anger   
2                      funeral ceremonygloomy friday            anger   
3                              want hang friend soon            anger   
4  dannycastillo want trade someone houston ticke...            anger   

   prediction_score  
0              0.56  
1              0.42  
2              0.44  
3              0.54  
4              0.31  


# SageMaker AutoML

In [None]:
import boto3
from botocore.exceptions import NoCredentialsError, PartialCredentialsError

# Create a Boto3 session that resolves credentials through the default
# provider chain (environment variables, the shared credentials/config files,
# SSO, or an attached IAM role).
#
# SECURITY: an access key and secret key used to be hardcoded here. Secrets
# must never live in a notebook; that key pair has to be treated as leaked and
# revoked. It was also a temporary (ASIA-prefixed) key passed without its
# session token, so it could not authenticate on its own anyway.
boto3_session = boto3.Session(region_name="eu-west-1")

# Initialize the S3 client
s3 = boto3_session.client('s3')

# Verify that the resolved credentials are usable by asking STS who we are
try:
    sts_client = boto3_session.client('sts')
    response = sts_client.get_caller_identity()
    print("Caller Identity:", response)
except (NoCredentialsError, PartialCredentialsError) as e:
    print("Credentials not found or incomplete:", e)
except Exception as e:
    # Any other failure (expired token, network, access denied) is reported
    # rather than raised so the notebook keeps running
    print("Error:", e)

# Upload a file to S3 ('your-s3-bucket-name' is a placeholder to replace)
try:
    s3.upload_file('../data/processed_train.csv', 'your-s3-bucket-name', 'data/processed_train.csv')
    print("File uploaded successfully")
except Exception as e:
    print("Error uploading file:", e)


In [None]:
import sagemaker
from sagemaker import get_execution_role
import boto3

# SageMaker session built from the default AWS configuration
sagemaker_session = sagemaker.Session()

# Execution role ARN, given as a fixed literal (get_execution_role is not called)
role =  "arn:aws:iam::943240599753:role/service-role/AmazonSageMaker-ExecutionRole-20240312T113030"
print(role)

# S3 bucket/prefix placeholders and the object URIs derived from them
bucket, prefix = 'your-s3-bucket-name', 'your-data-prefix'
train_file = f's3://{bucket}/{prefix}/processed_train.csv'
test_file = f's3://{bucket}/{prefix}/processed_test.csv'


In [None]:
import pandas as pd

# Read both processed splits from disk (train first, then test)
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ('../data/processed_train.csv', '../data/processed_test.csv')
)

# Push the same two files to S3 under the configured bucket/prefix and keep
# the values returned by upload_data for the job configuration
train_s3_path, test_s3_path = (
    sagemaker_session.upload_data(path=local_path, bucket=bucket, key_prefix=prefix)
    for local_path in ('../data/processed_train.csv', '../data/processed_test.csv')
)


In [None]:
sm = boto3.client('sagemaker')

# Define Autopilot job name
autopilot_job_name = 'emotion-detection-autopilot-job'

# Define input data locations.
# An AutoML channel is not a training-job channel: it has no 'ChannelName',
# 'InputMode' or 'S3DataDistributionType' keys, and it must name the target
# column. 'Emotion' is the label column used throughout this notebook.
input_data_config = [
    {
        'DataSource': {
            'S3DataSource': {
                'S3DataType': 'S3Prefix',
                'S3Uri': train_s3_path
            }
        },
        'TargetAttributeName': 'Emotion',
        'ContentType': 'text/csv;header=present',
        'CompressionType': 'None',
        'ChannelType': 'training'
    }
]

# Define output data location
output_data_config = {
    'S3OutputPath': f's3://{bucket}/{prefix}/autopilot-output'
}

# Start Autopilot job.
# The candidate/runtime limits are not top-level arguments of
# create_auto_ml_job; they belong under AutoMLJobConfig.CompletionCriteria.
# Passing them at the top level fails client-side parameter validation.
sm.create_auto_ml_job(
    AutoMLJobName=autopilot_job_name,
    InputDataConfig=input_data_config,
    OutputDataConfig=output_data_config,
    ProblemType='MulticlassClassification',
    AutoMLJobObjective={'MetricName': 'Accuracy'},
    AutoMLJobConfig={
        'CompletionCriteria': {
            'MaxCandidates': 5,
            'MaxRuntimePerTrainingJobInSeconds': 3600,
            'MaxAutoMLJobRuntimeInSeconds': 36000
        }
    },
    RoleArn=role
)


In [None]:
import time

# Look up the current state of an Autopilot job
def get_job_status(job_name):
    """Return the ``AutoMLJobStatus`` field reported for ``job_name``.

    Queries ``describe_auto_ml_job`` on the notebook-level SageMaker client ``sm``.
    """
    return sm.describe_auto_ml_job(AutoMLJobName=job_name)['AutoMLJobStatus']

# Wait for the job to reach a terminal state, polling every 5 minutes
while True:
    status = get_job_status(autopilot_job_name)
    print(f'Job Status: {status}')
    if status in ['Completed', 'Failed', 'Stopped']:
        break
    time.sleep(300)

# List all candidate models.
# A Failed/Stopped job may return no candidates, and candidates that did not
# finish carry no final objective metric, so keep only the scored ones.
# Otherwise max() raises ValueError on an empty list or KeyError on the metric.
candidates = sm.list_candidates_for_auto_ml_job(AutoMLJobName=autopilot_job_name)
scored_candidates = [
    candidate for candidate in candidates.get('Candidates', [])
    if 'FinalAutoMLJobObjectiveMetric' in candidate
]

# Print the best model (highest objective value; the job optimises Accuracy)
if scored_candidates:
    best_candidate = max(scored_candidates, key=lambda x: x['FinalAutoMLJobObjectiveMetric']['Value'])
    print("Best Candidate:")
    print(best_candidate)
else:
    best_candidate = None
    print(f"No scored candidates available (job status: {status})")
