In [5]:
# Uncomment below if you want to run this file only
#%run main.ipynb
#%run data_cleaning.ipynb
#%run data_visualization.ipynb
#%run feature_engineering.ipynb

### Import Packages

In [None]:
!pip install pandas sagemaker boto3 botocore numpy matplotlib seaborn scikit-learn nbconvert
!pip install imbalanced-learn lime shap

# Import packages for S3 connection
import pandas as pd
import boto3
import json
import botocore
import sagemaker
from io import StringIO

# Import packages for data manipulation
import numpy as np
import matplotlib.pyplot as plt

# Import packages for data visualization
import seaborn as sns


# Import packages for feature engineering
from sklearn.preprocessing import LabelEncoder

# Import packages for data modeling
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import roc_auc_score, roc_curve, auc
from sklearn.metrics import accuracy_score, precision_score, recall_score,\
f1_score, confusion_matrix, ConfusionMatrixDisplay, RocCurveDisplay, PrecisionRecallDisplay

from sklearn.decomposition import PCA

from sklearn.ensemble import RandomForestClassifier

#SVM
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE
from sklearn.metrics import classification_report, confusion_matrix
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC


# This lets us see all of the columns, preventing Jupyter from truncating them.
pd.set_option('display.max_columns', None)

# This module lets us save our models once we fit them.
import pickle

### Data Connection to S3

In [1]:
class S3Utils:
    """Helper around a boto3 S3 client that always targets one bucket.

    The access key id, secret access key and bucket name are loaded either
    from a CSV file or from an AWS Secrets Manager secret, then used to build
    the S3 client stored on ``self.s3_client``.
    """

    def __init__(self, secret_name_or_arn, file_path=None):
        """Load credentials and create the S3 client.

        Args:
            secret_name_or_arn: Name or ARN of the Secrets Manager secret.
                Only used when ``file_path`` is not given.
            file_path: Optional path to a credentials CSV. When truthy it
                takes precedence over ``secret_name_or_arn``.

        Raises:
            ValueError: If neither ``file_path`` nor ``secret_name_or_arn``
                is provided.
        """
        # Fail early with a clear message instead of a boto3 parameter error.
        if not file_path and not secret_name_or_arn:
            raise ValueError(
                "Provide either file_path or secret_name_or_arn to load AWS credentials."
            )
        self.secret_name_or_arn = secret_name_or_arn
        self.file_path = file_path
        if file_path:
            credentials = self.get_aws_credentials_from_file()
        else:
            credentials = self.get_aws_credentials_from_secrets_manager()
        self.aws_access_key_id, self.aws_secret_access_key, self.bucket_name = credentials
        self.s3_client = self.create_s3_client()

    def get_aws_credentials_from_secrets_manager(self):
        """Fetch credentials from the secret named by ``self.secret_name_or_arn``.

        The secret's ``SecretString`` is parsed as JSON and must contain the
        keys ``aws_access_key_id``, ``aws_secret_access_key`` and
        ``bucket_name``.

        Returns:
            Tuple ``(aws_access_key_id, aws_secret_access_key, bucket_name)``.
        """
        client = boto3.client(service_name='secretsmanager')
        get_secret_value_response = client.get_secret_value(SecretId=self.secret_name_or_arn)
        secret_dict = json.loads(get_secret_value_response['SecretString'])
        return secret_dict['aws_access_key_id'], secret_dict['aws_secret_access_key'], secret_dict['bucket_name']

    def get_aws_credentials_from_file(self):
        """Read credentials from the first data row of the CSV at ``self.file_path``.

        The CSV must have the columns ``aws_access_key_id``,
        ``aws_secret_access_key`` and ``bucket_name``.

        Returns:
            Tuple ``(aws_access_key_id, aws_secret_access_key, bucket_name)``.
        """
        credentials_df = pd.read_csv(self.file_path)
        return credentials_df['aws_access_key_id'].iloc[0], credentials_df['aws_secret_access_key'].iloc[0], credentials_df['bucket_name'].iloc[0]

    def create_s3_client(self):
        """Return a boto3 S3 client authenticated with the loaded key pair."""
        return boto3.client(
            's3',
            aws_access_key_id=self.aws_access_key_id,
            aws_secret_access_key=self.aws_secret_access_key
        )

    def check_and_transfer_file(self, source_key, destination_key):
        """Copy ``source_key`` to ``destination_key`` unless the destination exists.

        Both keys live in ``self.bucket_name``. Nothing is copied when the
        destination key is already present.

        Raises:
            botocore.exceptions.ClientError: For any S3 error other than the
                404 that signals a missing destination.
        """
        if self.check_file_exists(destination_key):
            return
        self.s3_client.copy_object(
            Bucket=self.bucket_name,
            # Dict form avoids having to URL-encode keys with special characters.
            CopySource={'Bucket': self.bucket_name, 'Key': source_key},
            Key=destination_key
        )

    def check_file_exists(self, file_key):
        """Return True if ``file_key`` exists in the bucket, False on a 404.

        Raises:
            botocore.exceptions.ClientError: For any error code other than "404".
        """
        try:
            self.s3_client.head_object(Bucket=self.bucket_name, Key=file_key)
            return True
        except botocore.exceptions.ClientError as e:
            if e.response['Error']['Code'] == "404":
                return False
            # Anything else (e.g. permission errors) is a real failure.
            raise

    def read_csv_from_s3(self, file_key):
        """Download ``file_key`` from the bucket and parse it as CSV.

        Returns:
            pandas.DataFrame built from the object's body.
        """
        obj = self.s3_client.get_object(Bucket=self.bucket_name, Key=file_key)
        return pd.read_csv(obj['Body'])

    def write_csv_to_s3(self, file_key, df):
        """Serialize ``df`` as CSV (without the index) and upload it to ``file_key``."""
        csv_buffer = StringIO()
        df.to_csv(csv_buffer, index=False)
        self.s3_client.put_object(
            Bucket=self.bucket_name,
            Key=file_key,
            Body=csv_buffer.getvalue()
        )


In [None]:
# Reference in connecting on file or secrets manager
# Using credentials from a file
#s3_utils_file = S3Utils(secret_name_or_arn=None, file_path="path/to/credentials.csv")

# Using credentials from Secrets Manager
#s3_utils_secrets_manager = S3Utils(secret_name_or_arn="YourSecretNameOrARN")


In [None]:
# Target environment for this run: dev, test, staging or prod.
env = "dev" #  dev, test, staging, prod 

# Key of the original dataset, and the key of its raw copy inside the environment folder.
mainsource = "SourceDataSet/bank_data_train.csv"
envraw = f'{env}/raw/bank_data_train.csv'

# Build the S3 helper (S3Utils; alternatively load it with `%run s3conn.ipynb`).
# Credentials come from AWS Secrets Manager. To load them from a local CSV instead:
#   s3_utils = S3Utils(secret_name_or_arn=None, file_path="C:/churn/bucketcredentials.csv")
s3_utils = S3Utils(secret_name_or_arn="arn:aws:secretsmanager:us-east-2:767397996410:secret:dev/s3/bucket_token-6t6xMP")

# Make sure the raw file is available for this environment: the source file is
# copied to <env>/raw only when that key does not exist yet.
s3_utils.check_and_transfer_file(mainsource, envraw)



In [None]:
# S3 object keys used across the pipeline notebooks; every key is prefixed with `env`.

# Raw input for data cleaning. The file name is kept from Kaggle so the source
# stays identifiable; sub-folders are what distinguish the pipeline stages.
input_file_key_data_cleaning = f'{env}/raw/bank_data_train.csv'

# Stage outputs; each one also serves as the input of a later process.
output_file_key_data_cleaning = f'{env}/processed/bank_data_cleaned.csv'
output_file_key_data_visualization = f'{env}/processed/bank_data_visualization.csv'
output_file_key_data_feature_engineering= f'{env}/final/bank_data_feature_eng.csv'

# Train/test splits for model one: features (X) and labels (Y).
output_file_key_data_X_train, output_file_key_data_X_test = (
    f'{env}/final/model_one/X_train.csv',
    f'{env}/final/model_one/X_test.csv',
)
output_file_key_data_Y_train, output_file_key_data_Y_test = (
    f'{env}/final/model_one/Y_train.csv',
    f'{env}/final/model_one/Y_test.csv',
)

# Currently disabled keys for model one / model two combined datasets:
# output_file_key_data_model_one_train = f'{env}/final/model_one/bank_data_train.csv'
# output_file_key_data_model_one_test = f'{env}/final/model_one/bank_data_test.csv'
# output_file_key_data_model_two_train =   f'{env}/final/model_two/bank_data_train.csv'
# output_file_key_data_model_two_test =   f'{env}/final/model_two/bank_data_test.csv'

# Model three datasets. NOTE(review): the folder is `model_c`, not `model_three` - confirm this is intended.
output_file_key_data_model_three_train, output_file_key_data_model_three_test = (
    f'{env}/final/model_c/bank_data_train.csv',
    f'{env}/final/model_c/bank_data_test.csv',
)

### Execute notebooks accordingly

In [None]:
# %run data_cleaning.ipynb
# %run data_visualization.ipynb
# %run feature_engineering.ipynb
# %run random_forest.ipynb


In [6]:
# Load the feature-engineering dataset from S3 into a DataFrame using S3Utils.
# Stop here with a clear error if the file is missing; otherwise `data` would be
# left undefined and the following cells would fail with an unrelated NameError.
if not s3_utils.check_file_exists(output_file_key_data_feature_engineering):
    raise FileNotFoundError(
        f"Feature engineering file not found in S3: {output_file_key_data_feature_engineering}"
    )
data = s3_utils.read_csv_from_s3(output_file_key_data_feature_engineering)

In [7]:
# Pull out the label column and keep every other column as a feature.
# ('target' is the label column name; change it here if your dataset differs.)
y = data['target']
X = data.drop(columns='target')

# Hold out 20% of the rows as the test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [8]:
# --- Preprocess the training data (every transformer is fitted on the training split only) ---
# Replace missing values with the per-column mean.
imputer = SimpleImputer(strategy='mean')
X_train_imputed = imputer.fit_transform(X_train)

# Standardize the imputed features.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)

# Oversample the training split with SMOTE; the test split is left untouched.
# `n_jobs` is not passed: it is deprecated in recent imbalanced-learn releases.
smote = SMOTE(random_state=42)
X_train_res, y_train_res = smote.fit_resample(X_train_scaled, y_train)

# --- Apply the already-fitted transformers to the test split ---
# Impute missing values in the test set using the same imputer
X_test_imputed = imputer.transform(X_test)

# Scale the imputed test data using the same scaler
X_test_scaled = scaler.transform(X_test_imputed)

# Apply PCA, keeping enough components to retain 95% of the variance.
pca = PCA(n_components=0.95)
X_train_pca = pca.fit_transform(X_train_res)
X_test_pca = pca.transform(X_test_scaled)  # Transform the test set with PCA

# Train the linear SVM. LinearSVC is reached through the imported `svm` module
# and takes no `kernel` argument (it is always linear). The seed matches the
# one used for the split and for SMOTE.
model = svm.LinearSVC(class_weight='balanced', random_state=42)
model.fit(X_train_pca, y_train_res)

# Make predictions on the test set
y_pred = model.predict(X_test_pca)

# Save the actual and predicted labels side by side to a CSV file
predictions_df = pd.DataFrame({'Actual': y_test, 'Predicted': y_pred})
predictions_df.to_csv('svm_improved_predictions.csv', index=False)

# Save the trained model together with the fitted preprocessing objects so the
# same imputer -> scaler -> PCA -> model chain can be reapplied later.
with open('model.pkl', 'wb') as file:
    pickle.dump({'model': model, 'imputer': imputer, 'scaler': scaler, 'pca': pca}, file)
