In [1]:
import pandas as pd
%matplotlib inline
from sklearn import preprocessing
from sklearn.preprocessing import LabelEncoder
from scipy.stats import chi2_contingency
import configparser
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np

In [2]:
def init_config(path="../config.ini"):
    """Load the notebook's INI configuration.

    Args:
        path: Location of the INI file. The default is resolved relative to
            the kernel's working directory.

    Returns:
        A ``configparser.ConfigParser`` populated from ``path``.

    Raises:
        FileNotFoundError: if ``path`` could not be read.
    """
    config = configparser.ConfigParser()
    # ConfigParser.read() silently skips unreadable files and returns the list
    # of files it parsed; without this check the problem only shows up later
    # as a KeyError on the first section lookup.
    if not config.read(path):
        raise FileNotFoundError(f"Could not read configuration file: {path!r}")
    return config

def download_blob(config):
    """Load the raw dataset into a DataFrame.

    The CSV location is read from ``config['gcp']['raw_data']`` and passed
    directly to ``pandas.read_csv``; no storage client is created here.
    """
    raw_data_path = config['gcp']['raw_data']
    return pd.read_csv(raw_data_path)

# Read the configuration, then load the raw CSV it points to.
config = init_config()
df = download_blob(config)

In [3]:
# Normalise every object-typed column: spaces become underscores, then lower-case.
categorical_columns = df.select_dtypes(include=['object']).columns
for column in categorical_columns:
    underscored = df[column].str.replace(' ', '_')
    df[column] = underscored.str.lower()

In [4]:
# Spell out the ampersand left over from the text normalisation above.
df['Department'] = df['Department'].replace({'research_&_development': 'research_and_development'})

In [None]:
# Preview the first rows after the text normalisation.
df.head()

In [6]:
# Labels of the int64 columns (the binning candidates). Only the labels are
# kept: the cells below just iterate over them, so copying the underlying data
# into a second DataFrame is unnecessary. This also mirrors how
# `categorical_columns` is built.
numerical_columns = df.select_dtypes(include=['int64']).columns

In [7]:
def create_and_replace_bins(df, columns, num_bins=5):
    """Replace wide-range numeric columns with labelled quantile bins.

    Args:
        df: Source DataFrame; it is not modified.
        columns: Iterable of column labels to consider.
        num_bins: Number of quantiles requested from ``pd.qcut``. Fewer bins
            result when quantile edges coincide.

    Returns:
        A copy of ``df`` in which every listed column whose ``max - min``
        exceeds 5 is replaced by a categorical column. Labels have the form
        ``'<lo><=x<<hi>'`` with integer edges; the last upper edge is
        ``floor(max) + 1`` so the maximum falls inside the half-open interval.
        Columns with a range of 5 or less are left unchanged.
    """
    binned_df = df.copy()
    for col in columns:
        col_range = df[col].max() - df[col].min()
        if col_range <= 5:
            continue
        # Quantile edges; identical quantiles are already collapsed by qcut.
        _, bins = pd.qcut(df[col], q=num_bins, duplicates='drop', retbins=True)
        # Flooring can make distinct fractional edges (e.g. 1.0 and 1.4) equal,
        # and pd.cut rejects repeated edges, so de-duplicate after flooring.
        bins = np.unique(np.floor(bins).astype(int))
        # Intervals are left-closed/right-open; widen the last one to include the max.
        bins[-1] += 1
        labels = [f'{lower}<=x<{upper}' for lower, upper in zip(bins[:-1], bins[1:])]
        binned_df[col] = pd.cut(df[col], bins=bins, labels=labels, right=False, include_lowest=True)
    return binned_df

# Apply dynamic binning and replace columns with integer bins where range is > 5.
# `df` is rebound to the binned copy returned by the helper.
df = create_and_replace_bins(df, numerical_columns, num_bins=5)

In [None]:
def get_binned_value_counts(df, columns):
    """Return ``{column: {value: count}}`` for each of the given columns."""
    return {col: df[col].value_counts().to_dict() for col in columns}

# Get value counts for each binned column (the returned dict is the cell output).
get_binned_value_counts(df, numerical_columns)


In [9]:
# Binary target: 'no' becomes 0, every other value becomes 1.
df['Attrition'] = (df['Attrition'] != 'no').astype('int64')


In [None]:
# Show the columns that are still object-typed at this point.
df.select_dtypes(include=['object'])

In [None]:
# Integer-code Gender and MaritalStatus, then test each against Attrition.
gender_codes = {'male': 1, 'female': 0}
marital_codes = {'single': 0, 'married': 1, 'divorced': 2}
df['Gender'] = df['Gender'].map(gender_codes)
df['MaritalStatus'] = df['MaritalStatus'].map(marital_codes)

# Contingency tables (feature value x target class counts)
contingency_table_gender = pd.crosstab(df['Gender'], df['Attrition'])
contingency_table_marital = pd.crosstab(df['MaritalStatus'], df['Attrition'])

for heading, table in (
    ("Contingency Table - Gender vs Target:", contingency_table_gender),
    ("\nContingency Table - Marital Status vs Target:", contingency_table_marital),
):
    print(heading)
    print(table)

# Chi-square tests of independence on each table
chi2_gender, p_gender, dof_gender, ex_gender = chi2_contingency(contingency_table_gender)
chi2_marital, p_marital, dof_marital, ex_marital = chi2_contingency(contingency_table_marital)

print(f"\nChi-Square Test between gender and target: chi2 = {chi2_gender}, p-value = {p_gender}")
print(f"Chi-Square Test between marital_status and target: chi2 = {chi2_marital}, p-value = {p_marital}")

# Interpretation at the significance level `alpha`
alpha = 0.05
print("\nInterpretation:")
for label, p_value in (("gender", p_gender), ("marital status", p_marital)):
    if p_value < alpha:
        print(f"There is a significant association between {label} and target (p < 0.05).")
    else:
        print(f"There is no significant association between {label} and target (p >= 0.05).")

In [12]:
# Columns removed before modelling.
columns_to_drop = ['Gender', 'StandardHours', 'Over18', 'EmployeeCount', 'EmployeeNumber', 'BusinessTravel']
df = df.drop(columns=columns_to_drop)

In [None]:
# Preview the frame after the column drop.
df.head()

In [14]:
# Business Travel
# 'BusinessTravel' is dropped in an earlier cell, so nothing is encoded for it here.

# Department
# One-hot encoded (float indicator columns) rather than integer-coded,
# presumably because departments have no natural order.
department_dummies = pd.get_dummies(df['Department'], prefix='Department', dtype=float)
df = pd.concat([df.drop(columns=['Department']), department_dummies], axis=1)

# Education Field and JobRole
# Each field is expanded into one 0/1 column per class, named "<field>_<class>".
for field in ['JobRole', 'EducationField']:
    lb = preprocessing.LabelBinarizer()
    new_data = lb.fit_transform(df[field])
    # Build the indicator frame on df's own index: concat(axis=1) aligns rows by
    # index label, so a default RangeIndex would misalign (and introduce NaNs)
    # whenever df's index is not 0..n-1.
    binary_df = pd.DataFrame(
        new_data,
        columns=[f"{field}_{cls}" for cls in lb.classes_],
        index=df.index,
    )
    df = pd.concat([df.drop(columns=[field]), binary_df], axis=1)

# OverTime
# Values were lower-cased earlier, hence the 'yes'/'no' keys.
df['OverTime'] = df['OverTime'].map({'yes': 1, 'no': 0})

In [None]:
# Preview the frame after encoding.
df.head()

In [None]:
# Column dtypes after encoding.
df.dtypes

In [17]:
from imblearn.over_sampling import SMOTENC

X_train = df.drop(columns=['Attrition'])
y_train = df['Attrition']

# Target class sizes are derived from the data instead of a hard-coded majority
# count, so the cell keeps working if the input rows change.
MINORITY_TO_MAJORITY_RATIO = 0.3
class_counts = y_train.value_counts()
majority_count = int(class_counts.get(0, 0))
minority_count = int(class_counts.get(1, 0))
# Over-sampling cannot shrink a class: never request fewer minority rows than
# are already present.
desired_minority_count = max(minority_count, int(MINORITY_TO_MAJORITY_RATIO * majority_count))
sampling_strategy = {0: majority_count, 1: desired_minority_count}

# NOTE(review): with categorical_features="auto" only pandas category-dtype
# columns (here, the binned ones) appear to be treated as categorical; the
# integer-coded and one-hot columns would then be interpolated as continuous
# values. Confirm against the imbalanced-learn docs that this is intended.
# NOTE(review): the whole frame is resampled before any train/validation/test
# split happens downstream, so synthetic rows can end up in evaluation splits.
smotenc = SMOTENC(categorical_features="auto", sampling_strategy=sampling_strategy, random_state=42)
X_train_resampled, y_train_resampled = smotenc.fit_resample(X_train, y_train)

# Combine resampled features with target
df = pd.concat([pd.DataFrame(X_train_resampled, columns=X_train.columns), pd.DataFrame(y_train_resampled, columns=['Attrition'])], axis=1)

In [None]:
# Column labels of the resampled frame.
df.columns

In [None]:
# Non-null count per column.
df.count()

In [None]:
# Class balance of the target after resampling.
df.Attrition.value_counts()

In [None]:
# Missing values per column.
df.isnull().sum()

In [22]:
# Write the pre-processed frame to Cloud Storage (without the index column).
# NOTE(review): this URI is hard-coded here and again where the Vertex AI
# dataset is created; the two must be kept in sync by hand.
df.to_csv("gs://pa-poc-mlspec-3-cs/pre-processed4.csv", index=False)

In [23]:
# NOTE: this redefines init_config from the earlier configuration cell.
def init_config(path="../config.ini"):
    """Load the notebook's INI configuration.

    Args:
        path: Location of the INI file. The default is resolved relative to
            the kernel's working directory.

    Returns:
        A ``configparser.ConfigParser`` populated from ``path``.

    Raises:
        FileNotFoundError: if ``path`` could not be read.
    """
    config = configparser.ConfigParser()
    # ConfigParser.read() silently skips unreadable files and returns the list
    # of files it parsed; fail here rather than with a later KeyError.
    if not config.read(path):
        raise FileNotFoundError(f"Could not read configuration file: {path!r}")
    return config

# Re-read the configuration for the Vertex AI cells below.
config = init_config()

In [None]:
# First create a Vertex AI tabular dataset from the exported CSV.

# NOTE(review): `os` is not referenced anywhere in this cell.
import os

from google.cloud import aiplatform

# Project and region come from the [gcp] section of the config file.
PROJECT_ID = config['gcp']['project']
REGION = config['gcp']['region']

aiplatform.init(project=PROJECT_ID, location=REGION)

# NOTE(review): gcs_source repeats the hard-coded URI used by the CSV export cell.
dataset = aiplatform.TabularDataset.create(
    display_name="HR Analytics3",
    gcs_source="gs://pa-poc-mlspec-3-cs/pre-processed4.csv",
)

# Target column name, used by the training cell.
label_column = "Attrition"

print(dataset.resource_name)
     

In [None]:
# Define an AutoML tabular classification job optimising area under the
# precision-recall curve.
job = aiplatform.AutoMLTabularTrainingJob(
  display_name="train-automl-hr-analytics1",
  optimization_prediction_type="classification",
  optimization_objective="maximize-au-prc",
)

# Train on the dataset created above with a 60/20/20 train/validation/test split.
# budget_milli_node_hours=1000 is a budget of one node hour.
model = job.run(
    dataset=dataset,
    target_column=label_column,
    training_fraction_split=0.6,
    validation_fraction_split=0.2,
    test_fraction_split=0.2,
    budget_milli_node_hours=1000,
    model_display_name="test1",
    disable_early_stopping=False,
)

In [None]:
# Deploy the trained model to an endpoint on n1-standard-4 machines.
endpoint = model.deploy(deployed_model_display_name='test1',
    machine_type='n1-standard-4'
)

In [None]:
# Send a prediction request to a Vertex AI endpoint over REST.
# NOTE(review): this cell looks like a leftover: it uses a hard-coded project
# and endpoint id, targets us-central1 (the later request cell targets
# europe-west2 with ids from the config), and sends a positional list of 8
# floats, whereas the later request cell sends a dict keyed by feature name.
# Confirm whether it is still needed.
import requests
import subprocess
from pprint import pprint

# Get the access token using gcloud
access_token = (
    subprocess.check_output("gcloud auth print-access-token", shell=True)
    .decode("utf-8")
    .strip()
)


# Define the project ID, endpoint ID, and input data file
project_id = "121050757542"
endpoint_id = "1928034321235443712"

input_data = {"instances": [[1.0, 2.0, 17.0, 2.0, 0.0, 1.0, 5.0, 8.0]]}
# Define the endpoint URL
url = f"https://us-central1-aiplatform.googleapis.com/v1/projects/{project_id}/locations/us-central1/endpoints/{endpoint_id}:predict"

# Define the headers
headers = {
    "Authorization": f"Bearer {access_token}",
    "Content-Type": "application/json",
}

# Send the POST request
# NOTE(review): no timeout is passed, so this call can block indefinitely.
response = requests.post(url, headers=headers, json=input_data)

# Print the response
pprint(response.json())

TEST WITH CALLABLE API

In [None]:
# Send one feature-keyed instance to the endpoint named in the [prediction]
# section of the config file and print the JSON response.
import requests
import subprocess
from pprint import pprint

# Get the access token using gcloud
access_token = (
    subprocess.check_output("gcloud auth print-access-token", shell=True)
    .decode("utf-8")
    .strip()
)


# Define the project ID, endpoint ID, and input data file
project_id = config['prediction']['project_id']
endpoint_id = config['prediction']['endpoint_id']

# NOTE(review): the bin labels below use the form "29 <= x <= 34", but
# create_and_replace_bins in this notebook emits labels like "29<=x<34"
# (no spaces, exclusive upper bound). Confirm the labels match the categories
# the deployed model was trained on.
# NOTE(review): every JobRole_* and EducationField_* indicator is "0", which is
# not a row the one-hot encoding above can produce — verify this is intended.
input_data = {
    "instances": [
        {
            "Age": "29 <= x <= 34",
            "DailyRate": "102 <= x <= 391",
            "DistanceFromHome": "2 <= x <= 5",
            "Education": "3",
            "EnvironmentSatisfaction": "3",
            "HourlyRate": "45 <= x <= 59",
            "JobInvolvement": "3",
            "JobLevel": "1",
            "JobSatisfaction": "3",
            "MaritalStatus": "1",
            "MonthlyIncome": "1009 <= x <= 2695",
            "MonthlyRate": "6887 <= x <= 11773",
            "NumCompaniesWorked": "1 <= x <= 3",
            "OverTime": "0",
            "PercentSalaryHike": "13 <= x <= 15",
            "PerformanceRating": "3",
            "RelationshipSatisfaction": "3",
            "StockOptionLevel": "0",
            "TotalWorkingYears": "10 <= x <= 17",
            "TrainingTimesLastYear": "2 <= x <= 3",
            "WorkLifeBalance": "3",
            "YearsAtCompany": "2 <= x <= 5",
            "YearsInCurrentRole": "2 <= x <= 4",
            "YearsSinceLastPromotion": "0 <= x <= 1",
            "YearsWithCurrManager": "2 <= x <= 4",
            "Department_human_resources": "0.0",
            "Department_research_and_development": "1.0",
            "Department_sales": "0.0",
            "JobRole_healthcare_representative": "0",
            "JobRole_human_resources": "0",
            "JobRole_laboratory_technician": "0",
            "JobRole_manager": "0",
            "JobRole_manufacturing_director": "0",
            "JobRole_research_director": "0",
            "JobRole_research_scientist": "0",
            "JobRole_sales_executive": "0",
            "JobRole_sales_representative": "0",
            "EducationField_human_resources": "0",
            "EducationField_life_sciences": "0",
            "EducationField_marketing": "0",
            "EducationField_medical": "0",
            "EducationField_other": "0",
            "EducationField_technical_degree": "0",
        }
    ]
}
# NOTE(review): the region is hard-coded to europe-west2 while the project and
# endpoint ids come from the config.
url = f"https://europe-west2-aiplatform.googleapis.com/v1/projects/{project_id}/locations/europe-west2/endpoints/{endpoint_id}:predict"

# Define the headers
headers = {
    "Authorization": f"Bearer {access_token}",
    "Content-Type": "application/json",
}

# Send the POST request
# NOTE(review): no timeout is passed, so this call can block indefinitely.
response = requests.post(url, headers=headers, json=input_data)

# Print the response
pprint(response.json())


In [None]:
from google.cloud import storage
from PIL import Image
import io

# Initialize the Google Cloud Storage client.
# Module-level: process_images_from_gcs below reads this global.
storage_client = storage.Client()

# Function to resize an image
def resize_image(image_bytes, target_size=(224, 224)):
    """Resize an encoded image and return it re-encoded in its original format.

    Args:
        image_bytes: Encoded image data readable by ``PIL.Image.open``.
        target_size: ``(width, height)`` passed to ``Image.resize``.

    Returns:
        The resized image as bytes, saved with the format reported by the
        source image.
    """
    with Image.open(io.BytesIO(image_bytes)) as source_image:
        source_format = source_image.format
        resized = source_image.resize(target_size)
    buffer = io.BytesIO()
    resized.save(buffer, format=source_format)
    return buffer.getvalue()

# Function to process images from GCS
def process_images_from_gcs(bucket_name, source_folder, target_folder, target_size=(224, 224)):
    """Resize every JPEG under a bucket prefix and upload the result.

    Args:
        bucket_name: Name of the bucket to read from and write to.
        source_folder: Object-name prefix passed to ``list_blobs``.
        target_folder: Prefix under which resized images are written.
        target_size: ``(width, height)`` forwarded to ``resize_image``.

    Only the last path component of each source object is kept, so objects
    with the same file name in different sub-folders overwrite one another in
    ``target_folder``. Uses the module-level ``storage_client``.
    """
    bucket = storage_client.bucket(bucket_name)
    # Avoid "folder//name.jpg" when the caller passes a trailing slash.
    destination_prefix = target_folder.rstrip('/')

    for blob in bucket.list_blobs(prefix=source_folder):
        # Accept .jpg/.jpeg in any letter case; uploads are tagged image/jpeg below.
        if not blob.name.lower().endswith((".jpg", ".jpeg")):
            continue

        image_bytes = blob.download_as_bytes()
        resized_image_bytes = resize_image(image_bytes, target_size)

        target_path = f"{destination_prefix}/{blob.name.split('/')[-1]}"
        bucket.blob(target_path).upload_from_string(resized_image_bytes, content_type="image/jpeg")

        print(f"Resized and uploaded: {target_path}")

# Example usage
# NOTE(review): the folder values are placeholders, yet this call runs against
# the real bucket whenever the cell is executed.
bucket_name = "pa-poc-mlspec-3-cs"
source_folder = "path/to/source/images"
target_folder = "path/to/resized/images"
process_images_from_gcs(bucket_name, source_folder, target_folder)
