<a href="https://colab.research.google.com/github/1vanl0pez/DataScienceSantiagoIvan/blob/streamLit/DSPRO1_HRAnalytics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install streamlit pyngrok
!pip install millify

Collecting streamlit
  Downloading streamlit-1.41.1-py2.py3-none-any.whl.metadata (8.5 kB)
Collecting pyngrok
  Downloading pyngrok-7.2.2-py3-none-any.whl.metadata (8.4 kB)
Collecting watchdog<7,>=2.1.5 (from streamlit)
  Downloading watchdog-6.0.0-py3-none-manylinux2014_x86_64.whl.metadata (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0m
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.41.1-py2.py3-none-any.whl (9.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.1/9.1 MB[0m [31m39.0 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyngrok-7.2.2-py3-none-any.whl (22 kB)
Downloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m45.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading watchdog-6.0.0-py3-none-manylinux2014_x86_64

In [12]:
%%writefile app.py
# Import libraries
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from millify import millify

# Read dataset
#file_path = '/content/general_data.csv'
#df = pd.read_csv(file_path)
# Data cleaning
def clean_data(df):
  columns_with_na = df.columns[df.isnull().sum() > 0].tolist()
  for col in columns_with_na:
      median_value = df[col].median()
      df.fillna({col: median_value}, inplace=True)
  df = pd.get_dummies(df, drop_first=True)
  return df
# Function to optimize input variables
def suggest_optimized_adjustments(input_data, model, target_increase=0.5, max_satisfaction=4.0, max_iterations=500, percent_adjustment=0.02):
    base_prediction = model.predict(pd.DataFrame(input_data, columns=top_5_features.columns))[0]
    target_prediction = min(base_prediction + target_increase, max_satisfaction)
    adjusted_data = input_data.copy()
    correlations = {feature: data_stayed[feature].corr(data_stayed["JobSatisfaction"]) for feature in top_5_features}
    for iteration in range(max_iterations):
        total_adjustments = []
        adjustments_applied = False
        for idx, feature in enumerate(top_5_features):
            if feature == "MonthlyIncome":
                percent_salary_hike_index = top_5_features.columns.get_loc("PercentSalaryHike")
                if adjusted_data[0, percent_salary_hike_index] > 30 and all(adjusted_data[0, i] <= 0 for i in range(len(top_5_features)) if top_5_features[i] != "MonthlyIncome"):
                    adjusted_data[0, idx] += 100
                    new_prediction = model.predict(pd.DataFrame(adjusted_data, columns=top_5_features))[0]
                    if new_prediction >= target_prediction:
                        return {
                            "Adjusted Inputs": dict(zip(top_5_features, adjusted_data[0])),
                            "New Prediction": new_prediction,
                            "Iterations": iteration + 1
                        }
                    adjustments_applied = True
                    continue
            input_modified = adjusted_data.copy()
            relationship_direction = 1 if correlations[feature] > 0 else -1
            proposed_value = adjusted_data[0, idx] * (1 + percent_adjustment * relationship_direction)
            if proposed_value <= 0:
                continue
            if feature == "PercentSalaryHike" and proposed_value > 30:
                continue
            input_modified[0, idx] = proposed_value
            new_prediction = model.predict(pd.DataFrame(input_modified, columns=top_5_features.columns))[0]
            impact_per_unit = (new_prediction - base_prediction) / (adjusted_data[0, idx] * percent_adjustment)
            if abs(impact_per_unit) > 0:
                adjustment_step = proposed_value - adjusted_data[0, idx]
                total_adjustments.append((idx, adjustment_step, impact_per_unit))
                adjustments_applied = True

        for idx, adjustment_step, _ in total_adjustments:
            adjusted_data[0, idx] += adjustment_step

        new_prediction = model.predict(pd.DataFrame(adjusted_data, columns=top_5_features.columns))[0]
        if new_prediction >= target_prediction:
            return {
                "Adjusted Inputs": dict(zip(top_5_features.columns, adjusted_data[0])),
                "New Prediction": new_prediction,
                "Iterations": iteration + 1
            }
        if not adjustments_applied:
            return {
                "Message": "No further optimization possible while maintaining positive values.",
                "Adjusted Inputs": dict(zip(top_5_features, adjusted_data[0])),
                "Final Prediction": new_prediction
            }
    return {
        "Message": "Max iterations reached without achieving target increase.",
        "Adjusted Inputs": dict(zip(top_5_features.columns, adjusted_data[0])),
        "Final Prediction": new_prediction
    }

# USER INTERFACE USING STREAMLIT
# Title and logo using columns to improve distribution
col1, col2 = st.columns([1, 3])
with col1:
    st.image("logo.png", width=400)
with col2:
    st.title("Job Satisfaction Prediction App")

st.write("#### To start with the analysis, upload your data")
cols = st.columns(3)
with cols[0]:
    dataset = st.file_uploader("Choose a file", type=["xlsx"], accept_multiple_files=False)
if dataset is not None:
    df = pd.read_excel(dataset)
    df.to_csv('/content/general_data.csv', index=False)
    df = clean_data(df)
    # Train the model with employees who stayed in the company
    data_stayed = df[df['Attrition_Yes'] == 0]
    X_stayed = data_stayed.drop(columns=["JobSatisfaction"])
    y_stayed = data_stayed["JobSatisfaction"]
    # Select the features to work with
    top_5_features = X_stayed[["MonthlyIncome", "DistanceFromHome", "PercentSalaryHike",
                              "YearsSinceLastPromotion", "TrainingTimesLastYear"]]
    X_important_stayed = data_stayed[top_5_features.columns]
    # Data model: random forest regressor
    final_model = RandomForestRegressor(n_estimators=100, max_depth=9, random_state=24)
    final_model.fit(X_important_stayed, y_stayed)
    # Subheader for feature ranges
    # Horizontal separator
    st.write("-" * 500)
    st.subheader("Features range:")
    cols = st.columns(3)  # Divide the UI into 3 columns
    colors = ["#0671e3", "#09bf04", "#e32406"]  # Progress bar colors

    # Display progress bars for the first three features
    for i, feature in enumerate(top_5_features.columns[:3]):
        min_val = data_stayed[feature].min()
        max_val = data_stayed[feature].max()
        mean_val = data_stayed[feature].mean()
        # Normalize mean for progress bar
        normalized_mean = (mean_val - min_val) / (max_val - min_val) * 100
        color = colors[i]
        # HTML for a styled progress bar
        progress_bar_html = f"""
        <div style="text-align: center; margin-bottom: 20px;">
            <div style="font-size: 18px; font-weight: bold; color: #a8b0b2; margin-bottom: px;">{feature}</div>
            <div style="width: 100%; background-color: #b3c8cb; border-radius: 25px; position: relative;">
                <div style="height: 20px; width: {normalized_mean}%; background-color: {color}; border-radius: 25px;"></div>
                <span style="position: absolute; left: 0%; top: 25px; font-size: 18px; font-weight: bold; color: #b3c8cb;">{min_val}</span>
                <span style="position: absolute; left: {normalized_mean}%; top: 25px; font-size: 18px; font-weight: bold; color: #b3c8cb; transform: translateX(-50%);">{mean_val:.2f}</span>
                <span style="position: absolute; right: 0%; top: 25px; font-size: 18px; font-weight: bold; color: #b3c8cb;">{max_val}</span>
            </div>
        </div>"""
        # Render the progress bar in the column
        cols[i].markdown(progress_bar_html, unsafe_allow_html=True)

    # Display progress bars for the last two features
    cols2 = st.columns(4)
    colors2 = ["#e306a0", "#8b00ff"]  # Progress bar colors
    for i, feature in enumerate(top_5_features.columns[3:]):
        min_val = data_stayed[feature].min()
        max_val = data_stayed[feature].max()
        mean_val = data_stayed[feature].mean()
        normalized_mean = (mean_val - min_val) / (max_val - min_val) * 100
        color = colors2[i]
        progress_bar_html = f"""
        <br>
        <div style="text-align: center; margin-bottom: 20px;">
            <div style="font-size: 18px; font-weight: bold; color: #a8b0b2; margin-bottom: 5px;">{feature}</div>
            <div style="width: 100%; background-color: #b3c8cb; border-radius: 25px; position: relative;">
                <div style="height: 20px; width: {normalized_mean}%; background-color: {color}; border-radius: 25px;"></div>
                <span style="position: absolute; left: 0%; top: 25px; font-size: 18px; font-weight: bold; color: #b3c8cb;">{min_val}</span>
                <span style="position: absolute; left: {normalized_mean}%; top: 25px; font-size: 18px; font-weight: bold; color: #b3c8cb; transform: translateX(-50%);">{mean_val:.2f}</span>
                <span style="position: absolute; right: 0%; top: 25px; font-size: 18px; font-weight: bold; color: #b3c8cb;">{max_val}</span>
            </div>
        </div>"""
        # Render the progress bar in columns
        cols2[i + 1].markdown(progress_bar_html, unsafe_allow_html=True)
    # Horizontal separator
    st.write("-" * 500)

    # Input fields for each feature
    st.subheader("Enter the following data to compute job satisfaction:")
    input_data = []
    # Use columns to improve distribution
    col_inputs = st.columns(5)
    for i, feature in enumerate(top_5_features.columns):
        with col_inputs[i]:
            value = st.number_input(feature, min_value=0.0)
        input_data.append(value)
    # Convert input to numpy array
    input_data = np.array([input_data])

    # Button to predict satisfaction
    if st.button('Predict satisfaction'):
        # Initial values for variables
        st.session_state["satisfaction_prediction"] = None
        st.session_state["optimization_result"] = None
        st.session_state["prediction_made"] = False

        satisfaction_prediction = final_model.predict(pd.DataFrame(input_data, columns=top_5_features.columns))[0]
        st.session_state["satisfaction_prediction"] = satisfaction_prediction
        st.session_state["prediction_made"] = True

    # Display prediction results when the prediction has been made
    # .get (key, default value) -> returns the existing value, so if
    # it's True, the code will be executed
    if st.session_state.get("prediction_made", False):
        satisfaction_prediction = st.session_state["satisfaction_prediction"]
        # Select image to display based on the result
        if satisfaction_prediction <= 2:
            traffic_image = "red.png"
        elif 2 < satisfaction_prediction <= 3:
            traffic_image = "yellow.png"
        else:
            traffic_image = "green.png"
        # Create columns to improve visual arrangement
        cols = st.columns(4)
        # Display predicted satisfaction using an HTML markdown
        with cols[1]:
            st.markdown(f"""
                <div style="display: flex; align-items: center; justify-content: space-between; height: 135px; padding: 40px; border: 1px solid #ddd; border-radius: 10px; margin: 10px 0;">
                    <div style="text-align: center;">
                        <div style="font-size: 20px; font-weight: bold; color: #a8b0b2;">Predicted satisfaction</div>
                        <div style="font-size: 40px; font-weight: bold; color: #0671e3;">{satisfaction_prediction:.2f}</div>
                    </div>
                </div>
            """, unsafe_allow_html=True)
        # Display the traffic light image
        with cols[2]:
            st.image(traffic_image, width=75)

        # Show optimization button if satisfaction is not green
        if satisfaction_prediction > 3:
            st.write("Satisfaction is already at green level. No adjustments needed.")
        else:
            # Optimize input features
            if st.button('Optimize results'):
                optimization_result = suggest_optimized_adjustments(
                    input_data, final_model, step=0.5, target_increase=0.5, max_satisfaction=4.0
                )
                st.session_state["optimization_result"] = optimization_result

    # Display optimization results if there are optimized values
    if st.session_state.get("optimization_result", None):
        optimization_result = st.session_state["optimization_result"]
        # In case features were optimized compute delta with original ones
        if "Adjusted Inputs" in optimization_result:
            original_values = dict(zip(top_5_features.columns, input_data[0]))
            adjusted_values = optimization_result["Adjusted Inputs"]
            final_prediction = optimization_result.get("New Prediction", st.session_state["satisfaction_prediction"])
            iterations = optimization_result.get("Iterations", 0)
            prediction_delta = final_prediction - st.session_state["satisfaction_prediction"]

            #Display results in columns
            st.subheader("Optimization results:")
            columns = st.columns(len(top_5_features.columns))  # Create columns for metrics
            for idx, (feature, adjusted_value) in enumerate(adjusted_values.items()):
                original_value = original_values[feature]
                delta = adjusted_value - original_value
                with columns[idx]:
                    # Millify MonthlyIncome
                    if feature == "MonthlyIncome":
                        income = millify(adjusted_value, precision=5)
                        st.metric(label=feature, value=income, delta=f"{delta:.2f}")
                    else:
                        st.metric(label=feature, value=f"{adjusted_value:.2f}", delta=f"{delta:.2f}")

            # Display final prediction and iterations
            if final_prediction <= 2:
                traffic_image = "red.png"
            elif 2 < final_prediction <= 3:
                traffic_image = "yellow.png"
            else:
                traffic_image = "green.png"
            colus = st.columns(4)
            with colus[1]:
                st.markdown(f"""
                    <div style="display: flex; align-items: center; justify-content: space-between; height: 135px; padding: 60px; border: 1px solid #ddd; border-radius: 10px; margin: 10px 0;">
                        <div style="text-align: center;">
                            <div style="font-size: 20px; font-weight: bold; color: #a8b0b2;">Final satisfaction</div>
                            <div style="font-size: 40px; font-weight: bold; color: #0671e3;">{final_prediction:.2f}</div>
                        </div>
                    </div>
                """, unsafe_allow_html=True)
            with colus[2]:
                st.image(traffic_image, width=75)
            st.write(f"Iterations taken for optimization: **{iterations}**")

Overwriting app.py


In [13]:
# Token authentification
!ngrok authtoken 2pWW0XfbvaktveDe8yiGGwsULY4_2ruivCyVZmVyQYPSEPGrQ

Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [14]:
# Stop all running processes
!killall ngrok



In [15]:
# Create public URL
from pyngrok import ngrok
# Connect to port 8501 and specify the tunnel type
public_url = ngrok.connect(8501)
print(f"Streamlit app URL: {public_url}")

Streamlit app URL: NgrokTunnel: "https://0a82-34-106-220-71.ngrok-free.app" -> "http://localhost:8501"


In [16]:
# Run app
!streamlit run app.py &>/dev/null&