<a href="https://colab.research.google.com/github/1vanl0pez/DataScienceSantiagoIvan/blob/streamLit/DSPRO1_HRAnalytics.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
!pip install streamlit pyngrok


Collecting pyngrok
  Downloading pyngrok-7.2.1-py3-none-any.whl.metadata (8.3 kB)
Downloading pyngrok-7.2.1-py3-none-any.whl (22 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.1


In [16]:
from google.colab import drive
drive.mount('/content/drive')

import pandas as pd
file_path = '/content/drive/MyDrive/general_data.xlsx'
df = pd.read_excel(file_path)

# Save dataframe as a CSV to make it easier to load in the Streamlit script
df.to_csv('/content/general_data.csv', index=False)


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [32]:
%%writefile app.py
import streamlit as st
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

# Cargar datos desde el archivo CSV guardado
file_path = '/content/general_data.csv'
df = pd.read_csv(file_path)

# Paso de limpieza de datos
columns_with_na = df.columns[df.isnull().sum() > 0].tolist()
for col in columns_with_na:
    median_value = df[col].median()
    df.fillna({col: median_value}, inplace=True)
df = pd.get_dummies(df, drop_first=True)

# Función para realizar la validación cruzada con k fold y registrar la importancia de las características
def calculate_feature_importances(X, y):
    kf = KFold(n_splits=5, shuffle=True, random_state=24)
    fold_importances = []
    fold_errors = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index], X.iloc[test_index]
        y_train, y_test = y.iloc[train_index], y.iloc[test_index]
        model = RandomForestRegressor(n_estimators=100, random_state=24)
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        fold_errors.append(mean_squared_error(y_test, y_pred))
        fold_importances.append(model.feature_importances_)
    avg_importances = np.mean(fold_importances, axis=0)
    avg_error = np.mean(fold_errors)
    return avg_importances, avg_error

def calculate_adjustments(model, X, y, step=0.1):
    adjustments = []
    base_prediction = model.predict(X)
    for feature in X.columns:
        X_modified = X.copy()
        X_modified[feature] += step
        increased_prediction = model.predict(X_modified)
        impact_per_unit = (increased_prediction - base_prediction).mean() / step
        correlation = X[feature].corr(y)
        direction = "Increase" if correlation > 0 else "Decrease"
        adjustments.append({
            "Feature": feature,
            "Impact per unit": impact_per_unit,
            "Suggested change": direction
        })
    adjustment_df = pd.DataFrame(adjustments).sort_values(by="Impact per unit", ascending=False)
    return adjustment_df

def suggest_optimized_adjustments(input_data, model, step=0.5, target_increase=0.5, max_satisfaction=4.0, max_iterations=500):
    base_prediction = model.predict(pd.DataFrame(input_data, columns=top_5_features.columns))[0]
    target_prediction = min(base_prediction + target_increase, max_satisfaction)
    adjusted_data = input_data.copy()
    correlations = {feature: data_stayed[feature].corr(data_stayed["JobSatisfaction"]) for feature in top_5_features}

    for iteration in range(max_iterations):
        total_adjustments = []
        adjustments_applied = False
        for idx, feature in enumerate(top_5_features):
            if feature == "MonthlyIncome":
                percent_salary_hike_index = top_5_features.columns.get_loc("PercentSalaryHike")
                if adjusted_data[0, percent_salary_hike_index] > 30 and all(adjusted_data[0, i] <= 0 for i in range(len(top_5_features)) if top_5_features[i] != "MonthlyIncome"):
                    adjusted_data[0, idx] += 100
                    new_prediction = model.predict(pd.DataFrame(adjusted_data, columns=top_5_features))[0]
                    if new_prediction >= target_prediction:
                        return {
                            "Adjusted Inputs": dict(zip(top_5_features, adjusted_data[0])),
                            "New Prediction": new_prediction,
                            "Iterations": iteration + 1
                        }
                    adjustments_applied = True
                    continue
            input_modified = adjusted_data.copy()
            relationship_direction = 1 if correlations[feature] > 0 else -1
            proposed_value = adjusted_data[0, idx] + step * relationship_direction
            if proposed_value <= 0:
                continue
            if feature == "PercentSalaryHike" and proposed_value > 30:
                continue
            input_modified[0, idx] = proposed_value
            new_prediction = model.predict(pd.DataFrame(input_modified, columns=top_5_features.columns))[0]
            impact_per_unit = (new_prediction - base_prediction) / step
            if abs(impact_per_unit) > 0:
                adjustment_step = step * relationship_direction
                total_adjustments.append((idx, adjustment_step, impact_per_unit))
                adjustments_applied = True

        for idx, adjustment_step, _ in total_adjustments:
            adjusted_data[0, idx] += adjustment_step

        new_prediction = model.predict(pd.DataFrame(adjusted_data, columns=top_5_features.columns))[0]
        if new_prediction >= target_prediction:
            return {
                "Adjusted Inputs": dict(zip(top_5_features.columns, adjusted_data[0])),
                "New Prediction": new_prediction,
                "Iterations": iteration + 1
            }
        if not adjustments_applied:
            return {
                "Message": "No further optimization possible while maintaining positive values.",
                "Adjusted Inputs": dict(zip(top_5_features, adjusted_data[0])),
                "Final Prediction": new_prediction
            }
    return {
        "Message": "Max iterations reached without achieving target increase.",
        "Adjusted Inputs": dict(zip(top_5_features.columns, adjusted_data[0])),
        "Final Prediction": new_prediction
    }

# Definir características y pasos de entrenamiento del modelo
data_stayed = df[df['Attrition_Yes'] == 0]
X_stayed = data_stayed.drop(columns=["JobSatisfaction"])
y_stayed = data_stayed["JobSatisfaction"]

top_5_features = X_stayed[["MonthlyIncome", "DistanceFromHome", "PercentSalaryHike",
                           "YearsSinceLastPromotion", "TrainingTimesLastYear"]]
X_important_stayed = data_stayed[top_5_features.columns]

final_model = RandomForestRegressor(n_estimators=100, random_state=24)
final_model.fit(X_important_stayed, y_stayed)

# Interfaz de Streamlit
st.title("Job Satisfaction Prediction App")
st.write("## Enter the following data to determine job satisfaction:")

input_data = []
for feature in top_5_features.columns:
    value = st.number_input(feature, min_value=0.0)
    input_data.append(value)

input_data = np.array([input_data])

if st.button('Predict Satisfaction'):
    satisfaction_prediction = final_model.predict(pd.DataFrame(input_data, columns=top_5_features.columns))[0]

    if satisfaction_prediction <= 2:
        traffic_light = "Red"
    elif 2 < satisfaction_prediction <= 3:
        traffic_light = "Yellow"
    else:
        traffic_light = "Green"

    st.write(f"Predicted Satisfaction: {satisfaction_prediction:.2f}")
    st.write(f"Traffic Light Scale: {traffic_light}")

    if traffic_light != "Green":
        st.write("Optimizing adjustments to improve satisfaction...")
        optimization_result = suggest_optimized_adjustments(input_data, final_model, step=0.5, target_increase=0.5, max_satisfaction=4.0)
        st.write("Optimization Results:")
        for key, value in optimization_result.items():
            st.write(f"{key}: {value}")
    else:
        st.write("Satisfaction is already at green level. No adjustments needed.")

st.write("### Relevant Features Range for Job Satisfaction:")
for feature in top_5_features.columns:
    min_val = data_stayed[feature].min()
    max_val = data_stayed[feature].max()
    mean_val = data_stayed[feature].mean()
    st.write(f"{feature} - Min: {min_val}, Max: {max_val}, Avg: {mean_val:.2f}")


Overwriting app.py


In [33]:
!ngrok authtoken 2pWW0XfbvaktveDe8yiGGwsULY4_2ruivCyVZmVyQYPSEPGrQ


Authtoken saved to configuration file: /root/.config/ngrok/ngrok.yml


In [34]:
!killall ngrok


In [35]:
from pyngrok import ngrok

# Connect to port 8501 and specify the tunnel type
public_url = ngrok.connect(8501)
print(f"Streamlit app URL: {public_url}")


Streamlit app URL: NgrokTunnel: "https://9188-34-169-61-194.ngrok-free.app" -> "http://localhost:8501"


In [20]:
!streamlit run app.py &>/dev/null&

