In [None]:
!pip install -q streamlit
!npm install -q localtunne

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.3/44.3 kB[0m [31m2.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m9.9/9.9 MB[0m [31m95.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.9/6.9 MB[0m [31m101.8 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m79.1/79.1 kB[0m [31m6.4 MB/s[0m eta [36m0:00:00[0m
[?25h[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K[1mnpm[22m [31merror[39m [94mcode[39m E404
[1G[0K⠋[1G[0K[1mnpm[22m [31merror[39m [94m404[39m Not Found - GET https://registry.npmjs.org/localtunne - Not found
[1G[0K⠋[1G[0K[1mnpm[22m [31merror[39m [94m404[39m
[1G[0K⠋[1G[0K[1mnpm[22m [31merror[39m [94m404[39m  'localtunne@*' is not in this registry.
[1G[0K⠋[1G[0K[1mnpm[22m [31merror[39m [94m404[39m
[1G[0K⠋[1G[0K[1mnpm[2

In [None]:
%%writefile app.py
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import streamlit as st
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import (accuracy_score, precision_score, recall_score, f1_score,
                             roc_auc_score, confusion_matrix, roc_curve)
import joblib

# ------------------- Data Loading -------------------
@st.cache_data
def load_data():
    """Read the employee CSV and return the frame used by the rest of the app.

    The columns 'EmployeeCount', 'Over18', 'StandardHours' and
    'EmployeeNumber' are removed, and the 'Attrition' column is replaced by
    'AttritionFlag', mapped as 'Yes' -> 1 and 'No' -> 0 (any other value
    becomes NaN, as with ``Series.map``).

    Returns:
        pandas.DataFrame: the remaining columns plus 'AttritionFlag' as the
        last column.
    """
    unused_columns = ['EmployeeCount', 'Over18', 'StandardHours', 'EmployeeNumber']
    raw = pd.read_csv("Employee-Attrition - Employee-Attrition.csv")
    trimmed = raw.drop(columns=unused_columns)
    attrition_flag = trimmed['Attrition'].map({'Yes': 1, 'No': 0})
    return trimmed.assign(AttritionFlag=attrition_flag).drop(columns='Attrition')


df = load_data()

# ------------------- Task Selection -------------------
# Settings for each task, keyed by the label shown in the sidebar:
# (target column, feature columns, model family).
task_settings = {
    "Attrition Prediction": (
        "AttritionFlag",
        ["Age", "Department", "MonthlyIncome", "JobSatisfaction",
         "YearsAtCompany", "MaritalStatus", "OverTime"],
        "classification",
    ),
    "Promotion Likelihood Prediction": (
        "YearsSinceLastPromotion",
        ["JobLevel", "TotalWorkingYears", "YearsInCurrentRole",
         "PerformanceRating", "Education"],
        "regression",
    ),
}

st.title("Employee Insights & Prediction Dashboard")
task = st.sidebar.selectbox("Select Prediction Task", list(task_settings))

# ------------------- Task-specific Feature Selection -------------------
target, features, model_type = task_settings[task]
# The section header text is the same string as the selected task label.
st.header(task)

# ------------------- EDA -------------------
st.subheader("Exploratory Data Analysis (EDA)")

# Histograms (with a KDE overlay) for the numeric features of the selected task.
st.markdown("### Histograms")
numeric_cols = df[features].select_dtypes(include=['int64', 'float64']).columns
for col in numeric_cols:
    fig, ax = plt.subplots()
    sns.histplot(data=df, x=col, kde=True, ax=ax)
    ax.set_title(f"Distribution of {col}")
    st.pyplot(fig)
    # Streamlit re-executes this script on every interaction; figures created
    # through pyplot stay registered until closed, so release each one.
    plt.close(fig)

# Count plots for the categorical (object-dtype) features of the selected task.
st.markdown("### Categorical Distributions")
cat_cols = df[features].select_dtypes(include='object').columns
for col in cat_cols:
    fig, ax = plt.subplots()
    sns.countplot(data=df, x=col, ax=ax)
    ax.set_title(f"Counts by {col}")
    # Rotate the tick labels on this specific axes rather than on pyplot's
    # implicit "current" axes.
    plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
    st.pyplot(fig)
    plt.close(fig)

# Correlation heatmap over every numeric column of the frame (not only the
# features selected for the current task).
st.markdown("### Correlation Heatmap")
numeric_df = df.select_dtypes(include=np.number)
corr = numeric_df.corr()
fig, ax = plt.subplots(figsize=(10, 8))
sns.heatmap(corr, annot=True, fmt=".2f", cmap="coolwarm", ax=ax)
ax.set_title("Correlation Heatmap")
st.pyplot(fig)
plt.close(fig)

# ------------------- Preprocessing -------------------
# Keep only the task's feature columns and target, then one-hot encode the
# categorical columns (drop_first drops one dummy level per category).
df_model = df[features + [target]].copy()
df_model = pd.get_dummies(df_model, drop_first=True)

X = df_model.drop(target, axis=1)
y = df_model[target]

# For classification, stratify on the target so the train and test splits
# keep the same class proportions; regression targets are split at random.
stratify_on = y if model_type == "classification" else None
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=stratify_on
)

# Fit the scaler on the training split only and reuse its statistics for the
# test split, so no test-set information leaks into training.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# ------------------- Model Training -------------------
# One seed for every stochastic estimator so the metrics (and therefore the
# model picked as "best") are the same on every Streamlit rerun.
SEED = 42

if model_type == "classification":
    models = {
        "Logistic Regression": LogisticRegression(max_iter=1000),
        "Random Forest": RandomForestClassifier(n_estimators=100, random_state=SEED),
        # probability=True enables predict_proba, which the AUC/ROC code needs.
        "SVM": SVC(kernel='rbf', probability=True, random_state=SEED),
        "Decision Tree": DecisionTreeClassifier(random_state=SEED)
    }
else:
    from sklearn.linear_model import LinearRegression
    from sklearn.ensemble import RandomForestRegressor
    from sklearn.tree import DecisionTreeRegressor
    from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
    models = {
        "Linear Regression": LinearRegression(),
        "Random Forest Regressor": RandomForestRegressor(n_estimators=100, random_state=SEED),
        "Decision Tree Regressor": DecisionTreeRegressor(random_state=SEED)
    }

# Collected per model:
#   metrics       - one row per model: [name, <scores...>]
#   roc_data      - name -> (fpr, tpr), binary classification only
#   conf_matrices - name -> confusion matrix, classification only
metrics = []
roc_data = {}
conf_matrices = {}

# Number of distinct target values; only meaningful for classification.
n_classes = len(np.unique(y)) if model_type == "classification" else 0

for name, model in models.items():
    model.fit(X_train_scaled, y_train)
    y_pred = model.predict(X_test_scaled)

    if model_type == "classification":
        if hasattr(model, "predict_proba"):
            proba = model.predict_proba(X_test_scaled)
            # Binary AUC/ROC use the positive-class column; the multiclass
            # one-vs-rest AUC needs the full probability matrix.
            y_prob = proba if n_classes > 2 else proba[:, 1]
        else:
            # No probabilities available: fall back to the hard predictions.
            y_prob = y_pred

        acc = accuracy_score(y_test, y_pred)
        # zero_division=0 keeps the default value (0) for a class that is never
        # predicted, without emitting a warning on every rerun.
        prec = precision_score(y_test, y_pred, average='macro', zero_division=0)
        rec = recall_score(y_test, y_pred, average='macro', zero_division=0)
        f1 = f1_score(y_test, y_pred, average='macro', zero_division=0)
        if n_classes > 2:
            auc = roc_auc_score(y_test, y_prob, multi_class='ovr')
        else:
            auc = roc_auc_score(y_test, y_prob)
        metrics.append([name, acc, prec, rec, f1, auc])

        if n_classes == 2:
            fpr, tpr, _ = roc_curve(y_test, y_prob)
            roc_data[name] = (fpr, tpr)
        conf_matrices[name] = confusion_matrix(y_test, y_pred)
    else:
        mse = mean_squared_error(y_test, y_pred)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)
        metrics.append([name, mse, mae, r2])

# ------------------- Evaluation -------------------
st.subheader("Model Evaluation Metrics")
# Column names follow the layout of the rows appended to `metrics`.
if model_type == "classification":
    metrics_df = pd.DataFrame(metrics, columns=["Model", "Accuracy", "Precision", "Recall", "F1-score", "AUC-ROC"])
else:
    metrics_df = pd.DataFrame(metrics, columns=["Model", "MSE", "MAE", "R2"])
metrics_table = metrics_df.set_index("Model")
st.dataframe(metrics_table)

if model_type == "classification":
    st.subheader("Confusion Matrices")
    for model_name, cm in conf_matrices.items():
        st.write(f"**{model_name}**")
        fig, ax = plt.subplots()
        sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", ax=ax)
        ax.set_xlabel("Predicted")
        ax.set_ylabel("Actual")
        st.pyplot(fig)
        # Release the figure; the script is re-executed on every interaction.
        plt.close(fig)

    if len(np.unique(y)) == 2:
        st.subheader("ROC Curves")
        fig, ax = plt.subplots()
        for model_name, (fpr, tpr) in roc_data.items():
            # Reuse the AUC already stored in the metrics table instead of
            # calling predict_proba and roc_auc_score a second time.
            auc_value = metrics_table.loc[model_name, "AUC-ROC"]
            ax.plot(fpr, tpr, label=f"{model_name} (AUC = {auc_value:.2f})")
        # Diagonal reference line: a classifier that guesses at random.
        ax.plot([0, 1], [0, 1], linestyle='--')
        ax.set_xlabel("False Positive Rate")
        ax.set_ylabel("True Positive Rate")
        ax.legend()
        st.pyplot(fig)
        plt.close(fig)

# ------------------- Feature Importance -------------------
if "Random Forest" in models or "Random Forest Regressor" in models:
    st.subheader("Feature Importance (Random Forest)")
    rf_key = "Random Forest" if "Random Forest" in models else "Random Forest Regressor"
    rf_model = models[rf_key]
    if hasattr(rf_model, "feature_importances_"):
        importances = rf_model.feature_importances_
        # Indices of the (up to) 15 largest importances, highest first.
        top_idx = np.argsort(importances)[::-1][:15]
        top_features = X.columns[top_idx]
        fig, ax = plt.subplots(figsize=(8, 6))
        sns.barplot(x=importances[top_idx], y=top_features, ax=ax)
        ax.set_title("Top 15 Important Features")
        ax.set_xlabel("Importance")
        ax.set_ylabel("Feature")
        st.pyplot(fig)
        # Release the figure; the script is re-executed on every interaction.
        plt.close(fig)

# ------------------- Hyperparameter Tuning -------------------
st.subheader("Hyperparameter Tuning: Random Forest")
param_grid = {
    'n_estimators': [50, 100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5]
}


@st.cache_data
def tune_random_forest(kind, X_fit, y_fit, grid_params):
    """Grid-search a random forest and return its best parameters and score.

    Cached with ``st.cache_data`` so the search (every parameter combination
    times 3 folds) is not repeated on each Streamlit rerun while the inputs
    are unchanged.

    Args:
        kind: "classification" tunes a RandomForestClassifier scored with
            'f1_macro'; any other value tunes a RandomForestRegressor scored
            with 'r2'.
        X_fit: training feature matrix.
        y_fit: training target values.
        grid_params: parameter grid passed to GridSearchCV.

    Returns:
        tuple: (best_params_, best_score_) of the fitted GridSearchCV.
    """
    if kind == "classification":
        estimator = RandomForestClassifier(random_state=42)
        scoring = 'f1_macro'
    else:
        # Imported here so this function does not rely on an import made
        # elsewhere in the script.
        from sklearn.ensemble import RandomForestRegressor
        estimator = RandomForestRegressor(random_state=42)
        scoring = 'r2'
    search = GridSearchCV(estimator, grid_params, cv=3, scoring=scoring)
    search.fit(X_fit, y_fit)
    return search.best_params_, search.best_score_


best_params, best_score = tune_random_forest(model_type, X_train_scaled, y_train, param_grid)
st.write("Best Parameters:", best_params)
st.write("Best Score:", best_score)

# ------------------- Save Best Model -------------------
# Rank models by the task's headline metric. Classification rows are
# [name, accuracy, precision, recall, f1, auc], so index 4 is the macro F1
# (the same criterion the grid search optimises); regression rows end with R2.
score_col = 4 if model_type == "classification" else -1
best_model = max(metrics, key=lambda row: row[score_col])
# NOTE(review): only the estimator is written. It was fitted on scaled,
# one-hot-encoded features, so `scaler` and `X.columns` are also needed to
# use the saved file on new data - consider persisting them as well.
joblib.dump(models[best_model[0]], f"best_model_{task.replace(' ', '_')}_{best_model[0].replace(' ', '_')}.pkl")


Writing app.py


In [None]:
!streamlit run app.py &>/content/logs.txt &
!npx localtunnel --port 8501

[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K[1G[0JNeed to install the following packages:
localtunnel@2.0.2
Ok to proceed? (y) [20Gy

[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K⠋[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0Kyour url is: https://social-toys-shop.loca.lt


 1. EDA (Exploratory Data Analysis)
Histograms for numerical features

Countplots for categorical features

Correlation heatmap

 2. Prediction Tasks
Attrition Prediction (classification)

Logistic Regression, SVM, Decision Tree, Random Forest

Promotion Likelihood Prediction (regression)

Linear Regression, Decision Tree Regressor, Random Forest Regressor

3. Model Evaluation
Classification: Accuracy, Precision, Recall, F1-score, AUC-ROC, Confusion Matrix, ROC Curve

Regression: MSE, MAE, R² Score

4. Feature Importance
Top 15 features by importance (Random Forest)

5. Hyperparameter Tuning
GridSearchCV on Random Forest

Displays best parameters and cross-validation score

 6. Model Saving
Automatically saves the best-performing model, selected by F1-score for classification or R² for regression.