<a href="https://colab.research.google.com/github/Chillboy1130/Dropout-Prediction/blob/main/Dropout_prediction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

from xgboost import XGBClassifier
from imblearn.over_sampling import SMOTE

import joblib


In [3]:
# Load the raw dataset into a DataFrame.
# NOTE(review): the absolute /content path presumably targets a Colab upload
# directory - adjust it when running the notebook elsewhere.
df = pd.read_csv("/content/dataset.csv")

In [4]:
# Sanity check of the table dimensions as (rows, columns).
df.shape

(4424, 35)

In [5]:
# Print column names, non-null counts and dtypes to spot missing values and
# non-numeric columns before modelling.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4424 entries, 0 to 4423
Data columns (total 35 columns):
 #   Column                                          Non-Null Count  Dtype  
---  ------                                          --------------  -----  
 0   Marital status                                  4424 non-null   int64  
 1   Application mode                                4424 non-null   int64  
 2   Application order                               4424 non-null   int64  
 3   Course                                          4424 non-null   int64  
 4   Daytime/evening attendance                      4424 non-null   int64  
 5   Previous qualification                          4424 non-null   int64  
 6   Nacionality                                     4424 non-null   int64  
 7   Mother's qualification                          4424 non-null   int64  
 8   Father's qualification                          4424 non-null   int64  
 9   Mother's occupation                      

In [6]:
# Strip leading/trailing whitespace from the column headers so that later
# name-based lookups (e.g. "Target") match exactly.
df.columns = df.columns.str.strip()

In [7]:
# Encode the class names stored in "Target" as integer codes.
# LabelEncoder numbers the classes in sorted order of their names, so the codes
# follow alphabetical order; inspect `label.classes_` for the exact
# code -> class-name mapping instead of assuming it.
#
# The guard keeps this cell idempotent: once "Target" is numeric and a fitted
# encoder already exists, re-running the cell must not refit the encoder on the
# integer codes, otherwise the encoder saved later could no longer translate
# predictions back to the original class names.
target_already_encoded = pd.api.types.is_numeric_dtype(df["Target"])
if not (target_already_encoded and "label" in globals()):
    label = LabelEncoder()
    df["Target"] = label.fit_transform(df["Target"])

In [8]:
# Separate the feature matrix from the (already encoded) target vector.
X = df.drop(columns=["Target"])
y = df["Target"]

In [9]:
# Partition the feature columns by dtype: object columns are treated as
# categorical, every other column as numeric.
categorical_cols, numeric_cols = [], []
for column_name in X.columns:
    if X[column_name].dtype == "object":
        categorical_cols.append(column_name)
    else:
        numeric_cols.append(column_name)

In [10]:
# Column-wise preprocessing: standardise the numeric columns and one-hot encode
# the categorical ones (categories unseen during fitting are ignored at
# prediction time instead of raising).
numeric_step = ("num", StandardScaler(), numeric_cols)
categorical_step = ("cat", OneHotEncoder(handle_unknown="ignore"), categorical_cols)

preprocess = ColumnTransformer(transformers=[numeric_step, categorical_step])

In [11]:
# Hold out 20% of the rows for evaluation; stratifying on y keeps the class
# proportions the same in both splits, and the fixed seed makes the split
# reproducible.
split_options = {"test_size": 0.20, "random_state": 42, "stratify": y}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [12]:
# Oversample the training split only, so the test split keeps its original
# class distribution.
# NOTE(review): resampling happens on the raw features, before the scaling that
# the pipeline applies later. SMOTE builds synthetic rows from nearest
# neighbours, so large-valued columns presumably dominate the distances and
# integer-coded columns can receive in-between values - consider resampling
# after preprocessing (e.g. inside an imblearn pipeline).
sm = SMOTE(random_state=42)
X_train_sm, y_train_sm = sm.fit_resample(X_train, y_train)

In [13]:
# Candidate classifiers that are compared on the same train/test split.

xgb_settings = dict(
    n_estimators=400,
    learning_rate=0.05,
    max_depth=6,
    subsample=0.9,
    colsample_bytree=0.9,
    eval_metric="logloss",
    random_state=42,
)

models = {
    "Logistic Regression": LogisticRegression(max_iter=500),
    "Random Forest": RandomForestClassifier(n_estimators=300, random_state=42),
    "XGBoost": XGBClassifier(**xgb_settings),
}

In [15]:
# Fit every candidate inside the same preprocessing pipeline, score it on the
# held-out split and keep the most accurate pipeline as `best_model`.
# NOTE(review): the winner is chosen on the same test split whose accuracy is
# reported, so the printed figure is an optimistic estimate - consider
# cross-validation on the training data for model selection.

results = {}
fitted_pipelines = {}
for name, model in models.items():
    pipeline = Pipeline(steps=[
        ("preprocess", preprocess),
        ("model", model)
    ])
    pipeline.fit(X_train_sm, y_train_sm)
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    results[name] = accuracy
    fitted_pipelines[name] = pipeline
    print(f"{name}: {accuracy:.4f}")

# On a tie, max() keeps the first name in insertion order.
best_model_name = max(results, key=results.get)
print("\n\nüî• BEST MODEL:", best_model_name, "‚Üí Accuracy:", results[best_model_name])

# Reuse the pipeline that was already fitted in the loop instead of training the
# winning model a second time. Every pipeline shares the same `preprocess`
# object, which was fitted on identical data in each iteration, so the reused
# pipeline is equivalent to a fresh refit.
best_model = fitted_pipelines[best_model_name]

# Display the fitted pipeline as the cell output.
best_model

Logistic Regression: 0.7345
Random Forest: 0.7616
XGBoost: 0.7616


üî• BEST MODEL: Random Forest ‚Üí Accuracy: 0.7615819209039548


In [16]:
# Persist the fitted artifacts (relative to the current working directory) so
# they can be loaded for deployment.
# - dropout_model.pkl : the full pipeline (preprocessing + classifier)
# - preprocess.pkl    : the ColumnTransformer on its own; the Streamlit app
#                       written below does not load this file
# - label_encoder.pkl : maps predicted integer codes back to class names

joblib.dump(best_model, "dropout_model.pkl")
joblib.dump(preprocess, "preprocess.pkl")
joblib.dump(label, "label_encoder.pkl")


['label_encoder.pkl']

After this, the deployment is done through Streamlit in VS Code.

In [17]:
!pip install streamlit plotly
!pip install streamlit-option-menu

Collecting streamlit
  Downloading streamlit-1.51.0-py3-none-any.whl.metadata (9.5 kB)
Collecting pydeck<1,>=0.8.0b4 (from streamlit)
  Downloading pydeck-0.9.1-py2.py3-none-any.whl.metadata (4.1 kB)
Downloading streamlit-1.51.0-py3-none-any.whl (10.2 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m10.2/10.2 MB[0m [31m62.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pydeck-0.9.1-py2.py3-none-any.whl (6.9 MB)
[2K   [90m‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ‚îÅ[0m [32m6.9/6.9 MB[0m [31m100.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pydeck, streamlit
Successfully installed pydeck-0.9.1 streamlit-1.51.0
Collecting streamlit-option-menu
  Downloading streamlit_option_menu-0.4.0-py3-none-any.whl.metadata (2.5 kB)
Downloading streamlit_option_menu-0.4.0-py3-none-any.whl (829 kB)
[

In [18]:
%%writefile app.py
import streamlit as st
import pandas as pd
import numpy as np
import joblib
import plotly.graph_objects as go

# ==========================================
# 1. CONFIGURATION
# ==========================================
# Page-level settings: browser tab title/icon, wide layout and a collapsed sidebar.
page_settings = {
    "page_title": "EduVision | Success Predictor",
    "page_icon": "üéì",
    "layout": "wide",
    "initial_sidebar_state": "collapsed",
}
st.set_page_config(**page_settings)

# ==========================================
# 2. PROFESSIONAL STYLING
# ==========================================
# Inject the app-wide stylesheet. It defines the page colours, hides Streamlit's
# default menu/footer/header, and provides the CSS classes used by the HTML
# snippets further down: .hero, .css-card, .success-box and .danger-box.
# unsafe_allow_html=True is needed so the <style> tag is emitted as HTML.
st.markdown("""
    <style>
    /* 1. Fix background & Text Color */
    .stApp {
        background-color: #f8f9fa;
        color: #333333;
    }

    /* 2. Hide default elements */
    #MainMenu {visibility: hidden;}
    footer {visibility: hidden;}
    header {visibility: hidden;}

    /* 3. Hero Section Styling */
    .hero {
        background: linear-gradient(135deg, #667eea 0%, #764ba2 100%);
        padding: 3rem;
        border-radius: 20px;
        margin-bottom: 2rem;
        text-align: center;
        box-shadow: 0 4px 15px rgba(0,0,0,0.1);
    }
    .hero h1 { color: white !important; font-size: 3.5rem; font-weight: 800; }
    .hero p { color: #f0f2f6 !important; font-size: 1.2rem; }

    /* 4. Card Styling */
    .css-card {
        background-color: white;
        padding: 2rem;
        border-radius: 15px;
        box-shadow: 0 4px 6px rgba(0,0,0,0.05);
        border: 1px solid #e6e6e6;
        transition: transform 0.3s ease;
    }
    .css-card:hover {
        transform: translateY(-5px);
        box-shadow: 0 10px 20px rgba(0,0,0,0.1);
    }

    /* 5. Custom Tab Styling (Navigation Bar) */
    .stTabs [data-baseweb="tab-list"] {
        gap: 10px;
        background-color: white;
        padding: 10px 20px;
        border-radius: 15px;
        box-shadow: 0 2px 5px rgba(0,0,0,0.05);
    }
    .stTabs [data-baseweb="tab"] {
        height: 50px;
        white-space: pre-wrap;
        border-radius: 10px;
        color: #555;
        font-weight: 600;
    }
    .stTabs [data-baseweb="tab"][aria-selected="true"] {
        background-color: #764ba2;
        color: white;
    }

    /* 6. Prediction Boxes */
    .success-box { background: #00c853; padding: 20px; border-radius: 15px; color: white; text-align: center; }
    .danger-box { background: #ff3d00; padding: 20px; border-radius: 15px; color: white; text-align: center; }
    </style>
    """, unsafe_allow_html=True)

# ==========================================
# 3. MODEL LOADING
# ==========================================
@st.cache_resource
def _read_artifacts(directory):
    """Unpickle the trained pipeline and the label encoder stored in `directory`.

    The result is cached by Streamlit per directory string. Loading errors
    propagate to the caller; because only returned values are cached, a failed
    attempt is retried on the next call instead of being remembered.

    Args:
        directory: Path (as a string) of the folder holding the .pkl files.

    Returns:
        Tuple ``(model, label_encoder)``.
    """
    from pathlib import Path

    # NOTE: joblib.load unpickles arbitrary Python objects - only point this at
    # artifact files produced by the training notebook.
    base = Path(directory)
    model = joblib.load(base / "dropout_model.pkl")
    label_encoder = joblib.load(base / "label_encoder.pkl")
    return model, label_encoder


def load_artifacts():
    """Locate and load the saved model pipeline and label encoder.

    Looks in ``/content`` first (the original location), then in the folder of
    this script and finally in the current working directory, so the app also
    works where the notebook saved the artifacts with relative paths.

    Failures are logged and never cached: once the artifacts have been saved,
    the next rerun of the app picks them up without restarting the server.

    Returns:
        Tuple ``(model, label_encoder, loaded)``; ``(None, None, False)`` when
        no directory contains a loadable pair of artifacts.
    """
    import logging
    from pathlib import Path

    candidates = [Path("/content")]
    script_path = globals().get("__file__")
    if script_path:
        candidates.append(Path(script_path).resolve().parent)
    candidates.append(Path.cwd())

    # dict.fromkeys drops duplicate directories while keeping the search order.
    for directory in dict.fromkeys(candidates):
        has_model = (directory / "dropout_model.pkl").is_file()
        has_encoder = (directory / "label_encoder.pkl").is_file()
        if not (has_model and has_encoder):
            continue
        try:
            model, label_encoder = _read_artifacts(str(directory))
        except Exception:
            # Best effort: record why this location failed and try the next one.
            logging.getLogger(__name__).exception(
                "Could not load model artifacts from %s", directory
            )
            continue
        return model, label_encoder, True

    return None, None, False

# `artifacts_loaded` is False when the model files could not be loaded; the
# prediction tab checks it before offering the input form.
model, label_encoder, artifacts_loaded = load_artifacts()

# ==========================================
# 4. MAIN NAVIGATION (Using Native Tabs)
# ==========================================
# Native Tabs are robust and work perfectly in Colab
# The three tab containers are filled by the `with` blocks below.
tab_home, tab_pred, tab_team = st.tabs(["üè† Home Dashboard", "üìä Prediction Tool", "üë®‚Äçüíª Team & About"])

# ==========================================
# TAB 1: HOME
# ==========================================
with tab_home:
    # Landing page: hero banner, three feature cards and a row of headline metrics.
    st.markdown("""
    <div class="hero">
        <h1>üéì EduVision Analytics</h1>
        <p>Using AI to transform Academic Success & Student Retention</p>
    </div>
    """, unsafe_allow_html=True)

    # Cards Layout
    col1, col2, col3 = st.columns(3)
    with col1:
        st.markdown("""
        <div class="css-card">
            <h3>üöÄ Early Warning System</h3>
            <p style="color:#666;">Detects at-risk students months before critical exams using predictive analytics.</p>
        </div>
        """, unsafe_allow_html=True)
    with col2:
        st.markdown("""
        <div class="css-card">
            <h3>üß† Random Forest Engine</h3>
            <p style="color:#666;">Built on a robust ML algorithm trained on 4,000+ student academic records.</p>
        </div>
        """, unsafe_allow_html=True)
    with col3:
        st.markdown("""
        <div class="css-card">
            <h3>üí° Actionable Insights</h3>
            <p style="color:#666;">Provides specific probability scores to help advisors intervene effectively.</p>
        </div>
        """, unsafe_allow_html=True)

    st.markdown("---")
    st.subheader("üìà Real-time Statistics")
    # NOTE: all four metrics are static text, not live measurements.
    m1, m2, m3, m4 = st.columns(4)
    # Hold-out accuracy of the saved model as measured by the training notebook
    # (0.7616). Update this value whenever the model is retrained.
    m1.metric("Model Accuracy", "76.2%", "Stable")
    m2.metric("Students Analyzed", "4,424", "+12 Today")
    m3.metric("Key Features", "34", "Optimized")
    m4.metric("Server Status", "Online", "üü¢")

# ==========================================
# TAB 2: PREDICTION
# ==========================================
with tab_pred:
    # Prediction page: collects a reduced student profile in a form, expands it
    # to the full feature row the pipeline expects and shows the predicted class
    # together with the class probabilities.
    st.header("ü§ñ AI Prediction Engine")
    st.info("üëâ Configure the student profile below to generate a dropout risk assessment.")

    if not artifacts_loaded:
        st.error("‚ö†Ô∏è Model artifacts not found. Please run the notebook cells to save the model first.")
    else:
        # Create two columns: Left for Inputs, Right for Visualization placeholder
        col_L, col_R = st.columns([2, 1])

        with col_L:
            with st.form("input_form"):
                with st.expander("üë§ Demographic Info", expanded=True):
                    c1, c2 = st.columns(2)
                    age = c1.number_input("Age", 17, 60, 20)
                    gender = c2.selectbox("Gender", options=[1, 0], format_func=lambda x: "Male" if x==1 else "Female")
                    marital = c1.selectbox("Marital Status", [1, 2, 3, 4, 5, 6], format_func=lambda x: f"Type {x}")
                    displaced = c2.selectbox("Displaced Student?", [1, 0], format_func=lambda x: "Yes" if x==1 else "No")

                with st.expander("üìö Academic Performance", expanded=True):
                    c1, c2 = st.columns(2)
                    course = c1.number_input("Course Code", 1, 20, 12)
                    grade = c2.slider("Last Semester Grade (0-20)", 0.0, 20.0, 12.0)
                    approved = c1.number_input("Units Approved", 0, 20, 5)
                    enrolled = c2.number_input("Units Enrolled", 0, 20, 6)

                with st.expander("üí∞ Financial Indicators", expanded=True):
                    c1, c2 = st.columns(2)
                    tuition = c1.selectbox("Tuition Up-to-date?", [1, 0], format_func=lambda x: "Yes" if x==1 else "No")
                    scholar = c2.selectbox("Scholarship Holder?", [1, 0], format_func=lambda x: "Yes" if x==1 else "No")
                    debtor = c1.selectbox("Has Debt?", [1, 0], format_func=lambda x: "Yes" if x==1 else "No")

                submitted = st.form_submit_button("üöÄ Analyze Student Risk", type="primary")

        with col_R:
            st.write("#### Prediction Result")
            # Container for results
            result_container = st.empty()

        if submitted:
            # Prepare Data
            # Features that the form does not ask for are filled with fixed
            # default values, and the single grade/approved/enrolled inputs are
            # used for both the 1st and the 2nd semester columns.
            input_data = pd.DataFrame({
                'Marital status': [marital], 'Application mode': [17], 'Application order': [1],
                'Course': [course], 'Daytime/evening attendance': [1], 'Previous qualification': [1],
                'Nacionality': [1], "Mother's qualification": [1], "Father's qualification": [1],
                "Mother's occupation": [1], "Father's occupation": [1], 'Displaced': [displaced],
                'Educational special needs': [0], 'Debtor': [debtor], 'Tuition fees up to date': [tuition],
                'Gender': [gender], 'Scholarship holder': [scholar], 'Age at enrollment': [age],
                'International': [0],
                'Curricular units 1st sem (credited)': [0],
                'Curricular units 1st sem (enrolled)': [enrolled],
                'Curricular units 1st sem (evaluations)': [enrolled],
                'Curricular units 1st sem (approved)': [approved],
                'Curricular units 1st sem (grade)': [grade],
                'Curricular units 1st sem (without evaluations)': [0],
                'Curricular units 2nd sem (credited)': [0],
                'Curricular units 2nd sem (enrolled)': [enrolled],
                'Curricular units 2nd sem (evaluations)': [enrolled],
                'Curricular units 2nd sem (approved)': [approved],
                'Curricular units 2nd sem (grade)': [grade],
                'Curricular units 2nd sem (without evaluations)': [0],
                'Unemployment rate': [10.5], 'Inflation rate': [1.4], 'GDP': [2.0]
            })

            # Predict
            try:
                pred_idx = model.predict(input_data)[0]
                proba = model.predict_proba(input_data)[0]
                # Translate the integer prediction back to its class name.
                label = label_encoder.inverse_transform([pred_idx])[0]
                # Highest class probability, shown as the confidence of the prediction.
                confidence = max(proba) * 100

                with result_container.container():
                    if label == "Dropout":
                        st.markdown(f"""
                        <div class="danger-box">
                            <h2>‚ö†Ô∏è High Risk</h2>
                            <h3>Dropout Predicted</h3>
                            <h1>{confidence:.1f}%</h1>
                        </div>
                        """, unsafe_allow_html=True)
                    else:
                        # Show the decoded class name instead of a hard-coded
                        # "Graduate": the chart palette below provides three
                        # colours, so the encoder presumably holds a third class
                        # that must not be reported as a graduation
                        # (verify against label_encoder.classes_).
                        st.markdown(f"""
                        <div class="success-box">
                            <h2>‚úÖ Safe</h2>
                            <h3>{label} Predicted</h3>
                            <h1>{confidence:.1f}%</h1>
                        </div>
                        """, unsafe_allow_html=True)

                    st.write("") # Spacer
                    # Plotly Chart
                    # One bar per class known to the encoder, labelled with its probability.
                    labels = label_encoder.classes_
                    fig = go.Figure(data=[go.Bar(
                        x=labels, y=proba,
                        marker_color=['#ff3d00', '#00c853', '#2962ff'],
                        text=[f"{p*100:.1f}%" for p in proba]
                    )])
                    fig.update_layout(margin=dict(t=0, b=0, l=0, r=0), height=200, yaxis_range=[0,1])
                    st.plotly_chart(fig, use_container_width=True)

            except Exception as e:
                # Surface any failure (e.g. a column mismatch with the trained
                # pipeline) in the UI instead of crashing the app.
                st.error(f"Prediction Error: {str(e)}")

# ==========================================
# TAB 3: TEAM
# ==========================================
with tab_team:
    # About page: team blurb next to a logo, followed by a row of link buttons.
    st.header("üë®‚Äçüíª Development Team")

    # Logo on the left, description on the right (1:3 width ratio).
    logo_col, about_col = st.columns([1, 3])
    logo_col.image("https://cdn-icons-png.flaticon.com/512/10605/10605943.png", width=150)

    about_col.subheader("Machine Learning Division")
    about_col.write("We are dedicated to improving educational outcomes through Data Science.")
    about_col.markdown("""
        *   **Frameworks:** Scikit-Learn, XGBoost, Streamlit
        *   **Data:** 4,000+ University records
        *   **Mission:** Reducing dropout rates by 15% through early intervention.
        """)

    st.markdown("---")
    # One link button per column, in left-to-right order.
    footer_links = (
        ("üêô GitHub Repository", "https://github.com"),
        ("üìß Contact Support", "mailto:admin@edu.com"),
        ("üìÑ Project Documentation", "https://google.com"),
    )
    for slot, (caption, target) in zip(st.columns(3), footer_links):
        slot.link_button(caption, target)

Writing app.py


In [None]:
# Run this ONLY if you are in Google Colab
# !npm install localtunnel

# Stop any Streamlit process left over from a previous run of this cell.
!pkill -f streamlit
# Start the app in the background (&) and expose port 8501, where the app is
# served, through a public localtunnel URL.
!streamlit run app.py & npx localtunnel --port 8501

[1G[0K‚†ô[1G[0K‚†π[1G[0K
Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.
[0m
your url is: https://large-cups-pay.loca.lt
[0m
[34m[1m  You can now view your Streamlit app in your browser.[0m
[0m
[34m  Local URL: [0m[1mhttp://localhost:8501[0m
[34m  Network URL: [0m[1mhttp://172.28.0.12:8501[0m
[34m  External URL: [0m[1mhttp://104.196.223.36:8501[0m
[0m
