# End-to-End MLOps Pipeline for Predicting Student Academic Risk

This project demonstrates a comprehensive Machine Learning Operations (MLOps) pipeline designed to predict the academic risk of students in higher education. The pipeline encompasses data preprocessing, model training, hyperparameter tuning, model evaluation, visualization, continuous integration, and deployment. By leveraging MLOps practices, the project ensures reproducibility, version control, and efficient deployment of machine learning models.

Step 1: Setup and Install Required Libraries
First, we need to install the necessary libraries and set up the environment.

In [None]:
# Install required libraries
!pip install scikit-learn pandas numpy matplotlib seaborn mlflow joblib fastapi uvicorn pydantic docker

Step 2: Import Libraries
Import the necessary libraries for data processing, modeling, and deployment.

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix
import joblib
import mlflow
from fastapi import FastAPI, HTTPException
from pydantic import BaseModel
import uvicorn
from typing import List
import logging

Step 3: Load and Explore the Dataset
Load the dataset and perform initial exploration.

In [None]:
# Load the dataset
# Replace 'your_dataset.csv' with the actual path to your dataset
df = pd.read_csv('your_dataset.csv')

# Display the first few rows of the dataset
print(df.head())

# Basic information about the dataset.
# DataFrame.info() writes its report to stdout and returns None, so it must
# not be wrapped in print() (that would append a stray "None" line).
df.info()

# Summary statistics of the dataset
print(df.describe())

Step 4: Data Preprocessing
Perform data preprocessing steps including handling missing values, feature selection, encoding, and scaling.

In [None]:
# Define the target variable
TARGET = 'target_column_name'

# Handle missing values (if any)
# df = df.dropna()  # Example: Drop rows with missing values

# Feature selection: separate the target, and drop the 'id' column when the
# dataset has one (errors='ignore' keeps this working for datasets without it).
X = df.drop(columns=[TARGET]).drop(columns=['id'], errors='ignore')
y = df[TARGET]

# Feature encoding
categorical_features = ['Course']  # Example categorical feature

# Every numeric column is treated as numerical EXCEPT those explicitly listed
# as categorical. Without the exclusion, a numerically coded categorical column
# (e.g. an integer course code) would be both scaled and one-hot encoded.
# The result stays a pandas Index so later cells can call .tolist() on it.
numerical_features = (
    X.select_dtypes(include='number')
    .columns
    .drop(categorical_features, errors='ignore')
)

# One-Hot Encoding for categorical features; categories unseen during fit are
# encoded as all zeros instead of raising.
one_hot_encoder = OneHotEncoder(handle_unknown='ignore')

# Label Encoding for the target variable
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Feature scaling
scaler = StandardScaler()

# Preprocessing pipeline.
# NOTE: ColumnTransformer fits *clones* of the transformers passed in, so the
# `scaler` and `one_hot_encoder` objects above remain unfitted; query the
# fitted `preprocessor` for anything learned from the data.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', scaler, numerical_features),
        ('cat', one_hot_encoder, categorical_features)
    ],
    remainder='passthrough'
)

# Apply preprocessing to the data
X_preprocessed = preprocessor.fit_transform(X)

Step 5: Model Selection and Training
Train multiple models and evaluate their performance.

In [None]:
# Split the RAW features first so that the preprocessor never sees test rows.
# Fitting the scaler/encoder on the full dataset before splitting leaks test
# statistics into training and makes the reported metrics optimistic.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42
)

# Fit the preprocessing on the training rows only, then apply it to both sets.
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)

# Define the models to evaluate; fixed seeds keep the results reproducible.
models = {
    'RandomForest': RandomForestClassifier(random_state=42),
    'GradientBoosting': GradientBoostingClassifier(random_state=42)
}

# Train and evaluate each model
for model_name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Weighted averaging accounts for class imbalance in multi-class targets.
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')

    print(f'{model_name} - Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}')

Step 6: Hyperparameter Tuning
Perform hyperparameter tuning using RandomizedSearchCV.

In [None]:
# Define hyperparameters for Random Forest.
# 'auto' is intentionally absent from max_features: it was deprecated in
# scikit-learn 1.1 and removed in 1.3 (for classifiers it was an alias of
# 'sqrt'), so candidates using it fail to fit. None means "use all features".
param_dist_rf = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

# Define hyperparameters for Gradient Boosting (same note on max_features).
param_dist_gb = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.1, 0.2],
    'max_depth': [3, 5, 7],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt', 'log2', None]
}

# Perform RandomizedSearchCV for Random Forest.
# The estimator is seeded as well as the search, so reruns give the same winner.
random_search_rf = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_distributions=param_dist_rf,
    n_iter=10,
    cv=3,
    scoring='f1_weighted',
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
random_search_rf.fit(X_train, y_train)

# Perform RandomizedSearchCV for Gradient Boosting
random_search_gb = RandomizedSearchCV(
    GradientBoostingClassifier(random_state=42),
    param_distributions=param_dist_gb,
    n_iter=10,
    cv=3,
    scoring='f1_weighted',
    n_jobs=-1,
    verbose=2,
    random_state=42,
)
random_search_gb.fit(X_train, y_train)

# Get the best models (refit on the whole training set by RandomizedSearchCV)
best_rf = random_search_rf.best_estimator_
best_gb = random_search_gb.best_estimator_

# Evaluate the best models on the held-out test set
for model_name, model in [('RandomForest', best_rf), ('GradientBoosting', best_gb)]:
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='weighted')
    precision = precision_score(y_test, y_pred, average='weighted')
    recall = recall_score(y_test, y_pred, average='weighted')

    print(f'{model_name} - Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}')

Step 7: Model Evaluation
Evaluate the best model and generate predictions for the test set.

In [None]:
# Pick the model to carry forward (example: the tuned Random Forest)
best_model = best_rf

# Score it on the held-out test set. The resulting `y_pred`, `accuracy`, `f1`,
# `precision` and `recall` stay at notebook scope for the cells that follow.
y_pred = best_model.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
f1, precision, recall = (
    scorer(y_test, y_pred, average='weighted')
    for scorer in (f1_score, precision_score, recall_score)
)

summary = f'Accuracy: {accuracy:.4f}, F1 Score: {f1:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}'
print(f'Best Model - {summary}')

Step 8: Visualization and Results Analysis
Visualize the confusion matrix and feature importance.

In [None]:
# Confusion Matrix.
# Passing every encoded class id as `labels` fixes the matrix size even if a
# class is absent from the test split, so it lines up with the original class
# names used for the axis ticks.
class_names = label_encoder.classes_
cm = confusion_matrix(y_test, y_pred, labels=np.arange(len(class_names)))
plt.figure(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title('Confusion Matrix')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.show()

# Feature Importance (for tree ensembles such as Random Forest)
if hasattr(best_model, 'feature_importances_'):
    importances = best_model.feature_importances_
    # Ask the *fitted* ColumnTransformer for the output column names.
    # The standalone `one_hot_encoder` object is never fitted (ColumnTransformer
    # fits clones), and a hand-built list would also miss passthrough columns.
    feature_names = preprocessor.get_feature_names_out()
    feature_importance = pd.DataFrame({'feature': feature_names, 'importance': importances})
    feature_importance = feature_importance.sort_values('importance', ascending=False)

    plt.figure(figsize=(12, 10))
    sns.barplot(x='importance', y='feature', data=feature_importance.head(20))
    plt.title('Top 20 Feature Importance')
    plt.tight_layout()
    plt.show()

Step 9: Experiment Tracking with MLflow
Set up MLflow for experiment tracking and log the best model's parameters, metrics, and the model artifact.

In [None]:
# Point MLflow at the tracking server and select the experiment
mlflow.set_tracking_uri("http://localhost:5000")
mlflow.set_experiment("Student Academic Risk Prediction")

# Test-set metrics computed in the evaluation cell, keyed by their MLflow names
run_metrics = {
    "accuracy": accuracy,
    "f1_score": f1,
    "precision": precision,
    "recall": recall,
}

# Record hyperparameters, metrics and the fitted estimator in a single run
with mlflow.start_run():
    mlflow.log_params(best_model.get_params())
    for metric_name, metric_value in run_metrics.items():
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(best_model, "best_model")

Step 10: Model Deployment with FastAPI
Deploy the model using FastAPI.

In [None]:
# Define the FastAPI app
app = FastAPI()


# Define the input data model
class PredictionInput(BaseModel):
    """Request body: one raw (un-preprocessed) feature row.

    Values must be given in the same order as the columns of the training
    frame `X`. Because the payload is a list of floats, categorical features
    can only be sent when they are numerically coded.
    """

    features: List[float]


# Define the prediction endpoint
@app.post('/predict')
def predict(input_data: PredictionInput):
    """Predict the academic-risk class for a single student.

    Returns a dict with the encoded class id under "prediction" and the
    original class label (as a string) under "label".

    Raises:
        HTTPException 422: the number of features differs from the training data.
        HTTPException 500: preprocessing or prediction failed.
    """
    expected_columns = X.columns
    if len(input_data.features) != len(expected_columns):
        # Validate outside the try block so the client gets a precise error
        # rather than a generic 500.
        raise HTTPException(
            status_code=422,
            detail=f"Expected {len(expected_columns)} features, got {len(input_data.features)}",
        )

    try:
        # The preprocessor was fitted on a DataFrame and selects columns by
        # name, so it must be given a DataFrame with the same column names;
        # a bare NumPy array makes transform() raise. Casting to the training
        # dtypes lets integer-coded categories match the fitted encoder.
        frame = pd.DataFrame([input_data.features], columns=expected_columns)
        frame = frame.astype(X.dtypes.to_dict())
        transformed = preprocessor.transform(frame)
        prediction = best_model.predict(transformed)
        label = label_encoder.inverse_transform(prediction)[0]
    except Exception as e:
        # Boundary handler: log the full traceback, hide internals from clients.
        logging.exception("Prediction error: %s", e)
        raise HTTPException(status_code=500, detail="Internal Server Error") from e

    return {"prediction": int(prediction[0]), "label": str(label)}


# Run the FastAPI app
# NOTE(review): inside a Jupyter kernel __name__ is "__main__" and an event loop
# is already running, so this call is expected to work only when the code is
# run as a standalone script (e.g. app.py) - confirm before running it here.
if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)

Step 11: Dockerization
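Create a Dockerfile to containerize the FastAPI application. The Dockerfile below expects app.py (the module that defines the FastAPI `app` object), requirements.txt, and the models/ and static/ directories to be present in the build context.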

In [None]:
# Dockerfile
FROM python:3.9-slim

WORKDIR /app

# Install dependencies before copying the application code and models, so the
# (slow) pip layer is reused from cache whenever only code or models change.
COPY requirements.txt /app/
RUN pip install --no-cache-dir -r requirements.txt

# Application code and runtime assets
COPY app.py /app/
COPY models /app/models
COPY static /app/static

# Port the uvicorn server listens on
EXPOSE 8000

# Serve the `app` object from app.py on all interfaces
CMD ["uvicorn", "app:app", "--host", "0.0.0.0", "--port", "8000"]

Step 12: Build and Run Docker Container
Build and run the Docker container.

In [None]:
# Build the image from the Dockerfile in the current directory and tag it
# as "academic-risk-predictor".
!docker build -t academic-risk-predictor .
# Start a container from that image, publishing container port 8000 on host
# port 8000. The command runs in the foreground.
!docker run -p 8000:8000 academic-risk-predictor