In [37]:
!pip install transformers pandas scikit-learn mlflow torch datasets




In [38]:
import os
import pandas as pd
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.preprocessing import LabelEncoder
import mlflow
import mlflow.pytorch
import torch
from torch.utils.data import Dataset

# Disable W&B tracking to avoid unnecessary warnings
os.environ["WANDB_DISABLED"] = "true"

# Set up MLflow for local tracking
mlflow.set_tracking_uri("file:///content/mlruns")  # Local MLflow tracking
mlflow.set_experiment("Text_Classification_Retraining")  # Experiment name


<Experiment: artifact_location='file:///content/mlruns/544208329798821324', creation_time=1730313347843, experiment_id='544208329798821324', last_update_time=1730313347843, lifecycle_stage='active', name='Text_Classification_Retraining', tags={}>

In [39]:
class TextClassificationDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)


In [40]:
from google.colab import files

# Upload the dataset
uploaded = files.upload()

# Load dataset
for file_name in uploaded.keys():
    data = pd.read_csv(file_name)
    print(f"{file_name} uploaded successfully!")
    print(data.head())  # Display a preview of the dataset

# Define text and label columns based on your dataset structure
X = data['Text']  # Replace with your actual text column name if different
y = data['Sentiment']  # Replace with your actual label column name if different


Saving sentimentdataset.csv to sentimentdataset.csv
sentimentdataset.csv uploaded successfully!
   Unnamed: 0.1  Unnamed: 0  \
0             0           0   
1             1           1   
2             2           2   
3             3           3   
4             4           4   

                                                Text    Sentiment  \
0   Enjoying a beautiful day at the park!        ...   Positive     
1   Traffic was terrible this morning.           ...   Negative     
2   Just finished an amazing workout! 💪          ...   Positive     
3   Excited about the upcoming weekend getaway!  ...   Positive     
4   Trying out a new recipe for dinner tonight.  ...   Neutral      

             Timestamp            User     Platform  \
0  2023-01-15 12:30:00   User123          Twitter     
1  2023-01-15 08:45:00   CommuterX        Twitter     
2  2023-01-15 15:45:00   FitnessFan      Instagram    
3  2023-01-15 18:20:00   AdventureX       Facebook    
4  2023-01-15 19:55:00   Ch

In [41]:
# Convert labels to numeric format if they are not already
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)

# Tokenization function
def preprocess_data(texts, labels, tokenizer_name='distilbert-base-uncased', max_length=128):
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    encodings = tokenizer(texts.tolist(), truncation=True, padding=True, max_length=max_length)
    return TextClassificationDataset(encodings, labels)  # Use custom dataset class

# Preprocess train and test data
train_dataset = preprocess_data(X_train, y_train)
test_dataset = preprocess_data(X_test, y_test)




In [47]:
def train_model_with_mlflow(train_dataset, test_dataset, model_name='distilbert-base-uncased', epochs=3):
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(label_encoder.classes_))

    # Log parameters with MLflow
    with mlflow.start_run():
        mlflow.log_param("model_name", model_name)
        mlflow.log_param("epochs", epochs)

        # Training arguments
        training_args = TrainingArguments(
            output_dir='./results',
            evaluation_strategy="epoch",  # Use "epoch" to evaluate at each epoch
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=epochs,
            weight_decay=0.01,
            logging_dir='./logs',
            report_to="none"  # Disable W&B logging
        )

        # Trainer setup with custom dataset
        trainer = Trainer(
            model=model,
            args=training_args,
            train_dataset=train_dataset,
            eval_dataset=test_dataset
        )

        # Train and evaluate
        trainer.train()
        eval_results = trainer.evaluate()
        accuracy = eval_results.get('eval_accuracy', 0.0)  # Default to 0.0 if accuracy is missing

        # Log metrics and model artifacts to MLflow
        mlflow.log_metric("accuracy", accuracy)
        mlflow.pytorch.log_model(model, "text_classification_model")

    return model, accuracy


In [43]:
def evaluate_model(model, X_test, y_test):
    # Set up the classifier pipeline with `top_k=None` instead of `return_all_scores=True`
    classifier = pipeline("text-classification", model=model, tokenizer="distilbert-base-uncased", top_k=None)
    predictions = classifier(X_test.tolist(), batch_size=16)

    # Extract numeric label ID from 'LABEL_x' format and convert to int
    pred_labels = [int(pred[0]['label'].split('_')[-1]) for pred in predictions]
    accuracy = accuracy_score(y_test, pred_labels)

    print("Classification Report:\n", classification_report(y_test, pred_labels))
    return accuracy


In [48]:
def monitor_and_retrain_with_mlflow(train_dataset, y_train, model, current_accuracy, threshold=0.80):
    try:
        last_run = mlflow.search_runs(order_by=["start_time desc"], max_results=1)
        last_accuracy = last_run['metrics.accuracy'].values[0] if 'metrics.accuracy' in last_run.columns else 1.0

        # Ensure current_accuracy and last_accuracy are valid numbers
        current_accuracy = current_accuracy if current_accuracy is not None else 0.0
        last_accuracy = last_accuracy if last_accuracy is not None else 1.0

        if current_accuracy < threshold * last_accuracy:
            print("Retraining triggered due to accuracy drop.")
            model, new_accuracy = train_model_with_mlflow(train_dataset, test_dataset)
            return model, new_accuracy
    except KeyError:
        print("Accuracy metric not found in MLflow run logs. Skipping retraining check.")

    return model, current_accuracy


In [45]:
def live_inference(model, tokenizer, text):
    classifier = pipeline("text-classification", model=model, tokenizer=tokenizer)
    result = classifier(text)
    predicted_label = result[0]['label']
    label_id = int(predicted_label.split("_")[-1])  # Get label ID from prediction format
    actual_label = label_encoder.inverse_transform([label_id])[0]  # Map ID to original label
    print(f"Predicted label: {actual_label}")


In [50]:
# Initial training
model, initial_accuracy = train_model_with_mlflow(train_dataset, test_dataset)

# Evaluate and monitor with error handling
model, final_accuracy = monitor_and_retrain_with_mlflow(train_dataset, y_train, model, initial_accuracy)
print("Final Model Accuracy:", final_accuracy)



Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch,Training Loss,Validation Loss
1,No log,5.421296
2,No log,5.332127
3,No log,5.289758




Final Model Accuracy: 0.0
Enter text for live inference: i went to hospital for treatment


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Predicted label:  Positive  


In [51]:
# Live inference usage with input()
test_text = input("Enter text for live inference: ")  # Prompt for user input
live_inference(model, AutoTokenizer.from_pretrained('distilbert-base-uncased'), test_text)

Enter text for live inference: i went to hospital for tretment i'm so sick


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.


Predicted label:  Positive  


In [None]:
import os
import pandas as pd
import time
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, f1_score
from sklearn.preprocessing import LabelEncoder
import mlflow
import mlflow.pytorch
import torch
from torch.utils.data import Dataset
import joblib
import smtplib
from flask import Flask, request, jsonify

# Configurations
os.environ["WANDB_DISABLED"] = "true"  # Disable W&B tracking
DATA_PATH = '/content/sentimentdataset.csv'
MLFLOW_URI = "file:///content/mlruns"  # MLflow local tracking URI
MLFLOW_EXPERIMENT = "Text_Classification_Retraining"
MODEL_NAME = "bert-base-uncased"
BATCH_SIZE = 8
EPOCHS = 3
LOGGING_STEPS = 10
OUTPUT_DIR = './results'
LABEL_ENCODER_PATH = 'label_encoder.pkl'
STALENESS_THRESHOLD = 0.75  # Accuracy threshold for staleness
EMAIL_ALERT = "admin@example.com"  # Email for alerts
MONITORING_INTERVAL = 3600  # Check every hour (3600 seconds)

# MLflow Setup
mlflow.set_tracking_uri(MLFLOW_URI)
mlflow.set_experiment(MLFLOW_EXPERIMENT)

app = Flask(__name__)  # Flask app for live inference

# Email Alert Setup
def send_alert_email(metric, current_value):
    with smtplib.SMTP("smtp.gmail.com", 587) as smtp:
        smtp.starttls()
        smtp.login("your_email@gmail.com", "your_password")
        subject = "Model Retraining Alert"
        body = f"The model's {metric} has fallen below the threshold: {current_value}. Retraining is triggered."
        msg = f'Subject: {subject}\n\n{body}'
        smtp.sendmail("your_email@gmail.com", EMAIL_ALERT, msg)

# Dataset Class
class TextClassificationDataset(Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx])
        return item

    def __len__(self):
        return len(self.labels)

def compute_metrics(pred):
    labels = pred.label_ids
    preds = pred.predictions.argmax(-1)
    f1 = f1_score(labels, preds, average='weighted')
    accuracy = accuracy_score(labels, preds)
    return {'accuracy': accuracy, 'f1': f1}

# Training and Retraining Function
def train_and_log_model(train_texts, val_texts, train_labels, val_labels):
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    model = AutoModelForSequenceClassification.from_pretrained(MODEL_NAME, num_labels=len(label_encoder.classes_))
    
    train_encodings = tokenizer(list(train_texts), truncation=True, padding=True)
    val_encodings = tokenizer(list(val_texts), truncation=True, padding=True)
    
    train_dataset = TextClassificationDataset(train_encodings, train_labels.tolist())
    val_dataset = TextClassificationDataset(val_encodings, val_labels.tolist())

    training_args = TrainingArguments(
        output_dir=OUTPUT_DIR,
        num_train_epochs=EPOCHS,
        per_device_train_batch_size=BATCH_SIZE,
        per_device_eval_batch_size=BATCH_SIZE,
        logging_dir='./logs',
        logging_steps=LOGGING_STEPS,
    )

    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=train_dataset,
        eval_dataset=val_dataset,
        compute_metrics=compute_metrics
    )

    trainer.train()
    eval_results = trainer.evaluate()

    # Log metrics with MLflow
    with mlflow.start_run():
        mlflow.log_params({"epochs": EPOCHS, "batch_size": BATCH_SIZE, "model_name": MODEL_NAME})
        mlflow.log_metrics(eval_results)
        mlflow.pytorch.log_model(model, "model")

    # Save the model and check for staleness
    joblib.dump(label_encoder, LABEL_ENCODER_PATH)
    if eval_results['eval_accuracy'] < STALENESS_THRESHOLD:
        send_alert_email("accuracy", eval_results['eval_accuracy'])
    return model, eval_results

# Monitoring and Retraining Loop
def monitoring_and_retrain():
    global model, val_texts, val_labels
    while True:
        print("Monitoring model performance...")
        tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
        val_encodings = tokenizer(list(val_texts), truncation=True, padding=True)
        val_dataset = TextClassificationDataset(val_encodings, val_labels.tolist())

        # Perform evaluation
        trainer = Trainer(model=model, eval_dataset=val_dataset, compute_metrics=compute_metrics)
        eval_results = trainer.evaluate()

        # Check staleness and retrain if necessary
        if eval_results['eval_accuracy'] < STALENESS_THRESHOLD:
            print("Performance degraded. Retraining the model...")
            model, eval_results = train_and_log_model(train_texts, val_texts, train_labels, val_labels)

        print("Monitoring completed. Waiting for the next interval...")
        time.sleep(MONITORING_INTERVAL)

# Initial Training
data = pd.read_csv(DATA_PATH)[['text', 'label']]
label_encoder = LabelEncoder()
data['label'] = label_encoder.fit_transform(data['label'])
train_texts, val_texts, train_labels, val_labels = train_test_split(data['text'], data['label'], test_size=0.2)
model, _ = train_and_log_model(train_texts, val_texts, train_labels, val_labels)

# Batch Inference
def batch_inference(texts):
    model.eval()
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    encodings = tokenizer(texts, truncation=True, padding=True, return_tensors="pt")
    outputs = model(**encodings)
    predictions = torch.argmax(outputs.logits, dim=1)
    return label_encoder.inverse_transform(predictions.cpu().numpy())

# Live Inference
@app.route('/predict', methods=['POST'])
def live_inference():
    data = request.json
    text = data['text']
    tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
    encoding = tokenizer(text, return_tensors="pt", truncation=True, padding=True)
    output = model(**encoding)
    prediction = torch.argmax(output.logits, dim=1)
    predicted_label = label_encoder.inverse_transform([prediction.item()])[0]
    return jsonify({'prediction': predicted_label})

# Start monitoring in a background thread
import threading
monitoring_thread = threading.Thread(target=monitoring_and_retrain)
monitoring_thread.daemon = True
monitoring_thread.start()

if __name__ == "__main__":
    app.run(port=5000)
