In [7]:
!pip install transformers pandas scikit-learn flask-ngrok mlflow


Collecting mlflow
  Downloading mlflow-2.17.1-py3-none-any.whl.metadata (29 kB)
Collecting mlflow-skinny==2.17.1 (from mlflow)
  Downloading mlflow_skinny-2.17.1-py3-none-any.whl.metadata (30 kB)
Collecting alembic!=1.10.0,<2 (from mlflow)
  Downloading alembic-1.13.3-py3-none-any.whl.metadata (7.4 kB)
Collecting docker<8,>=4.0.0 (from mlflow)
  Downloading docker-7.1.0-py3-none-any.whl.metadata (3.8 kB)
Collecting graphene<4 (from mlflow)
  Downloading graphene-3.4.1-py2.py3-none-any.whl.metadata (6.9 kB)
Collecting gunicorn<24 (from mlflow)
  Downloading gunicorn-23.0.0-py3-none-any.whl.metadata (4.4 kB)
Collecting databricks-sdk<1,>=0.20.0 (from mlflow-skinny==2.17.1->mlflow)
  Downloading databricks_sdk-0.36.0-py3-none-any.whl.metadata (38 kB)
Collecting Mako (from alembic!=1.10.0,<2->mlflow)
  Downloading Mako-1.3.6-py3-none-any.whl.metadata (2.9 kB)
Collecting graphql-core<3.3,>=3.1 (from graphene<4->mlflow)
  Downloading graphql_core-3.2.5-py3-none-any.whl.metadata (10 kB)
Colle

In [8]:
import pandas as pd
from transformers import AutoTokenizer
from sklearn.model_selection import train_test_split
import mlflow
import mlflow.sklearn

# Set up MLflow for local tracking
mlflow.set_tracking_uri("file:///content/mlruns")  # You can set this to your server if needed
mlflow.set_experiment("Text_Classification_Retraining")


2024/10/30 13:13:53 INFO mlflow.tracking.fluent: Experiment with name 'Text_Classification_Retraining' does not exist. Creating a new experiment.


<Experiment: artifact_location='file:///content/mlruns/304097548153515880', creation_time=1730294033072, experiment_id='304097548153515880', last_update_time=1730294033072, lifecycle_stage='active', name='Text_Classification_Retraining', tags={}>

In [10]:
from google.colab import files

# Step 3.1: Upload the dataset
uploaded = files.upload()

# Step 3.2: Load dataset
for file_name in uploaded.keys():
    data = pd.read_csv(file_name)
    print(f"{file_name} uploaded successfully!")
    print(data.head())  # Display a preview of the dataset

# Step 3.3: Define text and label columns based on your dataset structure
X = data['Text']  # Replace 'Text' if your column name differs
y = data['Sentiment']  # Replace 'Label' if your column name differs


Saving sentimentdataset.csv to sentimentdataset.csv
sentimentdataset.csv uploaded successfully!
   Unnamed: 0.1  Unnamed: 0  \
0             0           0   
1             1           1   
2             2           2   
3             3           3   
4             4           4   

                                                Text    Sentiment  \
0   Enjoying a beautiful day at the park!        ...   Positive     
1   Traffic was terrible this morning.           ...   Negative     
2   Just finished an amazing workout! 💪          ...   Positive     
3   Excited about the upcoming weekend getaway!  ...   Positive     
4   Trying out a new recipe for dinner tonight.  ...   Neutral      

             Timestamp            User     Platform  \
0  2023-01-15 12:30:00   User123          Twitter     
1  2023-01-15 08:45:00   CommuterX        Twitter     
2  2023-01-15 15:45:00   FitnessFan      Instagram    
3  2023-01-15 18:20:00   AdventureX       Facebook    
4  2023-01-15 19:55:00   Ch

In [11]:
from transformers import AutoTokenizer

# Tokenize dataset
def preprocess_data(X, tokenizer_name='distilbert-base-uncased', max_length=128):
    tokenizer = AutoTokenizer.from_pretrained(tokenizer_name)
    encodings = tokenizer(X.tolist(), truncation=True, padding=True, max_length=max_length)
    return encodings


In [12]:
import torch
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score
import smtplib
from email.mime.text import MIMEText

# Train function with MLflow logging
def train_model_with_mlflow(encodings, y, model_name='distilbert-base-uncased', epochs=3):
    model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=len(set(y)))

    # MLflow logging
    with mlflow.start_run():
        # Log model parameters
        mlflow.log_param("model_name", model_name)
        mlflow.log_param("epochs", epochs)

        # Training arguments
        args = TrainingArguments(
            output_dir='./results',
            evaluation_strategy="epoch",
            per_device_train_batch_size=16,
            per_device_eval_batch_size=16,
            num_train_epochs=epochs,
            weight_decay=0.01,
            logging_dir='./logs'
        )

        trainer = Trainer(
            model=model,
            args=args,
            train_dataset=encodings['train'],
            eval_dataset=encodings['eval']
        )

        # Train and evaluate
        trainer.train()
        accuracy = trainer.evaluate()['eval_accuracy']

        # Log accuracy and model artifacts to MLflow
        mlflow.log_metric("accuracy", accuracy)
        mlflow.pytorch.log_model(model, "text_classification_model")

        return model, accuracy

# Monitoring and retraining with MLflow integration
def monitor_and_retrain_with_mlflow(encodings, y, model, accuracy, threshold=0.80):
    last_run = mlflow.search_runs(order_by=["start_time desc"], max_results=1)
    last_accuracy = last_run['metrics.accuracy'].values[0] if not last_run.empty else 1.0

    if accuracy < threshold * last_accuracy:
        print("Retraining triggered due to accuracy drop.")
        send_alert_email(accuracy)
        model, new_accuracy = train_model_with_mlflow(encodings, y)
        return model, new_accuracy
    return model, accuracy


In [13]:
from sklearn.metrics import classification_report

# Function to evaluate model performance and trigger retraining if needed
def evaluate_and_monitor_with_mlflow(model, X_test, y_test, threshold=0.80):
    predictions = model.predict(X_test)
    accuracy = accuracy_score(y_test, predictions)

    print(f"Model accuracy: {accuracy:.2f}")
    model, accuracy = monitor_and_retrain_with_mlflow(encodings, y_test, model, accuracy, threshold)
    return model, accuracy


In [14]:
from transformers import pipeline

# Live inference function
def live_inference(model_name='distilbert-base-uncased'):
    classifier = pipeline("text-classification", model=model_name)
    text = input("Enter text for live inference: ")
    result = classifier(text)
    print(f"Predicted label: {result[0]['label']}")

# Batch inference function
def batch_inference(texts, model_name='distilbert-base-uncased'):
    classifier = pipeline("text-classification", model=model_name)
    results = classifier(texts, batch_size=16)
    return results


In [None]:
# Load dataset and split data
X, y = load_data(file_name)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Preprocess data
encodings = preprocess_data(X_train)
encodings['train'] = encodings
encodings['eval'] = preprocess_data(X_test)

# Initial Training
model, initial_accuracy = train_model_with_mlflow(encodings, y_train)

# Monitor and evaluate
model, accuracy = evaluate_and_monitor_with_mlflow(model, X_test, y_test)


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
[34m[1mwandb[0m: Using wandb-core as the SDK backend. Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

In [16]:
import pandas as pd

# Define load_data function
def load_data(file_name):
    # Load dataset from the uploaded file
    data = pd.read_csv(file_name)

    # Extract text and label columns; update column names if your data has different names
    X = data['Text']  # Replace 'Text' if your text column has a different name
    y = data['Sentiment']  # Replace 'Label' if your label column has a different name

    return X, y


In [None]:
from sklearn.metrics import f1_score

# Function to monitor using F1 score along with accuracy
def monitor_local_model(predictions, true_labels, accuracy_threshold=0.8, f1_threshold=0.7):
    accuracy = accuracy_score(true_labels, predictions)
    f1 = f1_score(true_labels, predictions, average='weighted')

    if accuracy < accuracy_threshold or f1 < f1_threshold:
        print("Model staleness detected. Retraining is recommended.")
        return True
    return False


In [None]:
from transformers import pipeline

# Function to perform batch inference on a list of texts
def batch_inference(texts, model_name='distilbert-base-uncased'):
    classifier = pipeline("text-classification", model=model_name)
    results = classifier(texts, batch_size=16)
    return results

# Example usage
texts = ["I love sunny days!", "I can't stand the traffic in the morning."]
predictions = batch_inference(texts)
print(predictions)
