In [3]:
!pip install streamlit -q
!pip install pytorch-lightning
!pip install optuna
!pip install transformers
!pip install arff
!pip install torch
!pip install scikit-learn
!pip install matplotlib
!pip install seaborn
!pip install tqdm
!pip install pyngrok

Collecting arff
  Downloading arff-0.9.tar.gz (4.7 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: arff
  Building wheel for arff (setup.py) ... [?25ldone
[?25h  Created wheel for arff: filename=arff-0.9-py3-none-any.whl size=4950 sha256=4bf127bed71ca954c40f212b3d78e255a3d5fbb58de3ad1da32b666ca84068a5
  Stored in directory: /root/.cache/pip/wheels/0c/39/12/4d8ff491018bce2dd9cc6034298f27399c7f4fd4063187f7be
Successfully built arff
Installing collected packages: arff
Successfully installed arff-0.9
Collecting pyngrok
  Downloading pyngrok-7.2.0-py3-none-any.whl.metadata (7.4 kB)
Downloading pyngrok-7.2.0-py3-none-any.whl (22 kB)
Installing collected packages: pyngrok
Successfully installed pyngrok-7.2.0


In [6]:
import os
import json
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer, BertModel, AdamW
import optuna
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Function to parse sparse ARFF format
def parse_arff(filename):
    with open(filename, 'r') as file:
        data_started = False
        data = []
        attributes = []

        for line in file:
            line = line.strip()
            if line.lower().startswith('@attribute'):
                attributes.append(line.split()[1])
            elif line.lower() == '@data':
                data_started = True
            elif data_started:
                if line:
                    row_data = np.zeros(len(attributes))
                    row_entries = line.strip('{}').split(',')
                    for entry in row_entries:
                        index, value = entry.split()
                        row_data[int(index)] = float(value)
                    data.append(row_data)
        return pd.DataFrame(data, columns=attributes)

# Convert feedback JSON to DataFrame
def feedback_to_dataframe(feedback_file, label_cols):
    with open(feedback_file, 'r') as f:
        feedback_data = json.load(f)

    feedback_texts = [entry["text"] for entry in feedback_data]
    feedback_labels = [entry["labels"] for entry in feedback_data]
    
    df_feedback = pd.DataFrame(feedback_labels, columns=label_cols)
    df_feedback['combined_text'] = feedback_texts

    return df_feedback

# Merge feedback with original dataset
def merge_feedback_with_dataset(original_file, feedback_file, output_file, label_cols):
    df_original = pd.read_csv(original_file)
    df_feedback = feedback_to_dataframe(feedback_file, label_cols)
    df_combined = pd.concat([df_original, df_feedback], ignore_index=True)
    df_combined.to_csv(output_file, index=False)
    print(f"Merged dataset saved to {output_file}")

# Load and parse the ARFF file
file_path = '/kaggle/input/news-papers/20ng-desc.arff'
df_original = parse_arff(file_path)

# Identify feature columns and label columns
feature_cols = df_original.columns[:-20]  # Assuming the last 20 columns are labels
label_cols = df_original.columns[-20:]

# Combine features into a single text column
df_original['combined_text'] = df_original[feature_cols].astype(str).agg(' '.join, axis=1)

# Save original dataset to CSV
original_csv_path = 'original_data.csv'
df_original.to_csv(original_csv_path, index=False)
print(f"Original data saved to {original_csv_path}")

# Check if feedback data exists and merge it with the original dataset
feedback_file = 'feedback.json'
if os.path.exists(feedback_file):
    merge_feedback_with_dataset(original_csv_path, feedback_file, 'combined_data.csv', label_cols)
    df_data = pd.read_csv('combined_data.csv')
else:
    df_data = df_original

# Split the data
df_train, df_test = train_test_split(df_data, test_size=0.3, random_state=42)
df_test, df_valid = train_test_split(df_test, test_size=0.5, random_state=42)

# Hyperparameters
MAX_LEN = 256
EPOCHS = 2  # Use a small number for quick testing

# Tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Dataset class
class CustomDataset(Dataset):
    def __init__(self, df, tokenizer, max_len, target_list):
        self.tokenizer = tokenizer
        self.df = df
        self.texts = list(df['combined_text'])
        self.targets = self.df[target_list].values
        self.max_len = max_len

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = str(self.texts[index])
        text = " ".join(text.split())
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            padding='max_length',
            return_token_type_ids=True,
            truncation=True,
            return_attention_mask=True,
            return_tensors='pt'
        )
        return {
            'input_ids': inputs['input_ids'].flatten(),
            'attention_mask': inputs['attention_mask'].flatten(),
            'token_type_ids': inputs["token_type_ids"].flatten(),
            'targets': torch.FloatTensor(self.targets[index]),
            'text': text
        }

# Create datasets
train_dataset = CustomDataset(df_train, tokenizer, MAX_LEN, label_cols)
valid_dataset = CustomDataset(df_valid, tokenizer, MAX_LEN, label_cols)
test_dataset = CustomDataset(df_test, tokenizer, MAX_LEN, label_cols)

class BERTClassLightning(pl.LightningModule):
    def __init__(self, num_labels, learning_rate, dropout_rate):
        super(BERTClassLightning, self).__init__()
        self.bert_model = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(dropout_rate)
        self.linear = nn.Linear(768, num_labels)
        self.learning_rate = learning_rate
        self.train_losses = []
        self.val_losses = []

    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.bert_model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = outputs[1]
        output_dropout = self.dropout(pooled_output)
        output = self.linear(output_dropout)
        return output

    def training_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        token_type_ids = batch['token_type_ids']
        targets = batch['targets']
        outputs = self(input_ids, attention_mask, token_type_ids)
        loss = nn.BCEWithLogitsLoss()(outputs, targets)
        self.train_losses.append(loss)
        self.log('train_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        input_ids = batch['input_ids']
        attention_mask = batch['attention_mask']
        token_type_ids = batch['token_type_ids']
        targets = batch['targets']
        outputs = self(input_ids, attention_mask, token_type_ids)
        loss = nn.BCEWithLogitsLoss()(outputs, targets)
        self.val_losses.append(loss)
        self.log('val_loss', loss, on_step=True, on_epoch=True, prog_bar=True, logger=True)
        return loss

    def on_train_epoch_end(self):
        avg_loss = torch.stack(self.train_losses).mean()
        self.log('train_loss_epoch', avg_loss, prog_bar=True, logger=True)
        self.train_losses.clear()  # Clear the list for the next epoch

    def on_validation_epoch_end(self):
        avg_loss = torch.stack(self.val_losses).mean()
        self.log('val_loss_epoch', avg_loss, prog_bar=True, logger=True)
        self.val_losses.clear()  # Clear the list for the next epoch

    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=self.learning_rate)
        return optimizer

def objective(trial):
    # Define hyperparameters to optimize, suggested by Optuna
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-3, log=True)
    dropout_rate = trial.suggest_float('dropout_rate', 0.1, 0.5)
    batch_size = trial.suggest_categorical('batch_size', [16, 32, 64])

    print(f"\n{'='*50}")
    print(f"Starting trial {trial.number}")
    print(f"Hyperparameters: LR={learning_rate}, Dropout={dropout_rate}, Batch Size={batch_size}")
    print(f"{'='*50}\n")

    model = BERTClassLightning(num_labels=len(label_cols), learning_rate=learning_rate, dropout_rate=dropout_rate)

    trainer = pl.Trainer(
        max_epochs=EPOCHS,
        accelerator='gpu' if torch.cuda.is_available() else 'cpu',
        devices=1,
        callbacks=[ModelCheckpoint(monitor="val_loss")],
        enable_progress_bar=False  # Disable progress bar for cleaner output
    )

    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
    val_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, num_workers=2)

    trainer.fit(model, train_loader, val_loader)

    best_val_loss = trainer.callback_metrics.get("val_loss_epoch", float('inf'))

    print(f"\nTrial {trial.number} completed.")
    print(f"Best validation loss: {best_val_loss:.4f}")
    print(f"{'='*50}\n")

    return best_val_loss

def train_and_save_model():
    study = optuna.create_study(direction="minimize")
    study.optimize(objective, n_trials=1)  # Use a small number for quick testing

    best_params = study.best_params
    best_model = BERTClassLightning(num_labels=len(label_cols), 
                                    learning_rate=best_params['learning_rate'], 
                                    dropout_rate=best_params['dropout_rate'])

    trainer = pl.Trainer(
        max_epochs=EPOCHS,
        accelerator='gpu' if torch.cuda.is_available() else 'cpu',
        devices=1,
        callbacks=[ModelCheckpoint(dirpath='checkpoints', filename='best_model', save_top_k=1, monitor='val_loss')],
    )

    train_loader = DataLoader(train_dataset, batch_size=best_params['batch_size'], shuffle=True, num_workers=2)
    val_loader = DataLoader(valid_dataset, batch_size=best_params['batch_size'], shuffle=False, num_workers=2)

    trainer.fit(best_model, train_loader, val_loader)

    # Save the entire model
    os.makedirs('model', exist_ok=True)
    torch.save(best_model, 'model/full_model.pt')
    torch.save(best_model.state_dict(), 'model/model_state_dict.pt')
    best_model.bert_model.config.save_pretrained('model')
    tokenizer.save_pretrained('model')

    return trainer, best_model

# Train and save the model
trainer, best_model = train_and_save_model()
print("Model training and saving completed.")

Original data saved to original_data.csv


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

[I 2024-09-09 11:45:23,570] A new study created in memory with name: no-name-e3ba0bdc-c8e7-440b-8f3a-82f0eb1d4a2f



Starting trial 0
Hyperparameters: LR=1.2347895816423608e-05, Dropout=0.10768605914322729, Batch Size=32



model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

  self.pid = os.fork()
/opt/conda/lib/python3.10/site-packages/pytorch_lightning/utilities/data.py:78: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 32. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.
/opt/conda/lib/python3.10/site-packages/pytorch_lightning/utilities/data.py:78: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 6. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.
  self.pid = os.fork()
/opt/conda/lib/python3.10/site-packages/pytorch_lightning/utilities/data.py:78: Trying to infer the `batch_size` from an ambiguous collection. The batch size we found is 15. To avoid any miscalculations, use `self.log(..., batch_size=batch_size)`.
[I 2024-09-09 11:58:29,075] Trial 0 finished with value: 0.20135243237018585 and parameters: {'learning_rate': 1.2347895816423608e-05, 'dropout_rate': 0.10768605914322729, 'batch_size': 32}. Best is trial 0 with


Trial 0 completed.
Best validation loss: 0.2014





Sanity Checking: |          | 0/? [00:00<?, ?it/s]

Training: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Validation: |          | 0/? [00:00<?, ?it/s]

Model training and saving completed.


In [7]:
label_cols_list = label_cols.tolist()
model_path = '/kaggle/working/model/'

with open("app.py", "w") as f:
    f.write(f"""
import streamlit as st
import torch
from transformers import BertTokenizer
import pandas as pd
import numpy as np
import torch.nn as nn
from transformers import BertModel
import json
from datetime import datetime

# Device configuration
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define the BERTClassLightning class
class BERTClassLightning(nn.Module):
    def __init__(self, num_labels, learning_rate, dropout_rate):
        super(BERTClassLightning, self).__init__()
        self.bert_model = BertModel.from_pretrained('bert-base-uncased')
        self.dropout = nn.Dropout(dropout_rate)
        self.linear = nn.Linear(768, num_labels)
        self.learning_rate = learning_rate

    def forward(self, input_ids, attention_mask, token_type_ids):
        outputs = self.bert_model(input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids)
        pooled_output = outputs[1]
        output_dropout = self.dropout(pooled_output)
        output = self.linear(output_dropout)
        return output

# Load the trained model
model = torch.load(f"{model_path}full_model.pt", map_location=device)
model.eval()
model.to(device)

tokenizer = BertTokenizer.from_pretrained("{model_path}")
label_cols = {label_cols_list}  # Now correctly defined as a list

MAX_LEN = {MAX_LEN}

@st.cache_resource
def predict_category(raw_text, threshold):
    st.write(f"Raw input text: {{raw_text}}")  # Debug: Log the raw input text
    encoded_text = tokenizer.encode_plus(
        raw_text,
        max_length=MAX_LEN,
        add_special_tokens=True,
        return_token_type_ids=True,
        pad_to_max_length=True,
        return_attention_mask=True,
        return_tensors='pt',
    )

    input_ids = encoded_text['input_ids'].to(device)
    attention_mask = encoded_text['attention_mask'].to(device)
    token_type_ids = encoded_text['token_type_ids'].to(device)

    with torch.no_grad():
        output = model(input_ids, attention_mask, token_type_ids)

    probabilities = torch.sigmoid(output).cpu().numpy()[0]
    predicted_labels = (probabilities > threshold).astype(int)
    
    st.write(f"Probabilities: {{probabilities}}")  # Debug: Log the probabilities
    st.write(f"Predicted labels (binary): {{predicted_labels}}")  # Debug: Log the predicted labels

    predictions = {{label: prob for label, prob, pred in zip(label_cols, probabilities, predicted_labels) if pred == 1}}
    st.write(f"Predictions: {{predictions}}")  # Debug: Log the final predictions

    return predictions

def save_feedback(text_input, feedback_labels, additional_feedback):
    feedback = {{
        "text": text_input,
        "labels": feedback_labels,
        "additional_feedback": additional_feedback,
        "timestamp": datetime.now().isoformat()
    }}
    
    try:
        with open('feedback.json', 'r+') as f:
            data = json.load(f)
            data.append(feedback)
            f.seek(0)
            json.dump(data, f, indent=4)
    except FileNotFoundError:
        with open('feedback.json', 'w') as f:
            json.dump([feedback], f, indent=4)
    
    st.write("Feedback saved successfully!")

def main():
    st.title('Multi-Label Text Classification with BERT')
    st.write("Enter text and see the model's predictions. Provide feedback to improve the model.")

    text_input = st.text_area("Enter Text:")
    
    threshold = st.slider('Prediction Threshold', 0.0, 1.0, 0.5)  # Slider for the threshold with default value 0.5
    st.write(f"Current Threshold: {{threshold}}")

    if st.button('Predict'):
        predictions = predict_category(text_input, threshold)
        st.write("Predictions:")
        for category, probability in predictions.items():
            st.write(f"{{category}}: {{probability:.4f}}")
    else:
        predictions = {{}}  # Initialize predictions as an empty dict if no prediction has been made

    # User feedback
    st.write("Provide feedback:")
    feedback_labels = []
    for label in label_cols:
        is_correct = st.checkbox(f"Is '{{label}}' correct?", value=label in predictions)
        feedback_labels.append(1 if is_correct else 0)

    additional_feedback = st.text_area("Enter additional feedback (optional):")

    if st.button('Submit Feedback'):
        save_feedback(text_input, feedback_labels, additional_feedback)
        st.write("Feedback received and saved. Thank you!")

if __name__ == '__main__':
    main()
""")

print("Streamlit app script written to app.py")

Streamlit app script written to app.py


In [None]:
from pyngrok import ngrok
import subprocess
import time

if __name__ == '__main__':
    # Get your ngrok auth token from https://dashboard.ngrok.com/get-started/your-authtoken
    ngrok.set_auth_token("2lm7kPOQQmDWCtmCCYZbF98Xstn_2S3cjTS1wfi4M6N7LJdjj")

    # Define the port
    port = 8503

    # Start ngrok tunnel
    ngrok_tunnel = ngrok.connect(addr=port, proto="http", bind_tls=True)
    print(f"Ngrok tunnel created: {ngrok_tunnel.public_url}")

    # Start Streamlit in the background
    streamlit_process = subprocess.Popen(["streamlit", "run", "app.py", "--server.port", str(port)])

    print(f"Streamlit app is live at: {ngrok_tunnel.public_url}")

    # Keep the app running
    try:
        while True:
            time.sleep(1)
    except KeyboardInterrupt:
        print("Shutting down...")
        ngrok.disconnect(ngrok_tunnel.public_url)
        streamlit_process.terminate()

Ngrok tunnel created: https://6582-35-196-3-183.ngrok-free.app
Streamlit app is live at: https://6582-35-196-3-183.ngrok-free.app

Collecting usage statistics. To deactivate, set browser.gatherUsageStats to false.


  You can now view your Streamlit app in your browser.

  Local URL: http://localhost:8503
  Network URL: http://172.19.2.2:8503
  External URL: http://35.196.3.183:8503



  model = torch.load(f"/kaggle/working/model/full_model.pt", map_location=device)
  model = torch.load(f"/kaggle/working/model/full_model.pt", map_location=device)
  model = torch.load(f"/kaggle/working/model/full_model.pt", map_location=device)
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
  model = torch.load(f"/kaggle/working/model/full_model.pt", map_location=device)
  model = torch.load(f"/kaggle/working/model/full_model.pt", map_location=device)
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If 