In [1]:
import os

In [2]:
%pwd

'c:\\Users\\assi01\\Desktop\\projects\\AirTravel_Sentiment_Analysis\\research'

In [3]:
# Move from the notebook folder (research/) up to the project root so the relative
# "artifacts/..." paths used further down resolve. The guard keeps the cell
# idempotent: re-running it no longer climbs one directory higher each time.
if os.path.basename(os.getcwd()) == "research":
    os.chdir('../')

In [4]:
%pwd

'c:\\Users\\assi01\\Desktop\\projects\\AirTravel_Sentiment_Analysis'

In [5]:
from dataclasses import dataclass
from pathlib import Path


@dataclass
class ModelTrainingConfig:
    """Typed bundle of settings for the model-training stage.

    The ``*_path`` / ``root_dir`` fields are filesystem locations; the
    ``params_*`` fields hold the training hyper-parameters. The dataclass is
    not frozen, so attributes can be reassigned after construction.
    """

    # --- filesystem locations -------------------------------------------
    root_dir: Path
    base_model_path: Path
    base_tokenizer_path: Path
    model_path: Path
    tokenizer_path: Path
    train_tokenized_data_path: Path
    test_tokenized_data_path: Path
    val_tokenized_data_path: Path

    # --- model / training hyper-parameters ------------------------------
    params_model_name: str
    params_eval_strategy: str
    params_save_strategy: str
    params_learning_rate: float
    params_per_device_train_batch_size: int
    params_per_device_eval_batch_size: int
    params_num_train_epochs: int
    params_weight_decay: float
    params_load_best_model_at_end: bool
    params_metric_for_best_model: str

In [6]:
from airTravelSentimentAnalysis.constants import *
from airTravelSentimentAnalysis.utils.common import read_yaml, create_directories

In [8]:
class ConfigurationManager:
    """Reads the project's config and params YAML files and builds stage configs."""

    def __init__(
        self,
        config_filepath = CONFIG_FILE_PATH,
        params_filepath = PARAMS_FILE_PATH):
        """Load both YAML files and create the artifacts root directory.

        Args:
            config_filepath: Location of the config YAML (defaults to CONFIG_FILE_PATH).
            params_filepath: Location of the params YAML (defaults to PARAMS_FILE_PATH).
        """
        self.config = read_yaml(config_filepath)
        self.params = read_yaml(params_filepath)

        create_directories([self.config.artifacts_root])

    def get_model_training_config(self) -> ModelTrainingConfig:
        """Assemble the ``ModelTrainingConfig`` for the training stage.

        Creates the stage's ``root_dir`` and prints the training arguments.
        The base model/tokenizer locations are read straight from the
        ``prepare_base_model`` section; the loaded ``model_training`` section
        is no longer modified as a side effect of calling this method.

        Returns:
            ModelTrainingConfig: paths wrapped in ``Path`` plus the training
            hyper-parameters taken from ``TRAINING_ARGUMENTS`` and ``MODEL_NAME``.
        """
        config = self.config.model_training
        base_model_config = self.config.prepare_base_model
        params_training = self.params.TRAINING_ARGUMENTS
        print(f"Training arguments: {params_training}")
        create_directories([config.root_dir])

        return ModelTrainingConfig(
            root_dir=Path(config.root_dir),
            base_model_path=Path(base_model_config.base_model_path),
            base_tokenizer_path=Path(base_model_config.base_tokenizer_path),
            model_path=Path(config.model_path),
            tokenizer_path=Path(config.tokenizer_path),
            train_tokenized_data_path=Path(config.train_tokenized_data_path),
            test_tokenized_data_path=Path(config.test_tokenized_data_path),
            val_tokenized_data_path=Path(config.val_tokenized_data_path),
            params_model_name=self.params.MODEL_NAME,
            params_eval_strategy=params_training.eval_strategy,
            params_save_strategy=params_training.save_strategy,
            params_learning_rate=params_training.learning_rate,
            params_per_device_train_batch_size=params_training.per_device_train_batch_size,
            params_per_device_eval_batch_size=params_training.per_device_eval_batch_size,
            params_num_train_epochs=params_training.num_train_epochs,
            params_weight_decay=params_training.weight_decay,
            params_load_best_model_at_end=params_training.load_best_model_at_end,
            params_metric_for_best_model=params_training.metric_for_best_model,
        )

In [None]:
pip install -q peft

In [None]:

pip install torchinfo

In [None]:
pip install -q dagshub mlflow

In [9]:
# Connect this session to the DagsHub repository; with mlflow=True the call also
# initializes MLflow tracking for that repo (see the "Initialized MLflow" log line).
import dagshub
# NOTE(review): the return value is stored in `dad` but is not used by any visible cell.
dad = dagshub.init(
    repo_owner="ashish.student2025",
    repo_name="AirTravel_SentimentAnalysis",
    mlflow=True,
)
import mlflow
# Point the MLflow client explicitly at the repository's tracking endpoint.
# NOTE(review): possibly redundant with dagshub.init(mlflow=True) above — confirm.
mlflow.set_tracking_uri(
    "https://dagshub.com/ashish.student2025/AirTravel_SentimentAnalysis.mlflow"
)

[2025-05-24 10:42:17,619: INFO: _client: HTTP Request: GET https://dagshub.com/api/v1/user "HTTP/1.1 200 OK"]


[2025-05-24 10:42:17,623: INFO: helpers: Accessing as ashish.student2025]
[2025-05-24 10:42:18,204: INFO: _client: HTTP Request: GET https://dagshub.com/api/v1/repos/ashish.student2025/AirTravel_SentimentAnalysis "HTTP/1.1 200 OK"]
[2025-05-24 10:42:18,712: INFO: _client: HTTP Request: GET https://dagshub.com/api/v1/user "HTTP/1.1 200 OK"]


[2025-05-24 10:42:18,715: INFO: helpers: Initialized MLflow to track repo "ashish.student2025/AirTravel_SentimentAnalysis"]


[2025-05-24 10:42:18,716: INFO: helpers: Repository ashish.student2025/AirTravel_SentimentAnalysis initialized!]


In [10]:
# Export the tracking endpoint for code that reads MLFLOW_TRACKING_URI from the
# environment. URL path separators are always "/" (never the OS path separator),
# so this matches the URI passed to mlflow.set_tracking_uri().
os.environ["MLFLOW_TRACKING_URI"] = "https://dagshub.com/ashish.student2025/AirTravel_SentimentAnalysis.mlflow"

In [None]:
# Scratch check: split the tracking URI on the platform's path separator.
# NOTE(review): os.getenv returns None when the variable is unset, in which case
# .split raises AttributeError; URLs are "/"-separated regardless of os.sep.
os.getenv("MLFLOW_TRACKING_URI").split(os.sep)

In [None]:
pip install --upgrade "transformers>=4.37.0"

In [None]:
import transformers

# Show which transformers release the kernel actually imported.
print(transformers.__version__)

In [11]:
from peft import LoraConfig, TaskType
from torchinfo import summary
from peft import get_peft_model

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# tokenized_train_dataset = load_from_disk("artifacts/text_processing/train")
# tokenized_train_dataset = tokenized_train_dataset.rename_column("intent", "labels")

In [None]:
# tokenized_train_dataset=tokenized_train_dataset.select(range(20))

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
import numpy as np

def compute_metrics(eval_pred):
    """Compute classification metrics for a Trainer evaluation pass.

    Args:
        eval_pred: A pair ``(logits, labels)``. The predicted class of each
            example is the argmax of ``logits`` over the last axis.

    Returns:
        dict: ``accuracy`` plus weighted-average ``precision``, ``recall`` and
        ``f1``, keyed by exactly those names.
    """
    print("*******************************************************************compute_metrics called +++++++++++++++++++++++++++++++++++++++++++++++")
    logits, labels = eval_pred
    predictions = np.argmax(logits, axis=-1)  # Predicted class is the index of max logit
    return {
        "accuracy": accuracy_score(labels, predictions),
        "precision": precision_score(labels, predictions, average="weighted"),
        "recall": recall_score(labels, predictions, average="weighted"),
        # The score used to be stored in `eval_f1` while the dict referenced an
        # undefined `f1`, so every evaluation raised NameError.
        "f1": f1_score(labels, predictions, average="weighted"),
    }

In [None]:
from transformers import TrainingArguments
from transformers import Trainer
import numpy as np
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from datasets import load_from_disk
from torchinfo import summary

# Baseline fine-tuning run (no LoRA adapters) on the first 20 examples of each split.
training_args = TrainingArguments(
    output_dir="artifacts/model_training/model.h5",
    eval_strategy="epoch",
    save_strategy="epoch",
    learning_rate=2e-5,  # Learning rate was 2e-5
    per_device_train_batch_size=8,  # Batch size for training
    per_device_eval_batch_size=8,  # Batch size for evaluation
    num_train_epochs=4,  # Number of training epochs
    weight_decay=0.01,  # Weight decay for regularization
    load_best_model_at_end=True,  # Load best model after training
    metric_for_best_model="f1",
)

model = AutoModelForSequenceClassification.from_pretrained(
    "artifacts/prepare_base_model/base_model.h5"
)
print("*******************without peft*******************")
print(summary(model))
print("*******************Model*******************")
print(model)
tokenizer = AutoTokenizer.from_pretrained("artifacts/model_training/tokenizer.h5")

# Every split stores its target in the "intent" column; expose it as "labels".
tokenized_train_dataset = load_from_disk("artifacts/text_processing/train")
tokenized_train_dataset = tokenized_train_dataset.rename_column("intent", "labels")
tokenized_test_dataset = load_from_disk("artifacts/text_processing/test")
# The test split is the one handed to the Trainer as eval_dataset, so it needs the
# same rename as train/val; it was previously the only split left with "intent".
tokenized_test_dataset = tokenized_test_dataset.rename_column("intent", "labels")
tokenized_val_dataset = load_from_disk("artifacts/text_processing/val")
tokenized_val_dataset = tokenized_val_dataset.rename_column("intent", "labels")

# Keep only the first 20 rows of each split so the run finishes quickly.
tokenized_train_dataset = tokenized_train_dataset.select(range(20))
tokenized_test_dataset = tokenized_test_dataset.select(range(20))
tokenized_val_dataset = tokenized_val_dataset.select(range(20))

# NOTE(review): the validation split is loaded but never passed to the Trainer;
# per-epoch evaluation (and best-model selection) runs on the test split.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_dataset,
    eval_dataset=tokenized_test_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)
trainer.train()

In [None]:
from transformers import TrainingArguments
from transformers import Trainer
from transformers import EarlyStoppingCallback
from transformers import pipeline
import numpy as np
from transformers import AutoTokenizer
from transformers import AutoModelForSequenceClassification
from datasets import load_from_disk
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

class ModelTraining:
    """Fine-tunes the base sequence-classification model with LoRA adapters.

    All paths and hyper-parameters are read from the ``ModelTrainingConfig``
    passed to the constructor.
    """

    def __init__(self, config: ModelTrainingConfig):
        """Keep the training configuration for later use by :meth:`train`."""
        self.config = config

    def compute_metrics(self, eval_pred):
        """Compute classification metrics for a Trainer evaluation pass.

        Args:
            eval_pred: A pair ``(logits, labels)``. The predicted class of each
                example is the argmax of ``logits`` over the last axis.

        Returns:
            dict: ``accuracy`` plus weighted-average ``precision``, ``recall``
            and ``f1``, keyed by exactly those names.
        """
        print("*******************************************************************compute_metrics called +++++++++++++++++++++++++++++++++++++++++++++++")
        logits, labels = eval_pred
        predictions = np.argmax(logits, axis=-1)  # Predicted class is the index of max logit
        return {
            "accuracy": accuracy_score(labels, predictions),
            "precision": precision_score(labels, predictions, average="weighted"),
            "recall": recall_score(labels, predictions, average="weighted"),
            # The score used to be stored in `eval_f1` while the dict referenced
            # an undefined `f1`, so every evaluation raised NameError.
            "f1": f1_score(labels, predictions, average="weighted"),
        }

    def _load_split(self, path, max_samples):
        """Load one tokenized split from disk with its "intent" column renamed to "labels"."""
        dataset = load_from_disk(path).rename_column("intent", "labels")
        if max_samples is not None:
            # Clamp so a split with fewer rows than requested does not raise.
            dataset = dataset.select(range(min(max_samples, len(dataset))))
        return dataset

    def train(self, max_samples=20):
        """Run LoRA fine-tuning with the Hugging Face ``Trainer``.

        Prints the config and model summaries (before and after the adapters
        are attached), then trains on the train split and evaluates on the
        test split. The validation split is not used by this method.

        Args:
            max_samples: Upper bound on the rows taken from the start of each
                split. Defaults to 20, the subset size this notebook has
                always used; pass ``None`` to use the full splits.
        """
        print(self.config)
        training_args = TrainingArguments(
            # Passed as a plain string; the config stores it as a Path.
            output_dir=str(self.config.model_path),
            eval_strategy=self.config.params_eval_strategy,
            save_strategy=self.config.params_save_strategy,
            learning_rate=float(self.config.params_learning_rate),
            per_device_train_batch_size=self.config.params_per_device_train_batch_size,
            per_device_eval_batch_size=self.config.params_per_device_eval_batch_size,
            num_train_epochs=self.config.params_num_train_epochs,
            weight_decay=self.config.params_weight_decay,
            logging_strategy="epoch",
            load_best_model_at_end=self.config.params_load_best_model_at_end,
            metric_for_best_model=self.config.params_metric_for_best_model,
            # Stated explicitly so evaluation still receives the targets when the
            # model is wrapped by PEFT. NOTE(review): believed to be what the
            # Trainer infers anyway for this model class — confirm.
            label_names=["labels"],
        )

        lora_config = LoraConfig(
            task_type=TaskType.SEQ_CLS,
            r=1,
            lora_alpha=1,
            lora_dropout=0.1,
            target_modules=["q_lin", "v_lin"]
        )
        model = AutoModelForSequenceClassification.from_pretrained(self.config.base_model_path)
        print("*******************without peft*******************")
        print(summary(model))
        print("*******************Model*******************")
        print(model)
        print("*******************with peft*******************")
        peft_model = get_peft_model(model, lora_config)
        print(summary(peft_model))
        tokenizer = AutoTokenizer.from_pretrained(self.config.base_tokenizer_path)

        tokenized_train_dataset = self._load_split(
            self.config.train_tokenized_data_path, max_samples
        )
        tokenized_test_dataset = self._load_split(
            self.config.test_tokenized_data_path, max_samples
        )

        trainer = Trainer(
            # Train the PEFT-wrapped model that was built above; the Trainer was
            # previously handed the bare `model` and `peft_model` went unused.
            model=peft_model,
            args=training_args,
            train_dataset=tokenized_train_dataset,
            eval_dataset=tokenized_test_dataset,
            tokenizer=tokenizer,
            compute_metrics=self.compute_metrics,
        )
        trainer.train()
        # TODO: restore experiment tracking. The commented-out draft that lived here
        # ran train/evaluate inside an MLflow run under the experiment
        # "Air Travel Sentiment Analysis", saved the model and tokenizer to
        # config.model_path / config.tokenizer_path, and logged a text-classification
        # pipeline with mlflow.transformers.log_model under the artifact path
        # "fine_tuned" before checking it against the validation split.

In [None]:
from transformers import AutoModelForSequenceClassification

# Load a sequence-classification model from the directory the baseline run used as
# its Trainer output_dir.
model = AutoModelForSequenceClassification.from_pretrained("artifacts/model_training/model.h5")

In [None]:
# Load the tokenizer stored next to the trained model.
# NOTE(review): AutoTokenizer is imported in an earlier cell, not in this one.
tokenizer = AutoTokenizer.from_pretrained("artifacts/model_training/tokenizer.h5")

In [None]:
# Wrap the reloaded model and tokenizer in a CPU text-classification pipeline
# that processes inputs in batches of 8.
tuned_pipeline = pipeline(
    "text-classification",
    model=model,
    tokenizer=tokenizer,
    device="cpu",
    batch_size=8,
)

In [None]:
# Fetch the "fine_tuned" transformers artifact logged by one specific MLflow run
# (the run id is hard-coded in the URI).
loaded = mlflow.transformers.load_model(model_uri="runs:/d6cd4da81c884535a043ab0947b394da/fine_tuned")

In [None]:
# Smoke-test the pipeline loaded from MLflow on a single hand-written query.
validation_text = "I'd like information about flight ticket prices from Chicago to Madrid"
loaded(validation_text)

In [None]:
# Scratch check: display the path separator of the current platform.
os.sep

In [None]:
# Scratch check (repeats an earlier cell): split the tracking URI on os.sep.
# NOTE(review): raises AttributeError if MLFLOW_TRACKING_URI is unset, because
# os.getenv then returns None.
os.getenv("MLFLOW_TRACKING_URI").split(os.sep)

In [None]:
pip uninstall -y transformers

In [None]:
pip install transformers==4.40.2

In [None]:
pip show transformers

In [None]:
pip install torch torchvision

In [None]:
# Drive the stage end to end: build the training config from the YAML files, then
# hand it to ModelTraining and start the run. Any failure is re-raised unchanged.
try:
    config = ConfigurationManager()
    model_training_config = config.get_model_training_config()
    print("Model training config: ", model_training_config)
    model_training = ModelTraining(config=model_training_config)
    model_training.train()
except Exception as exc:
    raise exc

In [None]:
import transformers

# Report the imported transformers release and the file it was loaded from, to
# check which installation the kernel picked up after the pip cells above.
print("Transformers version:", transformers.__version__)
print("Transformers location:", transformers.__file__)