In [None]:
#!pip install transformers[torch]

!pip install transformers==4.28.1
import transformers
print(transformers.__version__)


In [None]:
!pip install datasets

In [None]:
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import BertTokenizer, BertForSequenceClassification, AdamW
from sklearn.model_selection import train_test_split
from datasets import load_dataset


# Load the IMDb dataset; its 'train' split is what the rest of the notebook samples from.
imdb_dataset = load_dataset("imdb")

In [None]:
### Summary of Sentiment Analysis steps ###

# Data Loading and Preprocessing: Loads the IMDb movie review dataset, creates a custom dataset class, and splits the data into training and validation sets.
# Model Initialization: Initializes a pre-trained BERT model for sequence classification and moves it to the appropriate device (GPU if available, otherwise CPU).
# Training and Evaluation: Defines the training parameters, initializes a Trainer object, trains the model, and saves it to a directory.
# Prediction: Defines functions to prepare input text and predict sentiment. An example is given to predict the sentiment of a movie review.


In [None]:
# Shuffle the training split with a fixed seed (42), then keep the first
# 10,000 records of the shuffled split as the working subset.
train_dataset = imdb_dataset['train'].shuffle(seed=42).select(range(10000))

In [None]:

# Define the dataset class
class CustomDataset(Dataset):
    """Dataset that pairs each text with its label and tokenizes on access.

    Args:
        texts: indexable collection of inputs; each item is passed through ``str``.
        labels: indexable collection of labels, aligned with ``texts``.
        tokenizer: object exposing ``encode_plus``.
        max_length: length every encoded sequence is padded/truncated to.
    """

    def __init__(self, texts, labels, tokenizer, max_length):
        self.texts, self.labels = texts, labels
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of examples, taken from ``texts``."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the ``input_ids``, ``attention_mask`` and ``labels`` tensors for item ``idx``."""
        raw_text = str(self.texts[idx])
        target = self.labels[idx]

        encode_options = {
            'add_special_tokens': True,
            'max_length': self.max_length,
            'return_token_type_ids': False,
            'padding': 'max_length',
            'truncation': True,
            'return_attention_mask': True,
            'return_tensors': 'pt',
        }
        encoded = self.tokenizer.encode_plus(raw_text, **encode_options)

        # flatten() turns the tokenizer's tensors into 1-D tensors for this single example.
        item = {
            field: encoded[field].flatten()
            for field in ('input_ids', 'attention_mask')
        }
        item['labels'] = torch.tensor(target, dtype=torch.long)
        return item



# Run on the GPU when torch can see one, otherwise on the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Review strings plus binary targets (a label of 1 stays 1, anything else becomes 0).
# NOTE: `train_dataset` is rebound to a CustomDataset further down, so the cell that
# builds the 10,000-record subset has to be re-run before this code is run again.
texts = train_dataset['text']
labels = [int(raw_label == 1) for raw_label in train_dataset['label']]

# Hold out 20% of the examples for validation, with a fixed random_state
train_texts, val_texts, train_labels, val_labels = train_test_split(
    texts,
    labels,
    test_size=0.2,
    random_state=42,
)

# Tokenizer matching the pretrained checkpoint
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Wrap both splits; every example is padded/truncated to 128 tokens
train_dataset = CustomDataset(train_texts, train_labels, tokenizer, max_length=128)
val_dataset = CustomDataset(val_texts, val_labels, tokenizer, max_length=128)

# Batches of 16; only the training loader shuffles
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
val_dataloader = DataLoader(val_dataset, batch_size=16, shuffle=False)

# Pretrained BERT with a two-class classification head, placed on `device`
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)
model = model.to(device)

# NOTE(review): this optimizer is not passed to the Trainer created in the training
# cell -- confirm whether it is still needed.
optimizer = AdamW(model.parameters(), lr=5e-5)






# Mean per-batch loss over the validation split.
# No training has happened above this point in the notebook (trainer.train() is
# called in a later cell), so the printed value is a pre-fine-tuning figure.
model.eval()
total_val_loss = 0
num_val_steps = 0
with torch.no_grad():
    for batch in val_dataloader:
        # Move every tensor of the batch onto the model's device
        batch = {name: tensor.to(device) for name, tensor in batch.items()}

        outputs = model(**batch)
        loss = outputs.loss
        total_val_loss += loss.item()
        num_val_steps += 1

avg_val_loss = total_val_loss / num_val_steps
print(f"Validation loss: {avg_val_loss}")


# Write the tokenizer files into the directory the reload cell reads from
tokenizer.save_pretrained('./final_model')





In [None]:



from transformers import Trainer, TrainingArguments

# Trainer configuration
training_args = TrainingArguments(
    # Output locations
    output_dir="./model_output",
    logging_dir='./logs',
    # Schedule
    num_train_epochs=3,
    per_device_train_batch_size=4,
    # Log, evaluate and checkpoint on the same 500-step cadence
    logging_steps=500,
    evaluation_strategy="steps",
    eval_steps=500,
    save_steps=500,
    # Load the best model at the end
    load_best_model_at_end=True,
)

# Trainer over the model that was already moved to `device`
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,  # Optional, for validation
)

# Fine-tune
trainer.train()

# Persist the model weights and config next to the saved tokenizer
model.save_pretrained("./final_model")



In [None]:


# Reload the saved tokenizer and model from disk
model_path = './final_model'
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)

# Switch the model to evaluation mode before predicting
model.eval()




def prepare_text(text):
    """Tokenize ``text`` with the module-level ``tokenizer`` and return its output.

    The tokenizer is asked for PyTorch tensors, truncation, and padding to a
    fixed length of 512.
    """
    tokenizer_options = {
        "max_length": 512,
        "padding": "max_length",
        "truncation": True,
        "return_tensors": "pt",
    }
    return tokenizer(text, **tokenizer_options)


def predict_sentiment(text):
    """Classify ``text`` with the module-level ``model``.

    Args:
        text: review text, forwarded unchanged to ``prepare_text``.

    Returns:
        tuple: ``(label, scores)``. ``scores`` is the softmax output of the
        first row as a list ordered [negative, positive]; ``label`` is
        ``'Negative'`` or ``'Positive'``, whichever has the higher score in
        that same row.
    """
    # Prepare the text
    inputs = prepare_text(text)

    # Move inputs to the same device as model
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    # Prediction
    with torch.no_grad():
        outputs = model(**inputs)
        prediction = torch.nn.functional.softmax(outputs.logits, dim=-1)

    # Take the label from row 0, the same row the returned scores come from.
    # argmax() over the whole tensor indexes the flattened batch, which can point
    # past the two classes as soon as the input holds more than one row.
    first_row = prediction[0]

    # Assuming the classes are [0, 1] where '0' is negative and '1' is positive
    classes = ['Negative', 'Positive']
    predicted_class = classes[int(first_row.argmax())]
    return predicted_class, first_row.tolist()



In [None]:
# Example usage: classify one review and print the label together with the
# two class scores (ordered negative, positive).
review = "A phenomenal film, blending mesmerizing visuals with outstanding character development."
sentiment, scores = predict_sentiment(review)
print(f"Sentiment: {sentiment}, Scores: {scores}")

In [None]:
# Deployment options in AWS

In [None]:
# 1. Sagemaker Endpoint

In [None]:
from sagemaker.huggingface import HuggingFaceModel

# Create a HuggingFaceModel object.
# It is bound to `huggingface_model` rather than `model` so that running this cell
# does not replace the fine-tuned BERT model that predict_sentiment() reads.
# NOTE(review): the notebook installs transformers==4.28.1 while the container is
# pinned to transformers 4.17 / PyTorch 1.9 -- confirm the saved model loads there.
huggingface_model = HuggingFaceModel(
    model_data="s3://<your-bucket>/final_model.tar.gz", # Replace with your model path
    role="<your-iam-role>", # Replace with your IAM role
    transformers_version="4.17",
    pytorch_version="1.9",
    py_version="py38",
)

# Deploy the model to a SageMaker endpoint
predictor = huggingface_model.deploy(
    initial_instance_count=1,
    instance_type="ml.m5.large",
)

# Make predictions
predictions = predictor.predict({"inputs": "This movie was great!"})

In [None]:
# 2. AWS Elastic Beanstalk

In [None]:
from flask import Flask, request, jsonify
from transformers import BertTokenizer, BertForSequenceClassification

# Flask application object; the route defined below is registered on it.
app = Flask(__name__)

# Load the model and tokenizer from ./final_model once, at module level,
# so they are not reloaded inside the request handler.
model_path = "./final_model"
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)

@app.route("/", methods=["POST"])
def predict():
    """Classify the review sent in the JSON request body.

    Expects a body such as ``{"text": "..."}`` and answers with
    ``{"sentiment": "Negative"}`` or ``{"sentiment": "Positive"}``. A body that
    is not a JSON object, or that lacks a non-empty string ``text`` field, gets
    a 400 response carrying an ``error`` message.
    """
    # Local import: this cell itself only imports flask and transformers.
    import torch

    data = request.get_json(silent=True)
    text = data.get("text") if isinstance(data, dict) else None
    if not isinstance(text, str) or not text.strip():
        return jsonify({"error": "Expected a JSON body with a non-empty 'text' string."}), 400

    # Preprocess: tokenize into PyTorch tensors, truncated to at most 512 tokens
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)

    # Predict: forward pass without gradient tracking
    with torch.no_grad():
        logits = model(**inputs).logits

    # Index 0 is negative and index 1 is positive, as in the fine-tuning labels
    sentiment = ["Negative", "Positive"][int(logits[0].argmax())]
    return jsonify({"sentiment": sentiment})

if __name__ == "__main__":
    # Listen on every interface so the server is reachable from outside the host,
    # but keep debug mode off: with debug=True the interactive debugger lets
    # anyone who can reach the server execute arbitrary Python code.
    app.run(debug=False, host="0.0.0.0")

In [None]:
# 3. AWS Lambda

In [None]:
from transformers import BertTokenizer, BertForSequenceClassification

# Load the model and tokenizer from ./final_model at module level, outside
# lambda_handler, so the handler body does not reload them itself.
model_path = "./final_model"
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)

def lambda_handler(event, context):
    """Classify ``event["text"]`` with the module-level ``model`` and ``tokenizer``.

    Args:
        event: mapping that must contain a ``"text"`` entry holding the review.
        context: Lambda context object; not used.

    Returns:
        dict: ``{"sentiment": "Negative"}`` or ``{"sentiment": "Positive"}``.

    Raises:
        KeyError: if ``event`` has no ``"text"`` entry.
    """
    # Local import: this cell itself only imports from transformers.
    import torch

    text = event["text"]

    # Preprocess: tokenize into PyTorch tensors, truncated to at most 512 tokens
    inputs = tokenizer(text, return_tensors="pt", truncation=True, max_length=512)

    # Predict: forward pass without gradient tracking
    with torch.no_grad():
        logits = model(**inputs).logits

    # Index 0 is negative and index 1 is positive, as in the fine-tuning labels
    sentiment = ["Negative", "Positive"][int(logits[0].argmax())]
    return {"sentiment": sentiment}

In [None]:
# Open-source deployment options

In [None]:
# 1. Deploying as a Web Service with Flask or FastAPI


In [None]:
from flask import Flask, request, jsonify
from transformers import pipeline

# Flask application object; the /predict route below is registered on it.
app = Flask(__name__)

# Load sentiment analysis pipeline once, at module level.
# "path/to/your/model" is a placeholder -- presumably the directory holding the
# saved model and tokenizer (e.g. ./final_model); replace before running.
classifier = pipeline("sentiment-analysis", model="path/to/your/model")

@app.route("/predict", methods=["POST"])
def predict():
    """Run the sentiment pipeline on the ``text`` field of the JSON request body.

    Answers with ``{"sentiment": <label>, "score": <score>}`` taken from the
    first pipeline result. A body that is not a JSON object, or whose ``text``
    field is missing or empty, gets a 400 response carrying an ``error``
    message instead of an unhandled exception.
    """
    payload = request.get_json(silent=True)
    text = payload.get("text") if isinstance(payload, dict) else None
    if not text:
        return jsonify({"error": "Expected a JSON body with a non-empty 'text' field."}), 400

    result = classifier(text)[0]
    return jsonify({"sentiment": result["label"], "score": result["score"]})

if __name__ == "__main__":
    # Start Flask's built-in server with debug mode on when this file is run directly.
    # NOTE(review): debug=True looks intended for local development only -- confirm it
    # is switched off before this snippet is used for a real deployment.
    app.run(debug=True)

In [None]:
# 2. Using Docker for Containerization

In [None]:
%%writefile Dockerfile
# The cell magic on the first line writes the rest of this cell to ./Dockerfile;
# without it the cell is executed as Python and fails with a SyntaxError.
FROM python:3.9

WORKDIR /app

# Install dependencies before copying the code, as a separate step.
COPY requirements.txt .
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the build context into /app.
COPY . .

# NOTE(review): `flask run` has to be able to locate the application
# (e.g. an app.py / wsgi.py in /app, or FLASK_APP set) -- confirm the file name.
CMD ["flask", "run", "--host=0.0.0.0"]

In [None]:
# 3. Deploying to Platforms like Heroku or Google Cloud Run

# Create an account on the platform.
# Push your Docker image to the platform's registry.
# Deploy the image as a web service.



In [None]:
# 4. Using Open-Source Model Serving Frameworks like TorchServe or TensorFlow Serving:

# Package your model in the required format for the framework.
# Configure the framework to serve your model.
# Deploy the framework on a server.
