# TASK 1

In [29]:
import os
import shutil
import random
from google.colab import drive

In [30]:
# Mount Google Drive at /content/drive so the dataset folders under MyDrive
# can be reached through ordinary filesystem paths.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [31]:
# Dataset layout on Drive: the two raw class folders plus the train/val
# folders that will receive the split.
base_dir = '/content/drive/MyDrive/chicken_duck_dataset'
chicken_dir, duck_dir, train_dir, val_dir = (
    os.path.join(base_dir, subfolder)
    for subfolder in ('chicken', 'duck', 'train', 'val')
)

In [32]:
# Verify chicken and duck folders exist
if not os.path.exists(chicken_dir) or not os.path.exists(duck_dir):
    print("Error: 'chicken' or 'duck' folder not found in /MyDrive/chicken_duck_dataset")
    print("Current contents of /MyDrive:")
    !ls /content/drive/MyDrive
    print("Current contents of /MyDrive/chicken_duck_dataset (if it exists):")
    !ls /content/drive/MyDrive/chicken_duck_dataset
    print("Check Drive and move/re-upload 'chicken' and 'duck' folders to /MyDrive/chicken_duck_dataset")
else:
    # Clear existing train and val folders to start fresh
    for folder in [os.path.join(train_dir, 'chicken'), os.path.join(train_dir, 'duck'),
                   os.path.join(val_dir, 'chicken'), os.path.join(val_dir, 'duck')]:
        if os.path.exists(folder):
            for file in os.listdir(folder):
                os.remove(os.path.join(folder, file))
        os.makedirs(folder, exist_ok=True)

In [33]:
# Function to split images
def split_images(source_dir, train_dest, val_dest, train_size=80, val_size=20,
                 seed=42, move=False):
    """Randomly split the images of one class into train and validation folders.

    Args:
        source_dir: Folder holding the raw images of a single class.
        train_dest: Folder that receives ``train_size`` images.
        val_dest: Folder that receives ``val_size`` images.
        train_size: Number of images placed in ``train_dest``.
        val_size: Number of images placed in ``val_dest``.
        seed: Seed for the shuffle so the split is reproducible. Pass ``None``
            to shuffle with the global ``random`` state instead.
        move: When ``False`` (default) images are copied and ``source_dir`` is
            left intact, so the split can be regenerated after the train/val
            folders have been cleared. Pass ``True`` to move the files instead.

    Returns:
        ``True`` if the split was written, ``False`` (after printing a warning)
        if ``source_dir`` holds fewer than ``train_size + val_size`` images.
    """
    image_exts = ('.jpg', '.jpeg', '.png')
    # Match extensions case-insensitively (e.g. "IMG_1.JPG") and skip non-files.
    # Sort because os.listdir order is arbitrary; a seeded shuffle is only
    # reproducible when it starts from a stable order.
    images = sorted(
        f for f in os.listdir(source_dir)
        if f.lower().endswith(image_exts) and os.path.isfile(os.path.join(source_dir, f))
    )
    needed = train_size + val_size
    if len(images) < needed:
        print(f"Warning: Only {len(images)} images in {source_dir}, need {needed}")
        return False

    rng = random if seed is None else random.Random(seed)
    rng.shuffle(images)  # Randomize for split

    transfer = shutil.move if move else shutil.copy2
    plan = (
        (train_dest, images[:train_size]),
        (val_dest, images[train_size:needed]),
    )
    for dest, chunk in plan:
        os.makedirs(dest, exist_ok=True)  # do not rely on an earlier cell having created it
        for img in chunk:
            transfer(os.path.join(source_dir, img), os.path.join(dest, img))
    return True

In [34]:
# Run the split once per class (chicken first, then duck) and keep each result flag
success_chicken, success_duck = (
    split_images(raw_dir, os.path.join(train_dir, class_name), os.path.join(val_dir, class_name))
    for raw_dir, class_name in ((chicken_dir, 'chicken'), (duck_dir, 'duck'))
)

In [35]:
# Report how many files ended up in each train/val class folder
if not (success_chicken and success_duck):
    print("Splitting failed. Check image counts and try again.")
else:
    for heading, split_root in (("Training images:", train_dir), ("Validation images:", val_dir)):
        print(heading)
        for label, class_name in (("Chicken", 'chicken'), ("Duck", 'duck')):
            file_count = len(os.listdir(os.path.join(split_root, class_name)))
            print(f"{label}: {file_count}")

Training images:
Chicken: 80
Duck: 80
Validation images:
Chicken: 20
Duck: 20


In [36]:
# Install required libraries
!pip install torch torchvision scikit-learn

# Import libraries
import os
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision import datasets, models, transforms
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import numpy as np
from google.colab import drive

# Make sure Google Drive is mounted for this part of the notebook
drive.mount('/content/drive')

# Pick the compute device: CUDA when the runtime has a GPU, CPU otherwise
device_name = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
print(f"Using device: {device}")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
Using device: cuda


In [37]:
# Define base directory
base_dir = '/content/drive/MyDrive/chicken_duck_dataset'

# ImageNet channel mean/std, applied identically to both splits
imagenet_mean = [0.485, 0.456, 0.406]
imagenet_std = [0.229, 0.224, 0.225]

# Data augmentation and normalization
data_transforms = {
    'train': transforms.Compose([
        transforms.RandomResizedCrop(224),  # Random crop, resized to 224x224 (augmentation)
        transforms.RandomHorizontalFlip(),  # Prevent overfitting
        transforms.ToTensor(),
        transforms.Normalize(imagenet_mean, imagenet_std)
    ]),
    'val': transforms.Compose([
        # Deterministic preprocessing for evaluation: resize, then central 224x224 crop
        transforms.Resize(256),
        transforms.CenterCrop(224),
        transforms.ToTensor(),
        transforms.Normalize(imagenet_mean, imagenet_std)
    ]),
}

# Load datasets (class labels come from the sub-folder names of each split)
image_datasets = {
    x: datasets.ImageFolder(os.path.join(base_dir, x), data_transforms[x])
    for x in ['train', 'val']
}
# Only the training loader is shuffled; shuffling validation data gives no
# benefit and only makes evaluation order vary between runs.
dataloaders = {
    x: DataLoader(image_datasets[x], batch_size=16, shuffle=(x == 'train'), num_workers=2)
    for x in ['train', 'val']
}
dataset_sizes = {x: len(image_datasets[x]) for x in ['train', 'val']}
class_names = image_datasets['train'].classes

# Verify dataset
print(f"Classes: {class_names}")
print(f"Dataset sizes: {dataset_sizes}")
for heading, split in (("Training images:", 'train'), ("Validation images:", 'val')):
    print(heading)
    for label, class_name in (("Chicken", 'chicken'), ("Duck", 'duck')):
        print(f"{label}: {len(os.listdir(os.path.join(base_dir, split, class_name)))}")

Classes: ['chicken', 'duck']
Dataset sizes: {'train': 160, 'val': 40}
Training images:
Chicken: 80
Duck: 80
Validation images:
Chicken: 20
Duck: 20


#########

In [41]:
# Load pre-trained ResNet18.
# `pretrained=True` is deprecated in torchvision; the explicit weights enum
# below selects the same ImageNet checkpoint without the deprecation warning.
model = models.resnet18(weights=models.ResNet18_Weights.IMAGENET1K_V1)

# Freeze all layers
for param in model.parameters():
    param.requires_grad = False

# Unfreeze layer3 and layer4 (the new fc head is made trainable below)
for block in (model.layer3, model.layer4):
    for param in block.parameters():
        param.requires_grad = True

# Replace final layer with one output per class
num_ftrs = model.fc.in_features
model.fc = nn.Linear(num_ftrs, len(class_names))  # Chicken, duck
for param in model.fc.parameters():
    param.requires_grad = True

# Move model to the selected device
model = model.to(device)

# Define loss and optimizer: the unfrozen backbone blocks use a 10x smaller
# learning rate than the freshly initialised fc head.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(
    [
        {'params': model.layer3.parameters(), 'lr': 0.0001},
        {'params': model.layer4.parameters(), 'lr': 0.0001},
        {'params': model.fc.parameters(), 'lr': 0.001}
    ],
    momentum=0.9
)

# Verify model setup
print("Model moved to:", device)
print("Final layer:", model.fc)
print("Trainable parameters:")
for name, param in model.named_parameters():
    if param.requires_grad:
        print(f"  {name}")

Model moved to: cuda
Final layer: Linear(in_features=512, out_features=2, bias=True)
Trainable parameters:
  layer3.0.conv1.weight
  layer3.0.bn1.weight
  layer3.0.bn1.bias
  layer3.0.conv2.weight
  layer3.0.bn2.weight
  layer3.0.bn2.bias
  layer3.0.downsample.0.weight
  layer3.0.downsample.1.weight
  layer3.0.downsample.1.bias
  layer3.1.conv1.weight
  layer3.1.bn1.weight
  layer3.1.bn1.bias
  layer3.1.conv2.weight
  layer3.1.bn2.weight
  layer3.1.bn2.bias
  layer4.0.conv1.weight
  layer4.0.bn1.weight
  layer4.0.bn1.bias
  layer4.0.conv2.weight
  layer4.0.bn2.weight
  layer4.0.bn2.bias
  layer4.0.downsample.0.weight
  layer4.0.downsample.1.weight
  layer4.0.downsample.1.bias
  layer4.1.conv1.weight
  layer4.1.bn1.weight
  layer4.1.bn1.bias
  layer4.1.conv2.weight
  layer4.1.bn2.weight
  layer4.1.bn2.bias
  fc.weight
  fc.bias


In [42]:
# Training function
def train_model(model, criterion, optimizer, num_epochs=5):
    """Fine-tune ``model`` and return it with the best-validation weights loaded.

    Each epoch runs a training pass followed by a validation pass. Whenever the
    validation accuracy improves, the weights are snapshotted in memory and
    written to ``best_model.pth`` under ``base_dir``. After the last epoch the
    best snapshot is loaded back, so the returned model matches the reported
    "Best val Acc" rather than whatever the final epoch produced.

    Relies on the notebook-level ``dataloaders``, ``dataset_sizes``, ``device``
    and ``base_dir``.

    Args:
        model: Network to train (modified in place).
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepped once per training batch.
        num_epochs: Number of train+val epochs to run.

    Returns:
        The same ``model`` object, holding the best-validation weights.
    """
    import copy  # local import: only needed for the in-memory weight snapshot

    best_acc = 0.0
    best_weights = copy.deepcopy(model.state_dict())
    checkpoint_path = os.path.join(base_dir, 'best_model.pth')

    for epoch in range(num_epochs):
        print(f'Epoch {epoch}/{num_epochs - 1}')
        for phase in ['train', 'val']:
            is_train = phase == 'train'
            model.train(is_train)  # train(False) is equivalent to eval()
            running_loss = 0.0
            running_corrects = 0
            for inputs, labels in dataloaders[phase]:
                inputs, labels = inputs.to(device), labels.to(device)
                if is_train:
                    optimizer.zero_grad()
                # Track gradients only while training
                with torch.set_grad_enabled(is_train):
                    outputs = model(inputs)
                    _, preds = torch.max(outputs, 1)
                    loss = criterion(outputs, labels)
                    if is_train:
                        loss.backward()
                        optimizer.step()
                # loss is a batch mean, so weight it by the batch size
                running_loss += loss.item() * inputs.size(0)
                running_corrects += (preds == labels).sum().item()
            epoch_loss = running_loss / dataset_sizes[phase]
            epoch_acc = running_corrects / dataset_sizes[phase]  # plain Python float
            print(f'{phase} Loss: {epoch_loss:.4f} Acc: {epoch_acc:.4f}')
            if phase == 'val' and epoch_acc > best_acc:
                best_acc = epoch_acc
                best_weights = copy.deepcopy(model.state_dict())
                torch.save(best_weights, checkpoint_path)
        print()
    print(f'Best val Acc: {best_acc:.4f}')
    # Restore the best epoch so the caller does not silently get later, worse weights
    model.load_state_dict(best_weights)
    return model

# Train the model for 5 epochs; train_model returns the model it was given,
# so `model` is rebound to the trained network.
model = train_model(model, criterion, optimizer, num_epochs=5)

Epoch 0/4
train Loss: 0.7021 Acc: 0.5312
val Loss: 0.4082 Acc: 0.8500

Epoch 1/4
train Loss: 0.3816 Acc: 0.8563
val Loss: 0.2035 Acc: 0.9500

Epoch 2/4
train Loss: 0.1881 Acc: 0.9688
val Loss: 0.1264 Acc: 1.0000

Epoch 3/4
train Loss: 0.1399 Acc: 0.9688
val Loss: 0.1015 Acc: 0.9500

Epoch 4/4
train Loss: 0.1126 Acc: 0.9750
val Loss: 0.0867 Acc: 0.9500

Best val Acc: 1.0000


# TASK 2

In [43]:
!pip install transformers datasets scikit-learn torch



In [44]:
from google.colab import drive
from sklearn.model_selection import train_test_split
from transformers import BertTokenizer
from datasets import Dataset

In [45]:
# Define dataset directory: Drive folder holding the sentiment CSV files;
# it is also the parent of the model output and logging folders created later.
dataset_dir = '/content/drive/MyDrive/sentiment_analysis'

In [46]:
# Verify dataset files: list the contents of dataset_dir through the shell
# ({dataset_dir} is interpolated by IPython) to confirm the CSVs are present.
print("Dataset files:")
!ls {dataset_dir}


Dataset files:
test.csv  train.csv


In [48]:
# Read both CSV splits from dataset_dir; the files are decoded as latin-1
import pandas as pd
train_df, test_df = (
    pd.read_csv(os.path.join(dataset_dir, csv_name), encoding='latin-1')
    for csv_name in ('train.csv', 'test.csv')
)

In [50]:
# Check dataset structure
print("\nTrain dataset info:")
# info() writes its report to stdout itself and returns None, so wrapping it
# in print() only appended a stray "None" line to the output.
train_df.info()
print("\nTrain dataset head:")
print(train_df.head())
print("\nSentiment distribution in train:")
print(train_df['sentiment'].value_counts())



Train dataset info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27481 entries, 0 to 27480
Data columns (total 10 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   textID            27481 non-null  object 
 1   text              27480 non-null  object 
 2   selected_text     27480 non-null  object 
 3   sentiment         27481 non-null  object 
 4   Time of Tweet     27481 non-null  object 
 5   Age of User       27481 non-null  object 
 6   Country           27481 non-null  object 
 7   Population -2020  27481 non-null  int64  
 8   Land Area (Km²)   27481 non-null  float64
 9   Density (P/Km²)   27481 non-null  int64  
dtypes: float64(1), int64(2), object(7)
memory usage: 2.1+ MB
None

Train dataset head:
       textID                                               text  \
0  cb774db0d1                I`d have responded, if I were going   
1  549e992a42      Sooo SAD I will miss you here in San Diego!!!   
2  088c60f138 

In [51]:
# Handle missing or invalid text
def _has_usable_text(value):
    """Return True when ``value`` is a string containing a non-whitespace character."""
    return isinstance(value, str) and len(value.strip()) > 0


def _drop_unusable_rows(df):
    """Return a copy of ``df`` without rows lacking text/sentiment or with blank text.

    The result is an explicit copy so that columns can be added to it later
    without pandas treating it as a view of the unfiltered frame.
    """
    df = df.dropna(subset=['text', 'sentiment'])
    keep = df['text'].apply(_has_usable_text).astype(bool)
    return df[keep].copy()


train_df = _drop_unusable_rows(train_df)
test_df = _drop_unusable_rows(test_df)


In [53]:
# Preview the first rows of the training frame after cleaning
train_df.head()

Unnamed: 0,textID,text,selected_text,sentiment,Time of Tweet,Age of User,Country,Population -2020,Land Area (Km²),Density (P/Km²)
0,cb774db0d1,"I`d have responded, if I were going","I`d have responded, if I were going",neutral,morning,0-20,Afghanistan,38928346,652860.0,60
1,549e992a42,Sooo SAD I will miss you here in San Diego!!!,Sooo SAD,negative,noon,21-30,Albania,2877797,27400.0,105
2,088c60f138,my boss is bullying me...,bullying me,negative,night,31-45,Algeria,43851044,2381740.0,18
3,9642c003ef,what interview! leave me alone,leave me alone,negative,morning,46-60,Andorra,77265,470.0,164
4,358bd9e861,"Sons of ****, why couldn`t they put them on t...","Sons of ****,",negative,noon,60-70,Angola,32866272,1246700.0,26


In [54]:
# Map sentiments to integers
sentiment_map = {'positive': 0, 'neutral': 1, 'negative': 2}


def _encode_sentiment(df):
    """Return ``df`` with an integer ``label`` column derived from ``sentiment``.

    Raises:
        ValueError: if a sentiment value is not a key of ``sentiment_map``.
            Such rows would otherwise get NaN labels and turn the whole
            column into floats.
    """
    labels = df['sentiment'].map(sentiment_map)
    unknown = df.loc[labels.isna(), 'sentiment'].unique()
    if len(unknown) > 0:
        raise ValueError(f"Unmapped sentiment values: {list(unknown)}")
    # assign() builds a new frame instead of writing into a filtered one
    return df.assign(label=labels.astype(int))


train_df = _encode_sentiment(train_df)
test_df = _encode_sentiment(test_df)

In [55]:
# Show the sentiment-to-integer mapping, then how many training rows carry each label
print("\nLabel mapping:", sentiment_map, sep="\n")
train_label_counts = train_df['label'].value_counts()
print("\nTrain labels distribution:", train_label_counts, sep="\n")



Label mapping:
{'positive': 0, 'neutral': 1, 'negative': 2}

Train labels distribution:
label
1    11117
0     8582
2     7781
Name: count, dtype: int64


In [56]:
# Split train into train and validation (80-20).
# stratify on the label column so both parts are split according to the label
# values; random_state fixes the shuffle so the split is repeatable.
train_data, val_data = train_test_split(train_df, test_size=0.2, stratify=train_df['label'], random_state=42)


In [57]:
# Convert to Hugging Face Dataset.
# Only the text and label columns are kept. preserve_index=False stops the
# (shuffled, gappy) pandas index from being carried along as an extra
# `__index_level_0__` column.
model_columns = ['text', 'label']
train_dataset, val_dataset, test_dataset = (
    Dataset.from_pandas(frame[model_columns], preserve_index=False)
    for frame in (train_data, val_data, test_df)
)

In [58]:
# Load BERT tokenizer for the 'bert-base-uncased' checkpoint, the same
# checkpoint name the classification model is loaded from later.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

In [59]:
# Tokenize function
def tokenize_function(examples):
    """Tokenize the ``text`` field of ``examples`` with the notebook-level tokenizer.

    Every sequence is truncated and padded to a fixed length of 128.
    """
    encoding_options = {
        'padding': 'max_length',
        'truncation': True,
        'max_length': 128,
    }
    return tokenizer(examples['text'], **encoding_options)


In [60]:
# Apply the tokenizer to every split in batched mode (train, then val, then test)
train_dataset, val_dataset, test_dataset = (
    split.map(tokenize_function, batched=True)
    for split in (train_dataset, val_dataset, test_dataset)
)

Map:   0%|          | 0/21984 [00:00<?, ? examples/s]

Map:   0%|          | 0/5496 [00:00<?, ? examples/s]

Map:   0%|          | 0/3534 [00:00<?, ? examples/s]

In [61]:
# Have each split return torch tensors, exposing only the listed columns
tensor_columns = ['input_ids', 'attention_mask', 'label']
for split in (train_dataset, val_dataset, test_dataset):
    split.set_format('torch', columns=tensor_columns)

In [62]:
# Print the number of examples in each split
print("\nDataset sizes:")
for split_name, split in (("Train", train_dataset),
                          ("Validation", val_dataset),
                          ("Test", test_dataset)):
    print(f"{split_name}: {len(split)}")


Dataset sizes:
Train: 21984
Validation: 5496
Test: 3534


In [64]:
!pip install --upgrade transformers



In [65]:
from transformers import BertForSequenceClassification, Trainer, TrainingArguments
import torch
import os
from sklearn.metrics import classification_report
import numpy as np

# Define device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load pre-trained BERT model with a 3-way classification head.
# NOTE: this rebinds `model`, which previously referred to the ResNet18
# image classifier from Task 1.
model = BertForSequenceClassification.from_pretrained(
    'bert-base-uncased',
    num_labels=3  # Positive, neutral, negative
)
model = model.to(device)

# Define training arguments
training_args = TrainingArguments(
    output_dir=os.path.join(dataset_dir, 'bert_model'),
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    # Evaluate and checkpoint once per epoch so the best epoch can be restored
    eval_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    logging_dir=os.path.join(dataset_dir, 'logs'),
    logging_steps=100,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Mixed precision needs a GPU; enabling it unconditionally makes this cell
    # fail on a CPU-only runtime even though `device` falls back to CPU.
    fp16=torch.cuda.is_available()
)

# Metric callback handed to the Trainer for evaluation
def compute_metrics(eval_pred):
    """Turn a ``(logits, labels)`` pair into accuracy and macro-averaged scores.

    The predicted class is the arg-max over the last axis of the logits.
    Precision, recall and F1 are read from the 'macro avg' entry of
    scikit-learn's classification report.
    """
    logits, labels = eval_pred
    predicted_classes = np.argmax(logits, axis=-1)
    report = classification_report(labels, predicted_classes, output_dict=True)
    macro_avg = report['macro avg']
    metrics = {'accuracy': report['accuracy']}
    metrics['precision'] = macro_avg['precision']
    metrics['recall'] = macro_avg['recall']
    metrics['f1'] = macro_avg['f1-score']
    return metrics

# Initialize Trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics
)

# Train the model
print("Starting training...")
trainer.train()

# Save the best model
trainer.save_model(os.path.join(dataset_dir, 'bert_model'))
tokenizer.save_pretrained(os.path.join(dataset_dir, 'bert_model'))

# Verify saved model
print("\nSaved model files:")
!ls {os.path.join(dataset_dir, 'bert_model')}

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Starting training...


[34m[1mwandb[0m: Using wandb-core as the SDK backend.  Please refer to https://wandb.me/wandb-core for more information.


<IPython.core.display.Javascript object>

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
wandb: Paste an API key from your profile and hit enter:

 ··········


[34m[1mwandb[0m: No netrc file found, creating one.
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mchandranath157200000[0m ([33mchandranath157200000-chennai-mathematical-institute[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.5064,0.500472,0.796761,0.799273,0.801262,0.800132
2,0.4078,0.546207,0.803311,0.804951,0.807994,0.806357
3,0.2969,0.628494,0.796761,0.799845,0.800141,0.799943



Saved model files:
checkpoint-1374  config.json		  tokenizer_config.json
checkpoint-2748  model.safetensors	  training_args.bin
checkpoint-4122  special_tokens_map.json  vocab.txt


In [67]:
# Print the module tree of the fine-tuned BERT classifier
print(model)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSdpaSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e