## Step 1: Verify GPU Availability

## 1. Verify GPU Availability

In [None]:
import torch
import os
import sys

print("=" * 50)
print("GPU Status")
print("=" * 50)
print(f"GPU Available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    print(f"Device Name: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    print("✅ GPU is ready!")
else:
    print("\n⚠️  WARNING: No GPU detected!")
    print("Please go to: Runtime → Change runtime type → Select GPU")
    print("Then restart this notebook.")
    sys.exit(1)

## 2. Clone Repository and Install Dependencies

In [None]:
print("\n" + "=" * 50)
print("Cloning Repository")
print("=" * 50)

if not os.path.exists('/content/image-captioning-app'):
    !git clone https://github.com/AlexSkogum/image-captioning-app.git /content/image-captioning-app
    print("✅ Repository cloned")
else:
    print("✅ Repository already exists")

os.chdir('/content/image-captioning-app')
print(f"Working directory: {os.getcwd()}")

In [None]:
print("\n" + "=" * 50)
print("Installing Dependencies")
print("=" * 50)

!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install -q fastapi uvicorn gradio pillow pandas numpy requests nltk pyyaml kaggle

print("✅ All dependencies installed")

## 3. Setup Kaggle Credentials

In [None]:
print("\n" + "=" * 50)
print("Kaggle Credentials Setup")
print("=" * 50)

from google.colab import files
import json

kaggle_dir = os.path.expanduser('~/.kaggle')
os.makedirs(kaggle_dir, exist_ok=True)

print("\nChoose one option:")
print("[A] Upload kaggle.json from your computer")
print("[B] Enter credentials manually")

# Option A: Upload file (uncomment to use)
# print("\nUploading kaggle.json...")
# uploaded = files.upload()
# for filename in uploaded.keys():
#     if filename == 'kaggle.json':
#         !mv {filename} {kaggle_dir}/
#         !chmod 600 {kaggle_dir}/kaggle.json
#         print("✅ kaggle.json uploaded and configured")

# Option B: Manual entry (replace with your credentials)
kaggle_config = {
    "username": "YOUR_KAGGLE_USERNAME",
    "key": "YOUR_KAGGLE_API_KEY"
}

config_path = os.path.join(kaggle_dir, 'kaggle.json')
with open(config_path, 'w') as f:
    json.dump(kaggle_config, f)

!chmod 600 {config_path}
print(f"\n⚠️  IMPORTANT: Replace YOUR_KAGGLE_USERNAME and YOUR_KAGGLE_API_KEY above!")

## 4. Download Flickr8k Dataset

In [None]:
print("\n" + "=" * 50)
print("Downloading Flickr8k Dataset")
print("=" * 50)
print("\nThis may take 5-10 minutes...\n")

!mkdir -p data
!kaggle datasets download -d shadabhussain/flickr8k -p data/ --unzip

if os.path.exists('data/Images') and len(os.listdir('data/Images')) > 0:
    print(f"\n✅ Dataset downloaded successfully")
    print(f"Total images: {len(os.listdir('data/Images'))}")
else:
    print("\n⚠️  Download may have failed. Check Kaggle credentials above.")

## 5. Prepare Dataset and Build Vocabulary

In [None]:
print("\n" + "=" * 50)
print("Preparing Dataset")
print("=" * 50)

!python scripts/prepare_flickr8k.py

import pandas as pd
if os.path.exists('data/captions.csv'):
    df = pd.read_csv('data/captions.csv')
    print(f"\n✅ Dataset prepared successfully")
    print(f"Dataset shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    print(f"\nFirst 3 samples:")
    print(df.head(3).to_string())
else:
    print("⚠️  captions.csv not found.")

In [None]:
print("\n" + "=" * 50)
print("Building Vocabulary")
print("=" * 50)

!python scripts/build_vocab.py

import pickle
if os.path.exists('data/vocab.pkl'):
    with open('data/vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)
    print(f"\n✅ Vocabulary built successfully")
    print(f"Vocabulary size: {len(vocab)}")
else:
    print("⚠️  vocab.pkl not found.")

## 6. Train the Model

This will train the model on GPU. Estimated time: 10-30 minutes.

In [None]:
print("\n" + "=" * 50)
print("Training Model on GPU")
print("=" * 50)
print(f"\nGPU: {torch.cuda.get_device_name(0)}")
print(f"CUDA Version: {torch.version.cuda}")
print("\nTraining will start below...\n")

!python -m src.train --config configs/config.yaml

if os.path.exists('checkpoints/best.pth'):
    checkpoint_size = os.path.getsize('checkpoints/best.pth') / 1e6
    print(f"\n✅ Training complete!")
    print(f"Checkpoint saved: checkpoints/best.pth ({checkpoint_size:.2f} MB)")
else:
    print("\n⚠️  Checkpoint not found. Check training output above for errors.")

## 7. Download the Trained Checkpoint

In [None]:
print("\n" + "=" * 50)
print("Downloading Checkpoint")
print("=" * 50)

from google.colab import files

checkpoint_path = 'checkpoints/best.pth'

if os.path.exists(checkpoint_path):
    print(f"\nDownloading {checkpoint_path}...")
    print("\n⏳ The file will appear in your Downloads folder.\n")
    files.download(checkpoint_path)
    print(f"\n✅ Download started!")
else:
    print(f"⚠️  Checkpoint not found at {checkpoint_path}")
    print("Make sure training completed successfully above.")

## 8. Next Steps: Use the Model Locally

### Step 1: Download Checkpoint
The trained checkpoint should be downloading now.

### Step 2: Place in Local Repository
```bash
# Put best.pth in your local repo:
# <your-repo>/checkpoints/best.pth
```

### Step 3: Run API and Gradio UI Locally
```bash
# Terminal 1: Start API
python -m uvicorn src.api.main:app --reload --port 8000

# Terminal 2: Start Gradio UI
python web/gradio_app.py
```

### Step 4: Open in Browser
**http://localhost:7860**

## 1. Verify GPU Availability

In [None]:
import torch
import os
import sys

print("=" * 50)
print("GPU Status")
print("=" * 50)
print(f"GPU Available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    print(f"Device Name: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
    print("GPU is ready!")
else:
    print("\nWARNING: No GPU detected!")
    print("Please go to: Runtime → Change runtime type → Select GPU")
    print("Then restart this notebook.")
    sys.exit(1)

## 2. Clone Repository and Install Dependencies

In [None]:
print("\n" + "=" * 50)
print("Cloning Repository")
print("=" * 50)

# Clone if not already cloned
if not os.path.exists('/content/image-captioning-app'):
    !git clone https://github.com/AlexSkogum/image-captioning-app.git /content/image-captioning-app
    print("Repository cloned")
else:
    print("Repository already exists")

os.chdir('/content/image-captioning-app')
print(f"Working directory: {os.getcwd()}")

In [None]:
print("\n" + "=" * 50)
print("Installing Dependencies")
print("=" * 50)

!pip install -q torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118
!pip install -q fastapi uvicorn gradio pillow pandas numpy requests nltk pyyaml kaggle

print("All dependencies installed")

## 3. Setup Kaggle Credentials

In [None]:
print("\n" + "=" * 50)
print("Kaggle Credentials Setup")
print("=" * 50)

from google.colab import files
import json

# Create .kaggle directory
kaggle_dir = os.path.expanduser('~/.kaggle')
os.makedirs(kaggle_dir, exist_ok=True)

print("\nChoose one option:")
print("[A] Upload kaggle.json from your computer")
print("[B] Enter credentials manually")
print("\nTo use Option A: Uncomment the code below and run")
print("To use Option B: Replace YOUR_USERNAME and YOUR_KEY below")

# Option A: Upload file (uncomment to use)
# print("\nUploading kaggle.json...")
# uploaded = files.upload()
# for filename in uploaded.keys():
#     if filename == 'kaggle.json':
#         !mv {filename} {kaggle_dir}/
#         !chmod 600 {kaggle_dir}/kaggle.json
#         print("kaggle.json uploaded and configured")

# Option B: Manual entry (uncomment and fill in your credentials)
kaggle_config = {
    "username": "YOUR_KAGGLE_USERNAME",
    "key": "YOUR_KAGGLE_API_KEY"
}

config_path = os.path.join(kaggle_dir, 'kaggle.json')
with open(config_path, 'w') as f:
    json.dump(kaggle_config, f)

!chmod 600 {config_path}
print(f"\nIMPORTANT: Replace credentials in the cell above!")
print(f"Location: ~/.kaggle/kaggle.json")

## 4. Download Flickr8k Dataset

In [None]:
print("\n" + "=" * 50)
print("Downloading Flickr8k Dataset")
print("=" * 50)
print("\nThis may take 5-10 minutes...\n")

!mkdir -p data
!kaggle datasets download -d shadabhussain/flickr8k -p data/ --unzip 2>&1 | grep -E '(Downloading|100%|Unzipping)' || true

# Check if download was successful
if os.path.exists('data/Images') and len(os.listdir('data/Images')) > 0:
    print(f"Dataset downloaded successfully")
    print(f"Total images: {len(os.listdir('data/Images'))}")
else:
    print("Download may have failed. Check Kaggle credentials above.")

## 5. Prepare Dataset and Build Vocabulary

In [None]:
print("\n" + "=" * 50)
print("Preparing Dataset")
print("=" * 50)

# Run dataset preparation script
!python scripts/prepare_flickr8k.py

# Verify captions.csv was created
import pandas as pd
if os.path.exists('data/captions.csv'):
    df = pd.read_csv('data/captions.csv')
    print(f"\nDataset prepared successfully")
    print(f"Dataset shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    print(f"\nFirst 3 samples:")
    print(df.head(3).to_string())
else:
    print("captions.csv not found. Check prepare_flickr8k.py script.")

In [None]:
print("\n" + "=" * 50)
print("Building Vocabulary")
print("=" * 50)

!python scripts/build_vocab.py

# Check vocab file
import pickle
if os.path.exists('data/vocab.pkl'):
    with open('data/vocab.pkl', 'rb') as f:
        vocab = pickle.load(f)
    print(f"\n✅ Vocabulary built successfully")
    print(f"Vocabulary size: {len(vocab)}")
else:
    print("vocab.pkl not found.")

## 6. Train the Model

This will train the model on GPU. Estimated time: 10-30 minutes depending on epochs and batch size.

In [None]:
print("\n" + "=" * 50)
print("Training Model on GPU")
print("=" * 50)
print(f"\nGPU: {torch.cuda.get_device_name(0)}")
print(f"CUDA Version: {torch.version.cuda}")
print("\nTraining will start below...\n")

# Run training
!python -m src.train --config configs/config.yaml 2>&1

# Verify checkpoint was saved
if os.path.exists('checkpoints/best.pth'):
    checkpoint_size = os.path.getsize('checkpoints/best.pth') / 1e6
    print(f"\nTraining complete!")
    print(f"Checkpoint saved: checkpoints/best.pth ({checkpoint_size:.2f} MB)")
else:
    print("\nCheckpoint not found. Check training output above for errors.")

## 7. Download the Trained Checkpoint

In [None]:
print("\n" + "=" * 50)
print("Downloading Checkpoint")
print("=" * 50)

from google.colab import files

checkpoint_path = 'checkpoints/best.pth'

if os.path.exists(checkpoint_path):
    print(f"\nDownloading {checkpoint_path}...")
    print("\n⏳ The file will appear in your Downloads folder in a few seconds.\n")
    files.download(checkpoint_path)
    print(f"\nDownload started!")
else:
    print(f"Checkpoint not found at {checkpoint_path}")
    print("\nMake sure training completed successfully above.")

## 8. Next Steps: Use the Model Locally

### Step 1: Download Checkpoint
The trained checkpoint should be downloading to your computer right now.

### Step 2: Place in Local Repository
```bash
# After downloading, place best.pth in your local repo:
# <your-repo>/checkpoints/best.pth
```

### Step 3: Run API and Gradio UI Locally
```bash
# Terminal 1: Start API
python -m uvicorn src.api.main:app --reload --port 8000

# Terminal 2: Start Gradio UI
python web/gradio_app.py
```

### Step 4: Open in Browser
**http://localhost:7860**

---

## Troubleshooting
,
,
3
,

In [1]:
import torch

print("GPU Available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU Device:", torch.cuda.get_device_name(0))
    print("GPU Memory:", torch.cuda.get_device_properties(0).total_memory / 1e9, "GB")
else:
    print(" No GPU detected!")
    print("Go to Runtime → Change runtime type → GPU")

GPU Available: False
 No GPU detected!
Go to Runtime → Change runtime type → GPU


## Step 2: Clone Repository and Install Dependencies

In [2]:
!git clone https://github.com/AlexSkogum/image-captioning-app.git
%cd image-captioning-app
!pwd

c:\Users\alexa\image_captioning_app\image-captioning-app


fatal: destination path 'image-captioning-app' already exists and is not an empty directory.
'pwd' is not recognized as an internal or external command,
operable program or batch file.
'pwd' is not recognized as an internal or external command,
operable program or batch file.


In [3]:
!pip install -q torch torchvision fastapi gradio pillow pandas numpy requests nltk pyyaml
print("Dependencies installed")

Dependencies installed


## Step 3: Configure Kaggle API and Download Dataset

In [4]:
import json
import os
from pathlib import Path

os.makedirs('/root/.kaggle', exist_ok=True)

# Option A: Upload kaggle.json from your computer
# from google.colab import files
# files.upload()  # Select kaggle.json
# !mv kaggle.json /root/.kaggle/
# !chmod 600 /root/.kaggle/kaggle.json

# Option B: Create kaggle.json with your credentials
# Replace the values below with your Kaggle credentials
kaggle_config = {
    "username": "YOUR_KAGGLE_USERNAME",
    "key": "YOUR_KAGGLE_API_KEY"
}

with open('/root/.kaggle/kaggle.json', 'w') as f:
    json.dump(kaggle_config, f)

!chmod 600 /root/.kaggle/kaggle.json
print("Kaggle configured (replace credentials above with your actual keys)")

Kaggle configured (replace credentials above with your actual keys)


'chmod' is not recognized as an internal or external command,
operable program or batch file.


In [6]:
# Download and extract Flickr8k dataset
!mkdir -p data
!kaggle datasets download -d shadabhussain/flickr8k -p data/ --unzip
print("Flickr8k dataset downloaded")

^C
Flickr8k dataset downloaded
Flickr8k dataset downloaded



  0%|          | 0.00/2.13G [00:00<?, ?B/s]
  1%|          | 12.0M/2.13G [00:00<00:19, 114MB/s]
  1%|          | 24.0M/2.13G [00:00<00:18, 119MB/s]
  2%|▏         | 36.0M/2.13G [00:00<00:19, 116MB/s]
  2%|▏         | 48.0M/2.13G [00:00<00:19, 116MB/s]
  3%|▎         | 60.0M/2.13G [00:00<00:18, 119MB/s]
  3%|▎         | 72.0M/2.13G [00:00<00:18, 120MB/s]
  4%|▍         | 84.0M/2.13G [00:00<00:20, 107MB/s]
  4%|▍         | 96.0M/2.13G [00:00<00:19, 110MB/s]
  5%|▍         | 107M/2.13G [00:01<00:21, 102MB/s] 
  5%|▌         | 117M/2.13G [00:01<00:21, 101MB/s]
  6%|▌         | 128M/2.13G [00:01<00:22, 97.5MB/s]
  6%|▋         | 139M/2.13G [00:01<00:21, 101MB/s] 
  7%|▋         | 152M/2.13G [00:01<00:19, 110MB/s]
  8%|▊         | 164M/2.13G [00:01<00:19, 111MB/s]
  8%|▊         | 175M/2.13G [00:01<00:20, 104MB/s]
  9%|▊         | 188M/2.13G [00:01<00:20, 99.6MB/s]
  9%|▉         | 201M/2.13G [00:01<00:19, 108MB/s] 
 10%|▉         | 212M/2.13G [00:02<00:26, 78.6MB/s]
 10%|█         | 225M/2

Dataset URL: https://www.kaggle.com/datasets/shadabhussain/flickr8k
License(s): unknown
Downloading flickr8k.zip to data



## Step 4: Prepare Dataset and Build Vocabulary

In [None]:
# Prepare the dataset (creates captions.csv)
!python scripts/prepare_flickr8k.py
print("Dataset prepared")

In [None]:
# Build vocabulary from captions
!python scripts/build_vocab.py
print("Vocabulary built")

In [None]:
# Verify dataset preparation
import pandas as pd

if os.path.exists('data/captions.csv'):
    df = pd.read_csv('data/captions.csv')
    print(f"Dataset shape: {df.shape}")
    print(f"Columns: {list(df.columns)}")
    print("\nFirst 3 rows:")
    print(df.head(3))
else:
    print("  captions.csv not found")

## Step 5: Train the Model

In [7]:
# Train the model on GPU
!python -m src.train --config configs/config.yaml
print("Training complete!")

Training complete!


Traceback (most recent call last):
  File "<frozen runpy>", line 198, in _run_module_as_main
  File "<frozen runpy>", line 88, in _run_code
  File "c:\Users\alexa\image_captioning_app\image-captioning-app\src\train.py", line 85, in <module>
    main()
  File "c:\Users\alexa\image_captioning_app\image-captioning-app\src\train.py", line 60, in main
    vocab = Vocabulary.load('data/vocab.pkl')
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\alexa\image_captioning_app\image-captioning-app\src\data.py", line 67, in load
    with open(path, 'rb') as f:
         ^^^^^^^^^^^^^^^^
FileNotFoundError: [Errno 2] No such file or directory: 'data/vocab.pkl'


## Step 6: Download the Trained Checkpoint

In [None]:
from google.colab import files
import os

checkpoint_path = 'checkpoints/best.pth'
if os.path.exists(checkpoint_path):
    files.download(checkpoint_path)
    print(f" Downloaded {checkpoint_path}")
else:
    print(f" Checkpoint not found at {checkpoint_path}")

## Next Steps: Use the Trained Model Locally

1. **Download** the checkpoint file (`best.pth`) from above
2. **Place it** in your local `checkpoints/` folder
3. **Restart** your local API and Gradio UI:

```bash
# Terminal 1: Start API
python -m uvicorn src.api.main:app --reload --port 8000

# Terminal 2: Start Gradio UI
python web/gradio_app.py
```

Then open: **http://localhost:7860**