In [None]:
# Run pip with no subcommand: it only prints its usage text and command list.
!pip


Usage:   
  pip3 <command> [options]

Commands:
  install                     Install packages.
  download                    Download packages.
  uninstall                   Uninstall packages.
  freeze                      Output installed packages in requirements format.
  inspect                     Inspect the python environment.
  list                        List installed packages.
  show                        Show information about installed packages.
  check                       Verify installed packages have compatible dependencies.
  config                      Manage local and global configuration.
  search                      Search PyPI for packages.
  cache                       Inspect and manage pip's wheel cache.
  index                       Inspect information available from package indexes.
  wheel                       Build wheels from your requirements.
  hash                        Compute hashes of package archives.
  completion                  A helper c

In [None]:
# Run apt with no command: it only prints its version, usage text and command list.
!apt


apt 2.0.9 (amd64)
Usage: apt [options] command

apt is a commandline package manager and provides commands for
searching and managing as well as querying information about packages.
It provides the same functionality as the specialized APT tools,
like apt-get and apt-cache, but enables options more suitable for
interactive use by default.

Most used commands:
  list - list packages based on package names
  search - search in package descriptions
  show - show package details
  install - install packages
  reinstall - reinstall packages
  remove - remove packages
  autoremove - Remove automatically all unused packages
  update - update list of available packages
  upgrade - upgrade the system by installing/upgrading packages
  full-upgrade - upgrade the system by removing/installing/upgrading packages
  edit-sources - edit the source information file
  satisfy - satisfy dependency strings

See apt(8) for more information about the available commands.
Configuration options and syntax is 

In [None]:
# Install the Kaggle download helper into the *kernel's* environment.
# `%pip` is used explicitly (instead of relying on automagic) and the version is
# pinned to the release this notebook was last run with, so reruns are reproducible.
%pip install opendatasets==0.1.22


Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Installing collected packages: opendatasets
Successfully installed opendatasets-0.1.22


In [None]:
# `%pip` installs into the interpreter backing this kernel; `!pip` runs whatever
# `pip` is first on the shell PATH, which may be a different environment.
%pip install torch torchvision



In [None]:

import opendatasets as od
import pandas

# Download the Standard OCR dataset from Kaggle into the current working directory.
# opendatasets asks for a Kaggle username and API key interactively when needed.
od.download("https://www.kaggle.com/datasets/preatcher/standard-ocr-dataset")

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username: harshsainb21me028
Your Kaggle Key: ··········
Downloading standard-ocr-dataset.zip to ./standard-ocr-dataset


100%|██████████| 46.2M/46.2M [00:02<00:00, 21.6MB/s]





In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
from torchvision.datasets import DatasetFolder
from torchvision.models import resnet50
from sklearn.model_selection import train_test_split



In [None]:
class patchEmbed(nn.Module):
    """Split an image into non-overlapping square patches and embed each one.

    Parameters
    ----------
    img_size : int
        Side length of the (square) input image, in pixels.
    patch_size : int
        Side length of each (square) patch, in pixels.
    in_chans : int
        Number of input channels.
    embed_dim : int
        Length of the embedding vector produced for every patch.

    Attributes
    ----------
    n_patches : int
        Patches per image, ``(img_size // patch_size) ** 2``.
    proj : nn.Conv2d
        Convolution whose kernel size and stride both equal ``patch_size``, so
        each output position corresponds to exactly one patch.
    """

    def __init__(self, img_size, patch_size, in_chans=3, embed_dim=768):
        super().__init__()
        self.img_size = img_size
        self.patch_size = patch_size
        self.n_patches = (img_size // patch_size) ** 2

        # The keyword is `kernel_size`; the misspelt `kernal_size` made
        # nn.Conv2d raise TypeError as soon as the module was constructed.
        self.proj = nn.Conv2d(
            in_chans,
            embed_dim,
            kernel_size=patch_size,
            stride=patch_size,
        )

    def forward(self, x):
        """Embed a batch of images.

        Parameters
        ----------
        x : torch.Tensor
            Shape ``(n_samples, in_chans, img_size, img_size)``.

        Returns
        -------
        torch.Tensor
            Shape ``(n_samples, n_patches, embed_dim)``.
        """
        x = self.proj(x)       # (n_samples, embed_dim, n_patches ** 0.5, n_patches ** 0.5)
        x = x.flatten(2)       # (n_samples, embed_dim, n_patches)
        x = x.transpose(1, 2)  # (n_samples, n_patches, embed_dim)

        return x

In [None]:
class Attention(nn.Module):
    """Multi-head self-attention.

    Parameters
    ----------
    dim : int
        Feature size of every input and output token.
    n_heads : int
        Number of attention heads; must divide ``dim``.
    qkv_bias : bool
        Whether the query/key/value projection has a bias term.
    attn_p : float
        Dropout probability applied to the attention weights.
    proj_p : float
        Dropout probability applied to the output projection.

    Attributes
    ----------
    head_dim : int
        Feature size handled by each head, ``dim // n_heads``.
    scale : float
        ``head_dim ** -0.5``; multiplies the query-key dot products.

    Raises
    ------
    ValueError
        If ``dim`` is not divisible by ``n_heads``.
    """

    def __init__(self, dim, n_heads=12, qkv_bias=True, attn_p=0., proj_p=0.):
        super().__init__()
        if dim % n_heads != 0:
            raise ValueError(
                f"dim ({dim}) must be divisible by n_heads ({n_heads})"
            )
        self.n_heads = n_heads
        self.dim = dim                 # full token width, checked in forward()
        self.head_dim = dim // n_heads
        self.scale = self.head_dim ** -0.5

        # A single linear layer produces queries, keys and values at once.
        self.qkv = nn.Linear(dim, dim * 3, bias=qkv_bias)
        self.attn_drop = nn.Dropout(attn_p)
        self.proj = nn.Linear(dim, dim)
        self.proj_drop = nn.Dropout(proj_p)

    def forward(self, x):
        """Apply self-attention to a batch of token sequences.

        Parameters
        ----------
        x : torch.Tensor
            Shape ``(n_samples, n_tokens, dim)``.

        Returns
        -------
        torch.Tensor
            Shape ``(n_samples, n_tokens, dim)``.

        Raises
        ------
        ValueError
            If the last dimension of ``x`` differs from ``dim``.
        """
        n_samples, n_tokens, dim = x.shape

        if dim != self.dim:
            raise ValueError(
                f"expected input with last dimension {self.dim}, got {dim}"
            )

        qkv = self.qkv(x)  # (n_samples, n_tokens, 3 * dim)
        qkv = qkv.reshape(n_samples, n_tokens, 3, self.n_heads, self.head_dim)
        qkv = qkv.permute(2, 0, 3, 1, 4)  # (3, n_samples, n_heads, n_tokens, head_dim)
        q, k, v = qkv[0], qkv[1], qkv[2]

        k_t = k.transpose(-2, -1)      # (n_samples, n_heads, head_dim, n_tokens)
        dp = (q @ k_t) * self.scale    # (n_samples, n_heads, n_tokens, n_tokens)
        # Normalise over the key axis (last one) so each query's weights sum to 1.
        attn = dp.softmax(dim=-1)
        attn = self.attn_drop(attn)

        weighted_avg = attn @ v                       # (n_samples, n_heads, n_tokens, head_dim)
        weighted_avg = weighted_avg.transpose(1, 2)   # (n_samples, n_tokens, n_heads, head_dim)
        weighted_avg = weighted_avg.flatten(2)        # (n_samples, n_tokens, dim)

        x = self.proj(weighted_avg)
        x = self.proj_drop(x)

        return x

In [None]:
class MLP(nn.Module):
    """Two-layer feed-forward network with a GELU in between.

    Parameters
    ----------
    in_features : int
        Width of the input features.
    hidden_features : int
        Width of the hidden layer.
    out_features : int
        Width of the output features.
    p : float
        Dropout probability; the same dropout module is applied after the
        activation and again after the second linear layer.
    """

    def __init__(self, in_features, hidden_features, out_features, p=0.):
        super().__init__()
        self.fc1 = nn.Linear(in_features, hidden_features)
        self.act = nn.GELU()
        self.fc2 = nn.Linear(hidden_features, out_features)
        self.drop = nn.Dropout(p)

    def forward(self, x):
        """Map ``(..., in_features)`` to ``(..., out_features)``."""
        hidden = self.drop(self.act(self.fc1(x)))
        return self.drop(self.fc2(hidden))

In [None]:
class Block(nn.Module):
    """Transformer encoder block: pre-norm attention and pre-norm MLP, each with a residual.

    Parameters
    ----------
    dim : int
        Feature size of every token.
    n_heads : int
        Number of attention heads.
    mlp_ratio : float
        Hidden width of the MLP as a multiple of ``dim``.
    qkv_bias : bool
        Whether the attention's query/key/value projection has a bias.
    attn_p : float
        Dropout probability on the attention weights.
    p : float
        Dropout probability on the attention output projection and inside the MLP.
    """

    def __init__(self, dim, n_heads, mlp_ratio=4.0, qkv_bias=True, attn_p=0., p=0.):
        # `super.__int__()` raised AttributeError before any submodule existed.
        super().__init__()
        self.norm1 = nn.LayerNorm(dim, eps=1e-6)
        self.attn = Attention(
            dim,
            n_heads=n_heads,
            qkv_bias=qkv_bias,
            attn_p=attn_p,
            proj_p=p,
        )
        self.norm2 = nn.LayerNorm(dim, eps=1e-6)
        hidden_features = int(dim * mlp_ratio)
        # Keyword names must match MLP.__init__ exactly; `p` is forwarded so the
        # block-level dropout setting also reaches the feed-forward part.
        self.mlp = MLP(
            in_features=dim,
            hidden_features=hidden_features,
            out_features=dim,
            p=p,
        )

    def forward(self, x):
        """Transform ``(n_samples, n_tokens, dim)`` into a tensor of the same shape."""
        x = x + self.attn(self.norm1(x))
        x = x + self.mlp(self.norm2(x))

        return x

In [None]:
class VisionTransformer(nn.Module):
    """Vision Transformer image classifier.

    Parameters
    ----------
    img_size : int
        Side length of the (square) input images.
    patch_size : int
        Side length of each (square) patch.
    in_chans : int
        Number of input channels.
    embed_dim : int
        Token embedding size used throughout the network.
    depth : int
        Number of transformer blocks.
    n_heads : int
        Number of attention heads per block.
    mlp_ratio : float
        Hidden width of each block's MLP as a multiple of ``embed_dim``.
    qkv_bias : bool
        Whether the attention query/key/value projections have a bias.
    p : float
        Dropout probability for embeddings, attention projections and MLPs.
    attn_p : float
        Dropout probability for attention weights.
    n_classes : int
        Number of output logits. Added as the last parameter with a default so
        existing positional and keyword calls keep working.

    Attributes
    ----------
    cls_token : nn.Parameter
        Learnable token prepended to every patch sequence; its final state is
        what the classification head reads.
    pos_embed : nn.Parameter
        Learnable positional embedding of shape ``(1, 1 + n_patches, embed_dim)``.
    blocks : nn.ModuleList
        ``depth`` transformer blocks applied in order.
    """

    def __init__(
        self,
        img_size=384,
        patch_size=16,
        in_chans=3,
        embed_dim=768,
        depth=12,
        n_heads=12,
        mlp_ratio=4,
        qkv_bias=True,
        p=0.,
        attn_p=0.,
        n_classes=1000,
    ):
        super().__init__()
        self.patch_embed = patchEmbed(
            img_size=img_size,
            patch_size=patch_size,
            in_chans=in_chans,
            embed_dim=embed_dim,
        )
        self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_dim))
        self.pos_embed = nn.Parameter(
            torch.zeros(1, 1 + self.patch_embed.n_patches, embed_dim)
        )
        self.pos_drop = nn.Dropout(p=p)

        # forward() iterates over the blocks, so they must live in a ModuleList
        # (which also registers every block's parameters); `depth` sets how many.
        self.blocks = nn.ModuleList(
            [
                Block(
                    dim=embed_dim,
                    n_heads=n_heads,
                    mlp_ratio=mlp_ratio,
                    qkv_bias=qkv_bias,
                    p=p,
                    attn_p=attn_p,
                )
                for _ in range(depth)
            ]
        )

        self.norm = nn.LayerNorm(embed_dim, eps=1e-6)
        self.head = nn.Linear(embed_dim, n_classes)

    def forward(self, x):
        """Classify a batch of images.

        Parameters
        ----------
        x : torch.Tensor
            Shape ``(n_samples, in_chans, img_size, img_size)``.

        Returns
        -------
        torch.Tensor
            Logits of shape ``(n_samples, n_classes)``.
        """
        n_samples = x.shape[0]
        x = self.patch_embed(x)  # (n_samples, n_patches, embed_dim)

        # expand() broadcasts the single learnable token across the batch
        # without copying it.
        cls_token = self.cls_token.expand(n_samples, -1, -1)
        x = torch.cat((cls_token, x), dim=1)  # (n_samples, 1 + n_patches, embed_dim)
        x = x + self.pos_embed
        x = self.pos_drop(x)

        for block in self.blocks:
            x = block(x)

        x = self.norm(x)

        cls_token_final = x[:, 0]  # only the class token feeds the classifier
        x = self.head(cls_token_final)

        return x

In [None]:
# Load and preprocess the Kaggle dataset
import torchvision

# Every image is resized to 224x224, converted to a tensor and then shifted/scaled
# per channel with mean 0.5 and std 0.5.
transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
    transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])

# One sub-folder per class; `extensions` is given as a tuple, the documented type.
traindataset = DatasetFolder(
    '/content/standard-ocr-dataset/data/training_data',
    loader=torchvision.datasets.folder.default_loader,
    transform=transform,
    extensions=('png',),
)

# 80/20 train/validation split. The generator is seeded so the same samples land
# in each split on every run, which keeps validation numbers comparable.
train_dataset, val_dataset = torch.utils.data.random_split(
    traindataset,
    (0.8, 0.2),
    generator=torch.Generator().manual_seed(42),
)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
# Validation metrics do not depend on sample order, so no shuffling is needed.
val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)


In [None]:

# The preprocessing pipeline resizes images to 224x224, so the model has to be
# built for that size: the default img_size (384) gives a positional embedding
# whose length does not match the number of patches in a 224x224 image.
model = VisionTransformer(img_size=224)

# Give the classifier exactly one output per class folder found by DatasetFolder.
num_classes = len(traindataset.classes)
model.head = nn.Linear(model.head.in_features, num_classes)

# Move the model to its device before the optimizer captures its parameters.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Define loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# Training loop
num_epochs = 10

for epoch in range(num_epochs):
    model.train()
    for images, labels in train_loader:
        images = images.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()

        outputs = model(images)
        loss = criterion(outputs, labels)

        loss.backward()
        optimizer.step()

    # Evaluation on the held-out split; gradients are not needed here.
    model.eval()
    val_loss = 0
    val_correct = 0
    val_total = 0

    with torch.no_grad():
        for images, labels in val_loader:
            images = images.to(device)
            labels = labels.to(device)

            outputs = model(images)
            predicted = outputs.argmax(dim=1)  # index of the largest logit per sample

            val_total += labels.size(0)
            val_correct += (predicted == labels).sum().item()
            val_loss += criterion(outputs, labels).item()

    val_accuracy = 100 * val_correct / val_total
    val_loss /= len(val_loader)  # mean of the per-batch validation losses

    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {val_loss:.4f}, Accuracy: {val_accuracy:.2f}%")

print("Training finished.")

TypeError: ignored

In [None]:
# Testing
# NOTE(review): this path is the same `training_data` folder used for training, so
# these predictions are not made on held-out data. If the dataset ships a separate
# test folder, point this at it instead — confirm against the downloaded files.
test_dataset = DatasetFolder(
    '/content/standard-ocr-dataset/data/training_data',
    loader=torchvision.datasets.folder.default_loader,
    transform=transform,
    extensions=('png',),
)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

model.eval()
predictions = []

# The inference lines were commented out, which left the loop doing nothing and
# `predictions` permanently empty; they are restored here.
with torch.no_grad():
    for images, _ in test_loader:
        images = images.to(device)
        outputs = model(images)
        predicted = outputs.argmax(dim=1)  # predicted class index per sample
        predictions.extend(predicted.cpu().tolist())