In [1]:
import os
import openai
import json
import pandas as pd
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity


In [2]:
!pip install transformers
!pip install torch




In [3]:
!git clone https://github.com/Ahmed8501/Masters.git


fatal: destination path 'Masters' already exists and is not an empty directory.


In [4]:
%cd Masters/notebooks


/content/Masters/notebooks


In [5]:
# Mount Google Drive into the Colab filesystem at /content/drive.
from google.colab import drive
drive.mount('/content/drive')

# Disabled setup step (presumably only needed once -- confirm):
# navigate to the upload location
# %cd /content/drive/MyDrive/spider_data

# Disabled setup step: unzip the database archive in that folder
# !unzip -q database.zip


Mounted at /content/drive


## Checking Spider data

In [6]:
# Count the entries in train_spider.json.
# The encoding is given explicitly so the read does not depend on the locale.
with open('/content/Masters/data/spider/train_spider.json', encoding='utf-8') as f:
    train_data = json.load(f)

print(len(train_data))

7000


In [7]:
# Count the entries in dev.json.
# Stored under its own name so the training examples loaded into `train_data`
# are not overwritten by the dev split.
with open('/content/Masters/data/spider/dev.json', encoding='utf-8') as f:
    dev_data = json.load(f)

print(len(dev_data))

1034


In [8]:
# Count the schemas in tables.json.
# Stored under its own name: these are schema descriptions, not training
# examples, so they must not replace `train_data`.
with open('/content/Masters/data/spider/tables.json', encoding='utf-8') as f:
    tables_data = json.load(f)

print(len(tables_data))

166


In [9]:
# sample = train_data[0]
# print(sample['question'])
# print(sample['sql'])
# print(sample['db_id'])


In [10]:
# List the tables of one specific Spider database (concert_singer).
import sqlite3

db_path = '/content/drive/MyDrive/spider_data/database/concert_singer'

# Open read-only through a URI: a plain connect() silently creates a new, empty
# database file when the path is wrong, which would merely print an empty list.
conn = sqlite3.connect(f'file:{db_path}/concert_singer.sqlite?mode=ro', uri=True)
cursor = conn.cursor()  # kept open: the following cell reuses this cursor

cursor.execute("SELECT name FROM sqlite_master WHERE type='table';")
tables = cursor.fetchall()

print("Tables in concert_singer:", tables)


Tables in concert_singer: [('stadium',), ('singer',), ('concert',), ('singer_in_concert',)]


In [11]:
# Describe the columns of the `singer` table. Each returned row includes:
#   - the column name
#   - the declared data type
#   - whether the column is part of the primary key
singer_columns = cursor.execute("PRAGMA table_info(singer);").fetchall()
print(singer_columns)


[(0, 'Singer_ID', 'INT', 0, None, 1), (1, 'Name', 'TEXT', 0, None, 0), (2, 'Country', 'TEXT', 0, None, 0), (3, 'Song_Name', 'TEXT', 0, None, 0), (4, 'Song_release_year', 'TEXT', 0, None, 0), (5, 'Age', 'INT', 0, None, 0), (6, 'Is_male', 'bool', 0, None, 0)]


## **Schema Linking Annotations**

## **LCSP Stage 1**

In [12]:
import json
import torch
from torch.utils.data import Dataset

class ContrastiveNL2SQLDataset(Dataset):
    """Labelled (query, schema element) pairs read from a JSONL file.

    Every non-empty line of the file is parsed as one JSON object. When an item
    is fetched, its ``query``, ``schema_element`` and ``label`` keys are read.

    Args:
        path: Path of the JSONL file to load.
        tokenizer: Callable invoked with the keyword arguments ``text``,
            ``text_pair``, ``padding``, ``truncation``, ``max_length`` and
            ``return_tensors``. It must return a mapping holding
            ``input_ids`` and ``attention_mask`` tensors.
        max_len: Length that every encoded pair is padded / truncated to.
    """

    def __init__(self, path, tokenizer, max_len=128):
        with open(path, 'r', encoding='utf-8') as f:
            # Skip blank lines (e.g. an extra newline at the end of the file);
            # json.loads on an empty string raises JSONDecodeError.
            self.data = [json.loads(line) for line in f if line.strip()]
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __getitem__(self, idx):
        """Encode the pair at ``idx`` and return it with its float label."""
        item = self.data[idx]
        encoding = self.tokenizer(
            text=item['query'],
            text_pair=item['schema_element'],
            padding='max_length',
            truncation=True,
            max_length=self.max_len,
            return_tensors='pt'
        )
        return {
            # squeeze(0) drops the leading dimension of the tokenizer output so
            # the DataLoader can stack individual items into a batch.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
            'label': torch.tensor(item['label'], dtype=torch.float)
        }

    def __len__(self):
        """Number of pairs loaded from the file."""
        return len(self.data)


In [13]:
import sys

# Make the project's modules under src/ importable. The membership check stops
# a duplicate entry from being appended every time this cell is re-run.
SRC_DIR = '/content/Masters/src'
if SRC_DIR not in sys.path:
    sys.path.append(SRC_DIR)


In [14]:
# Report whether PyTorch can see a CUDA device in this runtime.
import torch
torch.cuda.is_available()


True

In [15]:
import torch
from torch.utils.data import DataLoader
from transformers import AutoTokenizer, AutoModel
import torch.nn as nn
from tqdm import tqdm
from contrastive_dataset import ContrastiveNL2SQLDataset

# Settings: encoder checkpoint, optimisation hyper-parameters and data location.
model_name = "sentence-transformers/all-MiniLM-L6-v2"
batch_size = 64
epochs = 3
lr = 2e-5
max_len = 128
# NOTE(review): this path spells the Drive folder "My Drive" while the SQLite
# cell uses "MyDrive" -- confirm both resolve on the mounted drive.
dataset_path = "/content/drive/My Drive/masters/contrastive_pairs.jsonl"

# Use the GPU when one is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the tokenizer and the encoder from the same checkpoint.
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name)
model.to(device)

# Load the dataset.
# NOTE(review): ContrastiveNL2SQLDataset here is the one imported from
# contrastive_dataset in this cell, which shadows the class of the same name
# defined earlier in the notebook -- confirm the two are identical.
dataset = ContrastiveNL2SQLDataset(dataset_path, tokenizer, max_len=max_len)
loader = DataLoader(dataset, batch_size=batch_size, shuffle=True, num_workers=2, pin_memory=True)

# Scoring head: encoder followed by a single-logit linear layer
class LCSPScorer(nn.Module):
    """Scores an encoded (query, schema element) pair with one logit.

    The wrapped encoder is called with ``input_ids`` and ``attention_mask``;
    the hidden state at sequence position 0 is projected to a scalar.
    """

    def __init__(self, encoder):
        super().__init__()
        # Attribute names are kept as-is: they determine the state_dict keys.
        self.encoder = encoder
        self.linear = nn.Linear(encoder.config.hidden_size, 1)

    def forward(self, input_ids, attention_mask):
        """Return one logit per batch row (trailing dimension squeezed away)."""
        hidden_states = self.encoder(
            input_ids=input_ids,
            attention_mask=attention_mask,
        ).last_hidden_state
        first_token = hidden_states[:, 0, :]  # [CLS] position
        logits = self.linear(first_token)
        return logits.squeeze(-1)

# Wrap the encoder with the scoring head and set up optimisation.
scorer = LCSPScorer(model).to(device)
optimizer = torch.optim.AdamW(scorer.parameters(), lr=lr)
criterion = nn.BCEWithLogitsLoss()  # labels are floats, scores are raw logits

# Train
for epoch in range(epochs):
    scorer.train()
    total_loss = 0.0
    for batch in tqdm(loader, desc=f"Epoch {epoch+1}"):
        # The DataLoader is built with pin_memory=True; non_blocking lets the
        # host-to-device copies overlap with computation.
        input_ids = batch['input_ids'].to(device, non_blocking=True)
        attention_mask = batch['attention_mask'].to(device, non_blocking=True)
        labels = batch['label'].to(device, non_blocking=True)

        scores = scorer(input_ids, attention_mask)
        loss = criterion(scores, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        total_loss += loss.item()

    # The summed loss grows with the number of batches, so also report the
    # per-batch mean, which is comparable across dataset and batch sizes.
    mean_loss = total_loss / max(len(loader), 1)
    print(f"Epoch {epoch+1}: Loss = {total_loss:.4f} (mean per batch = {mean_loss:.4f})")




The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Epoch 1: 100%|██████████| 4210/4210 [13:01<00:00,  5.39it/s]


Epoch 1: Loss = 602.3726


Epoch 2: 100%|██████████| 4210/4210 [13:02<00:00,  5.38it/s]


Epoch 2: Loss = 361.5817


Epoch 3: 100%|██████████| 4210/4210 [13:01<00:00,  5.39it/s]


Epoch 3: Loss = 260.9400


RuntimeError: Parent directory Models does not exist.

In [16]:
# Save the trained scorer weights to Drive.
checkpoint_path = "/content/drive/My Drive/masters/lcsp_encoder.pt"
# torch.save raises "RuntimeError: Parent directory ... does not exist" when the
# target folder is missing, so create it first.
os.makedirs(os.path.dirname(checkpoint_path), exist_ok=True)
torch.save(scorer.state_dict(), checkpoint_path)
