In [None]:
!pip install requests tqdm

In [None]:
import json
import os

import requests
from tqdm import tqdm

# Endpoints used below: repository search, and the README of a single
# repository (the placeholder is filled with "owner/name").
GITHUB_API_URL = 'https://api.github.com/search/repositories'
GITHUB_README_URL = 'https://api.github.com/repos/{}/readme'

# GitHub API token. It is read from the GITHUB_TOKEN environment variable so
# the secret never has to be typed into (and saved with) the notebook.
# An unset variable yields '' and requests are then sent unauthenticated.
GITHUB_TOKEN = os.environ.get('GITHUB_TOKEN', '').strip()

# Default headers for the search request.
headers = {
    'Accept': 'application/vnd.github.v3+json',
}
# Only attach credentials when a token is actually configured; an
# "Authorization: token " header with nothing after it is not a valid credential.
if GITHUB_TOKEN:
    headers['Authorization'] = f'token {GITHUB_TOKEN}'

def search_github_repositories(query, max_repos=100):
    """Search GitHub repositories and return the top results by star count.

    Pages through the repository search endpoint (sorted by stars, descending)
    until `max_repos` results were collected, a page comes back empty or
    short, or a request fails.

    Args:
        query: Search string passed as the `q` parameter.
        max_repos: Maximum number of repositories to return.

    Returns:
        A list with at most `max_repos` entries, each one an element of the
        response's `items` array. On an HTTP or network error the results
        gathered so far are returned and a message is printed.
    """
    repositories = []
    if max_repos <= 0:
        return repositories

    # Ask only for what is needed (the original always requested 100). The
    # page size stays constant across pages so page offsets remain aligned.
    per_page = min(100, max_repos)
    page = 1
    while len(repositories) < max_repos:
        params = {
            'q': query,
            'sort': 'stars',
            'order': 'desc',
            'per_page': per_page,
            'page': page
        }
        try:
            # A timeout keeps a stalled connection from hanging the notebook.
            response = requests.get(GITHUB_API_URL, headers=headers, params=params, timeout=30)
        except requests.RequestException as exc:
            print(f'Error fetching repositories: {exc}')
            break
        if response.status_code != 200:
            print(f'Error fetching repositories: {response.status_code}')
            break
        items = response.json().get('items', [])
        if not items:
            break
        repositories.extend(items)
        if len(items) < per_page:
            # A short page is the last page; skip the extra (rate-limited) request.
            break
        page += 1
    return repositories[:max_repos]

def get_readme(repo_full_name):
    """Download the raw README of a repository.

    Args:
        repo_full_name: Repository identifier substituted into
            GITHUB_README_URL (the caller passes the search result's
            `full_name`).

    Returns:
        The response body as text when the request succeeds with status 200,
        otherwise None (non-200 status or a network-level failure).
    """
    url = GITHUB_README_URL.format(repo_full_name)
    # The "raw" media type makes the endpoint return the file contents
    # directly instead of a JSON wrapper.
    request_headers = {'Accept': 'application/vnd.github.v3.raw'}
    # Do not send an Authorization header with an empty credential.
    if GITHUB_TOKEN:
        request_headers['Authorization'] = f'token {GITHUB_TOKEN}'
    try:
        # Bounded wait so one stalled download cannot hang the whole crawl.
        response = requests.get(url, headers=request_headers, timeout=30)
    except requests.RequestException:
        return None
    if response.status_code == 200:
        return response.text
    return None

In [None]:
import re

def clean_readme(readme_text):
    """Strip images, badges, link markup and bare URLs from README text.

    The visible text of Markdown links is kept (only the URL part is
    dropped), because deleting it leaves sentences with holes in them.

    Args:
        readme_text: Raw README contents; None or '' is accepted.

    Returns:
        The cleaned text with every run of whitespace collapsed to a single
        space and leading/trailing whitespace removed ('' for empty input).
    """
    if not readme_text:
        return ''

    # Order matters: images go first so that a linked badge
    # "[![alt](img)](url)" is reduced to "[](url)", which the link rule then
    # removes entirely (its anchor text is empty).
    substitutions = (
        (r'!\[.*?\]\(.*?\)', ''),            # Markdown images / badges
        (r'<img[^>]*>', ''),                 # HTML image tags
        (r'\[(.*?)\]\(.*?\)', r'\1'),        # Markdown links -> anchor text only
        (r'image:[^\s]+\[[^\]]*\]', ''),     # AsciiDoc image macros
        (r'http[s]?://\S+', ''),             # bare URLs
    )
    cleaned = readme_text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return re.sub(r'\s+', ' ', cleaned).strip()


In [None]:
# Collect the READMEs of the top `max_repos` repositories matching the query.
query = 'spring framework'
max_repos = 50  # TODO: needs adjustment

repositories = search_github_repositories(query, max_repos)
readme_data = []

for repo in tqdm(repositories):
    full_name = repo['full_name']
    readme = get_readme(full_name)
    if not readme:
        # No README text came back for this repository.
        continue
    cleaned_readme = clean_readme(readme)
    if not cleaned_readme:
        # Nothing is left once the markup has been stripped.
        continue
    readme_data.append(dict(repo_name=full_name, readme=cleaned_readme))

print(f'총 {len(readme_data)}개의 README를 수집')


In [None]:
# Print every collected entry: repository name plus the first 500 characters
# of its cleaned README.
for i, item in enumerate(readme_data):
    report_lines = (
        f"=== Repository {i+1} ===",
        f"Repo Name: {item['repo_name']}",
        f"README Content (Preview): {item['readme'][:500]}...",
        "=" * 50,
    )
    print("\n".join(report_lines))


In [None]:
!pip install transformers sentencepiece

In [None]:
from transformers import T5ForConditionalGeneration, T5Tokenizer
import torch

# The T5 tokenizer gets its own name: a later cell rebinds the global
# `tokenizer` to the cross-encoder tokenizer, and generate_query must keep
# using the T5 one even when cells are re-run afterwards.
t5_tokenizer = T5Tokenizer.from_pretrained('t5-base')
tokenizer = t5_tokenizer  # alias kept for code that still reads `tokenizer`
t5_model = T5ForConditionalGeneration.from_pretrained('t5-base')

# Run on the GPU when one is available, otherwise on the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
t5_model = t5_model.to(device)


def generate_query(document_text, max_length=64):
    """Generate a search query for a document with the T5 model.

    Args:
        document_text: Document text; it is prefixed with an instruction and
            truncated to 512 tokens before being fed to the model.
        max_length: Maximum length passed to `generate` for the output.

    Returns:
        The decoded top beam (beam search with 5 beams) as a string, with
        special tokens removed.
    """
    # NOTE(review): the stock 't5-base' checkpoint is presumably not tuned for
    # this instruction-style prefix -- verify the generated queries are usable,
    # or consider a checkpoint trained for query generation.
    input_text = "Generate a search query for the following document: " + document_text
    inputs = t5_tokenizer.encode(
        input_text, return_tensors='pt', max_length=512, truncation=True
    ).to(device)

    outputs = t5_model.generate(
        inputs,
        max_length=max_length,
        num_beams=5,
        early_stopping=True,
        num_return_sequences=1
    )
    return t5_tokenizer.decode(outputs[0], skip_special_tokens=True)


In [None]:
# 데이터셋 생성
dataset = []
for data in tqdm(readme_data):
    readme = data['readme']  
    query = generate_query(readme)
    dataset.append({
        'query': query,
        'document': readme,
        'label': 1.0  
    })

print(f'총 {len(dataset)}개의 쿼리-문서 쌍이 생성')

# 출력 확인
for i, item in enumerate(dataset):
    print(f"=== Data Pair {i+1} ===")
    print(f"Query: {item['query']}")
    print(f"Document: {item['document'][:500]}...") 
    print(f"Label: {item['label']}")
    print("=" * 50)


In [None]:
from transformers import AutoModelForSequenceClassification, AutoTokenizer, AdamW
from torch.utils.data import DataLoader, Dataset
import torch
from sklearn.model_selection import train_test_split
from tqdm import tqdm

class CrossEncoderDataset(Dataset):
    """Torch dataset that tokenizes (query, document) pairs on access.

    Args:
        data: Sequence of dicts with 'query' and 'document' keys and an
            optional 'label' key (treated as 0.0 when missing).
        tokenizer: Callable invoked as
            `tokenizer(query, document, truncation=..., max_length=...,
            padding=..., return_tensors='pt')` that returns a mapping of
            tensors with a leading batch dimension of 1.
        max_length: Length every encoded pair is truncated/padded to.
    """

    # Encoded fields handed on to the model, when the tokenizer produces them.
    # 'token_type_ids' is what lets BERT-style cross-encoders tell the query
    # segment from the document segment, so it must not be dropped.
    _MODEL_INPUT_KEYS = ('input_ids', 'attention_mask', 'token_type_ids')

    def __init__(self, data, tokenizer, max_length=512):
        self.data = data
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of pairs."""
        return len(self.data)

    def __getitem__(self, idx):
        """Return the encoded pair at `idx`.

        Returns:
            A dict with 'input_ids' and 'attention_mask' (plus
            'token_type_ids' if the tokenizer returned them), each with the
            batch dimension removed, and 'label' as a float tensor.
        """
        record = self.data[idx]
        label = record.get('label', 0.0)

        encoding = self.tokenizer(
            record['query'],
            record['document'],
            truncation=True,
            max_length=self.max_length,
            padding='max_length',
            return_tensors='pt'
        )
        # squeeze(0) drops the batch dimension added by return_tensors='pt';
        # the DataLoader re-adds it when collating.
        item = {
            key: encoding[key].squeeze(0)
            for key in self._MODEL_INPUT_KEYS
            if key in encoding
        }
        item['label'] = torch.tensor(label, dtype=torch.float)
        return item


In [None]:
model_name = "cross-encoder/ms-marco-MiniLM-L-6-v2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# num_labels=1: the model emits one relevance score per (query, document) pair.
cross_encoder = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=1)
cross_encoder = cross_encoder.to(device)

# Fix the RNG so the shuffled batch order is reproducible.
torch.manual_seed(42)

# Prepare the data: 80/20 train/validation split.
train_data, val_data = train_test_split(dataset, test_size=0.2, random_state=42)

train_dataset = CrossEncoderDataset(train_data, tokenizer)
val_dataset = CrossEncoderDataset(val_data, tokenizer)

train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=8)

# torch.optim.AdamW is used instead of the deprecated transformers.AdamW.
# weight_decay=0.0 keeps weight decay disabled (torch's own default is 0.01).
optimizer = torch.optim.AdamW(cross_encoder.parameters(), lr=5e-5, weight_decay=0.0)


def _batch_loss(batch):
    """Run one collated batch through the cross-encoder and return its loss."""
    model_inputs = {
        'input_ids': batch['input_ids'].to(device),
        'attention_mask': batch['attention_mask'].to(device),
    }
    # Forward segment ids when the dataset provides them.
    if 'token_type_ids' in batch:
        model_inputs['token_type_ids'] = batch['token_type_ids'].to(device)
    # Labels are reshaped from (batch,) to (batch, 1).
    labels = batch['label'].unsqueeze(1).to(device)
    return cross_encoder(**model_inputs, labels=labels).loss


# Fine-tuning
epochs = 2  # TODO: needs tuning
for epoch in range(epochs):
    print(f"Epoch {epoch + 1}/{epochs}")
    cross_encoder.train()
    train_loss = 0
    for batch in tqdm(train_loader, desc="Training"):
        optimizer.zero_grad()
        loss = _batch_loss(batch)
        train_loss += loss.item()
        loss.backward()
        optimizer.step()
    print(f"Training Loss: {train_loss / len(train_loader)}")

    # Validation: same forward pass, no gradient tracking, no weight updates.
    cross_encoder.eval()
    val_loss = 0
    with torch.no_grad():
        for batch in tqdm(val_loader, desc="Validation"):
            val_loss += _batch_loss(batch).item()
    print(f"Validation Loss: {val_loss / len(val_loader)}")


In [None]:
# Smoke test: score two hand-written pairs for the same query, one with an
# off-topic document and one with an on-topic document.
# TODO: replace with real evaluation queries.
test_queries = [
    {"query": "Spring Framework", "document": "This is the machine learning project"},
    {"query": "Spring Framework", "document": "This project is made by spring framework"}
]

test_dataset = CrossEncoderDataset(test_queries, tokenizer)
test_loader = DataLoader(test_dataset, batch_size=1)

cross_encoder.eval()
for batch in tqdm(test_loader, desc="Testing"):
    model_inputs = {
        'input_ids': batch['input_ids'].to(device),
        'attention_mask': batch['attention_mask'].to(device),
    }
    # Forward segment ids when the dataset provides them, so inference sees
    # the same kind of inputs the tokenizer produced.
    if 'token_type_ids' in batch:
        model_inputs['token_type_ids'] = batch['token_type_ids'].to(device)
    with torch.no_grad():
        outputs = cross_encoder(**model_inputs)
        # One logit per pair; drop the trailing dimension before printing.
        scores = outputs.logits.squeeze(-1).cpu().numpy()
        print(f"Score: {scores}")