In [None]:
import requests
import os

# Set up GitHub credentials.
# The token is read from the GITHUB_TOKEN environment variable so that no
# secret is ever stored in the notebook itself. When the variable is unset,
# GITHUB_TOKEN is the empty string and HEADERS is left empty.
GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "")
HEADERS = {"Authorization": f"token {GITHUB_TOKEN}"} if GITHUB_TOKEN else {}

# Function to get repository contents
def get_repo_contents(owner, repo, path=""):
    """Fetch the listing for ``path`` in a GitHub repository via the contents API.

    Args:
        owner: Account or organisation that owns the repository.
        repo: Repository name.
        path: Path inside the repository; the empty string means the root.

    Returns:
        The decoded JSON body on HTTP 200. Any other status, or a request
        that fails outright, is reported with ``print`` and yields ``[]``.
    """
    url = f"https://api.github.com/repos/{owner}/{repo}/contents/{path}"

    try:
        # A timeout keeps one stalled connection from hanging the whole crawl.
        response = requests.get(url, headers=HEADERS, timeout=30)
    except requests.RequestException as exc:
        print(f"Request failed for {url}: {exc}")
        return []

    if response.status_code == 200:
        return response.json()

    # Error bodies are usually JSON, but fall back to raw text when they are not.
    try:
        details = response.json()
    except ValueError:
        details = response.text
    print(f"Error {response.status_code}: {details}")
    return []

# Function to download files from a repo
def download_repo_code(owner, repo, path="", save_dir="github_code",
                       extensions=(".py", ".js", ".java", ".md", ".txt")):
    """Recursively download selected files of a GitHub repository to disk.

    Args:
        owner: Account or organisation that owns the repository.
        repo: Repository name.
        path: Path inside the repository to start from; "" means the root.
        save_dir: Local directory that receives the files. Sub-directories of
            the repository are recreated below it.
        extensions: File-name suffix (or iterable of suffixes) to download;
            files with any other suffix are skipped.

    Returns:
        None. Progress and failures are reported with ``print``.
    """
    suffixes = (extensions,) if isinstance(extensions, str) else tuple(extensions)

    os.makedirs(save_dir, exist_ok=True)  # Create the target directory if it doesn't exist
    contents = get_repo_contents(owner, repo, path)
    # The contents API answers with a single object (not a list) when the
    # path names one file; wrap it so the loop below handles both shapes.
    if isinstance(contents, dict):
        contents = [contents]

    for entry in contents:
        if entry["type"] == "file" and entry["name"].endswith(suffixes):
            download_url = entry.get("download_url")
            if not download_url:
                print(f"Skipped (no download URL): {entry['name']}")
                continue

            try:
                response = requests.get(download_url, timeout=30)
                # Without this check an error page would be saved as source code.
                response.raise_for_status()
            except requests.RequestException as exc:
                print(f"Failed to download {entry['name']}: {exc}")
                continue

            file_path = os.path.join(save_dir, entry["name"])
            with open(file_path, "w", encoding="utf-8") as f:
                f.write(response.text)
            print(f"Downloaded: {entry['name']}")

        elif entry["type"] == "dir":
            new_dir = os.path.join(save_dir, entry["name"])
            os.makedirs(new_dir, exist_ok=True)
            print(f"📂 Entering directory: {entry['name']}")
            # Recursively fetch subdirectories with the same extension filter
            download_repo_code(owner, repo, entry["path"], new_dir, extensions)

# Example run against a small public repository; swap in any owner/repo of interest
download_repo_code(owner="haoel", repo="leetcode")


Downloaded: README.md
📂 Entering directory: algorithms
📂 Entering directory: cpp
📂 Entering directory: 3Sum
📂 Entering directory: 3SumClosest
📂 Entering directory: 4Sum
📂 Entering directory: FindValidMatrixGivenRowAndColumnSums
📂 Entering directory: LRUCache
📂 Entering directory: NumberOfWaysToSplitString
📂 Entering directory: UTF8Validation
📂 Entering directory: accountsMerge
📂 Entering directory: addAndSearchWord
📂 Entering directory: addBinary
📂 Entering directory: addDigits
📂 Entering directory: addStrings
📂 Entering directory: addToArrayFormOfInteger
📂 Entering directory: addTwoNumbers
📂 Entering directory: additiveNumber
📂 Entering directory: anagrams
📂 Entering directory: arithmeticSlices
📂 Entering directory: backspaceStringCompare
📂 Entering directory: balancedBinaryTree
📂 Entering directory: basicCalculator
📂 Entering directory: bestTimeToBuyAndSellStock
📂 Entering directory: binarySearchTreeIterator
📂 Entering directory: binaryTreeInorderTraversal
📂 Entering directory: binar

In [3]:
import torch
# Report whether a CUDA device is visible and, if so, the name of device 0
cuda_ready = torch.cuda.is_available()
print("Is CUDA available? ", cuda_ready)
if cuda_ready:
    gpu_label = torch.cuda.get_device_name(0)
else:
    gpu_label = "No GPU found"
print("GPU Device: ", gpu_label)

Is CUDA available?  True
GPU Device:  Tesla T4


In [4]:
# Recursively list everything stored under github_code
!ls -R github_code

github_code:
algorithms  database  README.md  scripts  shell

github_code/algorithms:
cpp  golang  java  python

github_code/algorithms/cpp:
3Sum
3SumClosest
4Sum
accountsMerge
addAndSearchWord
addBinary
addDigits
additiveNumber
addStrings
addToArrayFormOfInteger
addTwoNumbers
anagrams
arithmeticSlices
backspaceStringCompare
balancedBinaryTree
basicCalculator
bestTimeToBuyAndSellStock
binarySearchTreeIterator
binaryTreeInorderTraversal
binaryTreeLevelOrderTraversal
binaryTreeMaximumPathSum
binaryTreePaths
binaryTreePostorderTraversal
binaryTreePreorderTraversal
binaryTreeRightSideView
binaryTreeUpsideDown
binaryTreeZigzagLevelOrderTraversal
binaryWatch
bitwiseANDOfNumbersRange
brokenCalculator
buddyStrings
buildingBoxes
bulbSwitcher
bullsAndCows
burstBalloons
calculateMoneyInLeetcodeBank
candy
checkIfArrayIsSortedAndRotated
checkIfBinaryStringHasAtMostOneSegmentOfOnes
checkIfNumberIsASumOfPowersOfThree
checkIfOneStringSwapCanMakeStringsEqual
checkIfTheSentenceIsPangram
checkIfWordEqual

In [5]:
# Count the regular files under github_code
!find github_code -type f | wc -l

192


In [6]:
# Show the total disk usage of github_code in human-readable units
!du -sh github_code

3.3M	github_code


In [7]:
# Tally files per extension: sed keeps only the text after the last dot of each path, uniq -c counts repeats
!find github_code -type f | sed -n 's/..*\.//p' | sort | uniq -c

    106 java
      4 md
     82 py


In [8]:
# Delete every Markdown (*.md) file under github_code so only source files remain
!find github_code -type f -name "*.md" -delete

In [9]:
# Repeat the per-extension tally to confirm the .md files are gone
!find github_code -type f | sed -n 's/..*\.//p' | sort | uniq -c

    106 java
     82 py


In [10]:
import os

def clean_code_file(file_path):
    """Rewrite ``file_path`` in place without blank lines or trailing whitespace.

    Lines that contain only whitespace are dropped, every remaining line loses
    its trailing whitespace, and the rewritten file ends with a newline.
    """
    kept = []
    with open(file_path, "r", encoding="utf-8") as source:
        for raw_line in source:
            trimmed = raw_line.rstrip()
            # An empty result means the line held nothing but whitespace.
            if trimmed:
                kept.append(trimmed)

    body = "\n".join(kept)
    with open(file_path, "w", encoding="utf-8") as target:
        target.write(body + "\n")

# Normalise every Java and Python source file found below github_code
for folder, _, names in os.walk("github_code"):
    for name in names:
        if not name.endswith((".java", ".py")):
            continue
        clean_code_file(os.path.join(folder, name))

print("Code formatting cleaned successfully!")

Code formatting cleaned successfully!


In [11]:
# Illustration of the record layout used for the dataset: one object per sample
# with a "language" label and the source text under "code".
# Only the last expression of a cell is displayed, so just the Java record is echoed.
{"language": "python", "code": "def hello():\n    print('Hello World!')"}
{"language": "java", "code": "public class Hello { public static void main(String[] args) { System.out.println(\"Hello World\"); } }"}

{'language': 'java',
 'code': 'public class Hello { public static void main(String[] args) { System.out.println("Hello World"); } }'}

In [12]:
import json

# Map each recognised file extension to the language label stored in the dataset.
# Files with any other extension are skipped instead of being labelled "java".
LANGUAGE_BY_EXTENSION = {".py": "python", ".java": "java", ".js": "javascript"}

data = []
for root, dirs, files in os.walk("github_code"):
    dirs.sort()  # fixed traversal order so the dataset is identical across runs
    for file in sorted(files):
        # Identify language based on file extension
        language = LANGUAGE_BY_EXTENSION.get(os.path.splitext(file)[1])
        if language is None:
            continue

        file_path = os.path.join(root, file)
        with open(file_path, "r", encoding="utf-8") as f:
            code_content = f.read()

        # Append to dataset
        data.append({"language": language, "code": code_content})

# Save as JSONL file: one JSON object per line
with open("dataset.jsonl", "w", encoding="utf-8") as f:
    for entry in data:
        f.write(json.dumps(entry) + "\n")

print("Dataset saved as dataset.jsonl")

Dataset saved as dataset.jsonl


In [15]:
from transformers import AutoTokenizer
from datasets import load_dataset

# Path of the JSONL dataset that is loaded and tokenized further down in this cell
output_file = "dataset.jsonl"

# Load CodeLlama tokenizer
tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-hf")

# Reuse the end-of-sequence token as the padding token so padded sequences can be built.
# NOTE(review): presumably needed because this tokenizer ships without a pad token — confirm.
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the dataset
def tokenize_function(examples):
    """Tokenize the ``code`` field of ``examples`` with the notebook's tokenizer.

    Sequences are truncated and padded to a fixed length of 512 tokens.
    """
    source_texts = examples["code"]
    return tokenizer(
        source_texts,
        max_length=512,
        padding="max_length",
        truncation=True,
    )

# Apply tokenization
from datasets import load_dataset

# Read the JSONL file as a single "train" split, then tokenize it in batches
dataset = load_dataset("json", data_files=output_file, split="train")
tokenized_dataset = dataset.map(tokenize_function, batched=True)

# Save tokenized dataset to the local directory "tokenized_code_data"
tokenized_dataset.save_to_disk("tokenized_code_data")
print("Tokenization complete. Dataset saved.")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/749 [00:00<?, ?B/s]

tokenizer.model:   0%|          | 0.00/500k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.84M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/411 [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Map:   0%|          | 0/188 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/188 [00:00<?, ? examples/s]

Tokenization complete. Dataset saved.


In [17]:
from huggingface_hub import login

# Log this session in to the Hugging Face Hub; with no arguments the call renders an interactive prompt
login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [14]:
!pip install transformers datasets

Collecting datasets
  Downloading datasets-3.3.2-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.3.2-py3-none-any.whl (485 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m485.4/485.4 kB[0m [31m19.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading multiprocess-0.70.16-py311-none-any.whl (143 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m143.5/143.5 kB[0m [31m15.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading

In [16]:
from huggingface_hub import list_models

# Print the id of every Hub model published under the author name "MetaAI".
# The loop variable is deliberately not called `model`, so re-running this cell
# never overwrites the `model` object created by the model-loading cell.
models = list_models(author="MetaAI")
for model_info in models:
    print(model_info.modelId)

In [17]:
# Show the column names and row count of the tokenized dataset
print(tokenized_dataset)

Dataset({
    features: ['language', 'code', 'input_ids', 'attention_mask'],
    num_rows: 188
})


In [18]:
import torch
from torch.utils.data import Dataset

class CodeDataset(Dataset):
    """Torch dataset view over already-tokenized examples.

    Each item exposes only the ``input_ids`` and ``attention_mask`` fields of
    the underlying record, converted to tensors.
    """

    def __init__(self, tokenized_data):
        """Keep a reference to the indexable collection of tokenized records."""
        self.tokenized_data = tokenized_data

    def __len__(self):
        """Return the number of records in the wrapped collection."""
        return len(self.tokenized_data)

    def __getitem__(self, idx):
        """Return record ``idx`` as a dict of tensors."""
        record = self.tokenized_data[idx]
        return {
            field: torch.tensor(record[field])
            for field in ("input_ids", "attention_mask")
        }

# Wrap the tokenized dataset so each item is served as a dict of PyTorch tensors
train_dataset = CodeDataset(tokenized_dataset)

In [19]:
from transformers import DataCollatorForLanguageModeling

# Causal (next-token) language modelling: mlm=False turns masked-LM batching off,
# because CodeLlama does autoregressive learning, not masked learning.
# NOTE(review): the collator is expected to exclude pad-token positions from the
# loss; since pad_token was set to eos_token earlier, genuine end-of-sequence
# tokens would be excluded as well — confirm this is acceptable.
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained("codellama/CodeLlama-7b-hf")
# Re-creating the tokenizer discards earlier configuration, so restore the same
# padding setup that the tokenization cell uses.
tokenizer.pad_token = tokenizer.eos_token

# Prefer the GPU when one is visible, otherwise fall back to the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# A 7B-parameter model in the default float32 needs roughly 28 GB for the
# weights alone, more than a 16 GB Tesla T4 provides. Half precision halves
# that on the GPU; float32 is kept on the CPU.
model_dtype = torch.float16 if device.type == "cuda" else torch.float32

model = AutoModelForCausalLM.from_pretrained(
    "codellama/CodeLlama-7b-hf",
    torch_dtype=model_dtype,
).to(device)

print("Model loaded on:", next(model.parameters()).device)  # Should print: cuda:0


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [1]:
# Requires `model` from the model-loading cell; on a fresh kernel this raises NameError
print(next(model.parameters()).device)


NameError: name 'model' is not defined