In [1]:
import pandas as pd
import numpy as np
import cleaner
import re
import torch
import nltk

from tqdm import tqdm
from nltk import word_tokenize
from nltk.corpus import stopwords
from transformers import AutoTokenizer, AutoModel
from nltk.stem.wordnet import WordNetLemmatizer

# Download the NLTK resources requested by this notebook (no-op when already present).
for _nltk_resource in ('stopwords', 'punkt', 'wordnet'):
    nltk.download(_nltk_resource)

# English stop-word list and WordNet lemmatizer, both used by tokenize().
stopwords_en = stopwords.words('english')
lemmatizer = WordNetLemmatizer()

# Tokenizer and encoder are loaded from the same pretrained checkpoint.
_CODEBERT_CHECKPOINT = "microsoft/codebert-base"
bert_tokenizer = AutoTokenizer.from_pretrained(_CODEBERT_CHECKPOINT)
bert_model = AutoModel.from_pretrained(_CODEBERT_CHECKPOINT)

# Registers `progress_apply` on pandas objects.
tqdm.pandas()

[nltk_data] Downloading package stopwords to /home/ae/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ae/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ae/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
def clean_comments(contents, language) -> str:
    """Run `cleaner.string` over `contents` with the delimiters of `language`.

    `cleaner.lang(language)` is asked for the delimiters; when it returns
    None the text is handed back untouched.
    """
    delimiters = cleaner.lang(language)

    # No delimiters known for this language: nothing to clean.
    if delimiters is None:
        return contents

    return cleaner.string(contents, *delimiters)

def clean_extras(contents) -> str:
    """Normalise whitespace and drop very long words from `contents`.

    Applied in order: remove carriage returns, delete every run of 15 or
    more word characters, collapse repeated newlines and repeated tabs to a
    single one, collapse runs of spaces to one space, and finally discard
    lines that are empty or whitespace-only.
    """
    # (pattern, replacement) pairs; the order matters and mirrors the steps above.
    substitutions = (
        (r'[\r]', ''),
        (r'\w{15,}', ''),
        ('[\n]{2,}', '\n'),
        ('[\t]{2,}', '\t'),
        (r'[ ]+', ' '),
    )
    for pattern, replacement in substitutions:
        contents = re.sub(pattern, replacement, contents)

    # str.strip returns '' (falsy) for blank lines, so filter drops them.
    non_blank_lines = filter(str.strip, contents.splitlines())
    return "\n".join(non_blank_lines)

def tokenize(source: str):
    """Turn `source` into a list of vocabulary ids of the CodeBERT tokenizer.

    The text is split with `bert_tokenizer.tokenize` (truncated to 512
    tokens), tokens found in the English stop-word list are dropped, the
    remaining tokens are passed through the WordNet lemmatizer, and the
    result is mapped to ids with `convert_tokens_to_ids`.
    """
    pieces = bert_tokenizer.tokenize(source, return_tensors="pt", truncation=True, max_length=512)

    kept = []
    for piece in pieces:
        if piece in stopwords_en:
            continue
        kept.append(lemmatizer.lemmatize(piece))

    return bert_tokenizer.convert_tokens_to_ids(kept)

def vectorize(tokens, max_length: int = 512) -> np.ndarray:
    """Embed a sequence of token ids with the CodeBERT encoder.

    The ids built by `tokenize` come from `bert_tokenizer.tokenize` followed
    by `convert_tokens_to_ids`, so they carry no special tokens. Reading
    position 0 of such a sequence yields the embedding of the first ordinary
    token, not of the classification token. This function therefore wraps
    the ids in the tokenizer's CLS/SEP ids before running the model.

    Args:
        tokens: iterable of vocabulary ids (may be empty).
        max_length: maximum total sequence length fed to the model,
            including the two special tokens. Longer inputs are truncated.

    Returns:
        1-D numpy array holding the last-layer hidden state at position 0
        (the CLS position).
    """
    cls_id = bert_tokenizer.cls_token_id
    sep_id = bert_tokenizer.sep_token_id

    ids = list(tokens)
    # Tolerate callers that already added the special tokens themselves.
    if ids and ids[0] == cls_id:
        ids = ids[1:]
    if ids and ids[-1] == sep_id:
        ids = ids[:-1]

    # Leave room for CLS and SEP inside the length budget.
    body_budget = max(max_length - 2, 0)
    ids = [cls_id] + ids[:body_budget] + [sep_id]

    # Explicit integer dtype: an empty list would otherwise become a float tensor.
    input_ids = torch.tensor(ids, dtype=torch.long).unsqueeze(0)

    with torch.no_grad():
        outputs = bert_model(input_ids)

    # Batch element 0, position 0 (CLS) represents the whole snippet.
    return outputs.last_hidden_state[0, 0, :].cpu().numpy()

In [3]:
def read_content(path, origin):
    """Read the file `origin + "/" + path` and return its text.

    The file is read as bytes and decoded with the default codec (UTF-8);
    byte sequences that cannot be decoded are silently dropped.

    Args:
        path: file path relative to `origin`.
        origin: directory prefix joined to `path` with a "/".

    Returns:
        The decoded file contents as a str.

    Raises:
        OSError: if the file cannot be opened or read.
    """
    full_path = origin + "/" + path

    # The context manager closes the handle even when read() raises.
    with open(full_path, 'rb') as file:
        return file.read().decode(errors="ignore")

In [4]:
def split_code_into_chunks(code, max_chars):
    """Group consecutive lines of `code` into chunks of bounded size.

    Despite its name, `max_chars` is compared against token counts: each
    line costs `len(tokenize(line)) + 1` (the +1 stands for the newline).
    Lines are accumulated until adding the next one would exceed
    `max_chars`; the accumulated lines are then emitted as one chunk and the
    next line starts a new chunk.

    Notes:
        * A single line that is over budget on its own still becomes a
          chunk by itself.
        * The lines left in the buffer when the input ends are NOT emitted,
          so only chunks that filled the budget are returned. The original
          final flush was disabled; that behaviour is kept.

    Args:
        code: source text to split.
        max_chars: per-chunk budget, measured in tokens as described above.

    Returns:
        List of chunk strings (lines joined with a newline), never
        containing an entry built from an empty buffer.
    """
    chunks = []
    current_chunk = []
    current_length = 0

    for line in code.split('\n'):
        line_cost = len(tokenize(line)) + 1  # +1 for the newline character

        if current_length + line_cost > max_chars:
            # The buffer is empty when the very first line is already over
            # budget; emitting it would add a spurious '' chunk.
            if current_chunk:
                chunks.append('\n'.join(current_chunk))
            current_chunk = [line]
            current_length = line_cost
        else:
            current_chunk.append(line)
            current_length += line_cost

    return chunks


In [5]:
def gen_chunk_entries(df: pd.DataFrame, ENTRIES = 3000, MAX_CHARS = 512) -> pd.DataFrame:
    """Build a frame of code chunks with at most `ENTRIES` rows per language.

    Rows of `df` are visited in order. Each row's `source` is split with
    `split_code_into_chunks(source, MAX_CHARS)` and the chunks are collected
    until the row's language has reached its quota.

    Args:
        df: frame with a `language` and a `source` column.
        ENTRIES: maximum number of chunks kept per language in TARGET.
        MAX_CHARS: budget forwarded to `split_code_into_chunks`.

    Returns:
        DataFrame with columns `language` and `source`, one row per chunk.
    """
    # Chunks collected so far, per target language.
    status = {lang: 0 for lang in TARGET}
    # Number of languages whose quota is not yet full (none can fill when ENTRIES <= 0).
    unfilled = len(status)

    new_rows = []

    with tqdm(total=ENTRIES * len(TARGET), position=0, leave=True) as pbar:
        for lang, source in zip(df['language'], df['source']):
            # Rows of languages outside TARGET are skipped instead of raising KeyError.
            if lang not in status or status[lang] >= ENTRIES:
                continue

            for chunk in split_code_into_chunks(source, MAX_CHARS):
                new_rows.append({'language': lang, 'source': chunk})
                status[lang] += 1
                pbar.update()

                if status[lang] >= ENTRIES:
                    unfilled -= 1
                    break

            # Every quota is full: the remaining rows cannot contribute.
            if unfilled == 0:
                break

    # Kept from the original: clears tqdm's internal registry of bars.
    tqdm._instances.clear()

    # Explicit columns keep the schema stable even when no chunk was produced.
    return pd.DataFrame(new_rows, columns=['language', 'source'])

In [6]:
# Languages kept from the dataset; gen_chunk_entries() collects chunks for each.
TARGET = [
    'C',
    'C#',
    'C++',
    'Dart',
    'Elixir',
    'Go',
    'JSON',
    'Java',
    'Javascript',
    'Julia',
    'Kotlin',
    'Markdown',
    'Ruby',
    'Rust',
    'Python',
]

In [7]:
# Load the file index and keep only the rows whose language is in TARGET.
df = pd.read_csv("/home/ae/repos/archivos/dataset.csv")
df = df.loc[df['language'].isin(TARGET)]

# Drop the size, line-count and extension columns.
df = df.drop(columns=['file_size', 'line_count', 'extension'])

df.head()

Unnamed: 0,id,file_path,language
0,1,Markdown/000001.md,Markdown
3,4,Markdown/000004.md,Markdown
4,5,Markdown/000005.md,Markdown
5,6,Markdown/000006.md,Markdown
6,7,JSON/000007.json,JSON


In [8]:
# Read every indexed file from the dataset directory into a new `source` column
# (read_content decodes the bytes and ignores undecodable sequences).
df['source'] = df['file_path'].progress_apply(lambda x: read_content(x, "/home/ae/repos/archivos/dataset"))
df.head()

100%|██████████████████████████████████████████████████████████████████████████████████████████| 74851/74851 [00:07<00:00, 10056.81it/s]


Unnamed: 0,id,file_path,language,source
0,1,Markdown/000001.md,Markdown,# Contributing\n\n| Component | Bui...
3,4,Markdown/000004.md,Markdown,# Azure SDK for .NET\n\n[![Packages](https://i...
4,5,Markdown/000005.md,Markdown,<!-- BEGIN MICROSOFT SECURITY.MD V0.0.5 BLOCK ...
5,6,Markdown/000006.md,Markdown,# Support\n\n## How to file issues and get hel...
6,7,JSON/000007.json,JSON,"{\n ""$schema"": ""https://raw.githubusercontent..."


In [9]:
# Row-wise pass: clean_comments needs both the text and the row's language.
df['source'] = df.progress_apply(lambda r: clean_comments(r.source, r.language), axis=1)

100%|███████████████████████████████████████████████████████████████████████████████████████████| 74851/74851 [00:15<00:00, 4741.71it/s]


In [10]:
# Normalise whitespace, drop blank lines and remove words of 15+ characters.
df['source'] = df['source'].progress_apply(clean_extras)

100%|███████████████████████████████████████████████████████████████████████████████████████████| 74851/74851 [01:12<00:00, 1035.09it/s]


In [11]:
# Up to 3000 chunks per language, using the default MAX_CHARS budget of 512.
df_chunk = gen_chunk_entries(df, 3000)

 94%|██████████████████████████████████████████████████████████████████████████████████████▉     | 42501/45000 [06:38<00:23, 106.77it/s]


In [12]:
# Token ids for every chunk (see tokenize); stored so vectorize can consume them.
df_chunk['tokens'] = df_chunk['source'].progress_apply(tokenize)

100%|████████████████████████████████████████████████████████████████████████████████████████████| 42501/42501 [02:26<00:00, 289.81it/s]


In [None]:
# Embeds one chunk per model call; this is by far the slowest cell of the notebook.
df_chunk['vector'] = df_chunk['tokens'].progress_apply(vectorize)

 36%|███████████████████████████████▎                                                        | 15101/42501 [2:47:05<43:17:55,  5.69s/it]

In [None]:
# Number of chunks per language; titled and labelled so the figure stands alone.
ax = df_chunk['language'].value_counts().plot(kind="bar", title="Chunks per language")
ax.set_xlabel("Language")
ax.set_ylabel("Number of chunks");

In [None]:
# Persist the chunk frame (language, source, tokens, vector) as a pickle file
# in the current working directory.
df_chunk.to_pickle("bert_clean_3000x512_aprox")