In [63]:
import pandas as pd
import numpy as np
import cleaner
import re
import torch
import nltk

from tqdm import tqdm
from nltk import word_tokenize
from nltk.corpus import stopwords
from transformers import AutoTokenizer, AutoModel
from nltk.stem.wordnet import WordNetLemmatizer

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

stopwords_en = stopwords.words('english')

lemmatizer = WordNetLemmatizer()

bert_tokenizer = AutoTokenizer.from_pretrained("microsoft/codebert-base")
bert_model = AutoModel.from_pretrained("microsoft/codebert-base")

tqdm.pandas()

[nltk_data] Downloading package stopwords to /home/ae/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ae/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/ae/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [31]:
TARGET = [
    'C', 'C#', 'C++', 'Dart', 'Elixir', 'Go', 'JSON', 'Java', 
    'Javascript', 'Julia', 'Kotlin', 'Markdown', 'Ruby', 'Rust', 'Python'
]

In [32]:
df = pd.read_csv("/home/ae/repos/archivos/dataset.csv")
df = df[df['language'].isin(TARGET)]

del df['file_size']
del df['line_count']
del df['extension']

df.head()

Unnamed: 0,id,file_path,language
0,1,Markdown/000001.md,Markdown
3,4,Markdown/000004.md,Markdown
4,5,Markdown/000005.md,Markdown
5,6,Markdown/000006.md,Markdown
6,7,JSON/000007.json,JSON


In [33]:
def read_content(path, origin):
    path = origin + "/" + path
    
    file = open(path, 'rb')
    contents = file.read().decode(errors = "ignore")
    file.close()
    
    return contents

In [34]:
df['source'] = df['file_path'].apply(lambda x: read_content(x, "/home/ae/repos/archivos/dataset"))
df.head()

Unnamed: 0,id,file_path,language,source
0,1,Markdown/000001.md,Markdown,# Contributing\n\n| Component | Bui...
3,4,Markdown/000004.md,Markdown,# Azure SDK for .NET\n\n[![Packages](https://i...
4,5,Markdown/000005.md,Markdown,<!-- BEGIN MICROSOFT SECURITY.MD V0.0.5 BLOCK ...
5,6,Markdown/000006.md,Markdown,# Support\n\n## How to file issues and get hel...
6,7,JSON/000007.json,JSON,"{\n ""$schema"": ""https://raw.githubusercontent..."


In [35]:
df_clean = df.copy()
df_dirty = df.copy()

In [70]:
def clean_comments(contents, language) -> str:
    delimiters = cleaner.lang(language)
    
    if delimiters is not None:
        contents = cleaner.string(contents, *delimiters)
        
    return contents

def clean_extras(contents) -> str:
    contents = re.sub(r'[\r]', '', contents)
    contents = re.sub(r'\w{15,}', '', contents)
    contents = re.sub('[\n]{2,}', '\n', contents)
    contents = re.sub('[\t]{2,}', '\t', contents)
    contents = re.sub(r'[ ]+', ' ', contents)

    contents = "\n".join([line for line in contents.splitlines() if len(line.strip()) != 0])

    return contents

def tokenize(source: str) -> torch.Tensor:
    tokens = bert_tokenizer.tokenize(source)
    lemmas = (lemmatizer.lemmatize(token) for token in tokens if token not in stopwords_en)
    return torch.tensor(bert_tokenizer.convert_tokens_to_ids(lemmas))[None,:]
    

def vectorize(tokens: torch.Tensor) -> np.ndarray:    
    embed = bert_model(tokens[1])
    
    return embed.detach().numpy()

In [37]:
df_clean['source'] = df_clean.apply(lambda r: clean_comments(r.source, r.language), axis=1)
df_clean.head()

Unnamed: 0,id,file_path,language,source
0,1,Markdown/000001.md,Markdown,# Contributing\n\n| Component | Bui...
3,4,Markdown/000004.md,Markdown,# Azure SDK for .NET\n\n[![Packages](https://i...
4,5,Markdown/000005.md,Markdown,\n\n## Security\n\nMicrosoft takes the securit...
5,6,Markdown/000006.md,Markdown,# Support\n\n## How to file issues and get hel...
6,7,JSON/000007.json,JSON,"{\n ""$schema"": ""https:\n ""meta"": {\n ""aut..."


In [38]:
df_clean['source'] = df_clean['source'].apply(clean_extras)
df_dirty['source'] = df_dirty['source'].apply(clean_extras)

In [72]:
def split_code_into_chunks(code, max_chars):
    lines = code.split('\n')
    chunks = []
    current_chunk = []
    current_length = 0
    
    for line in lines:
        if current_length + len(line) + 1 > max_chars:
            chunks.append('\n'.join(current_chunk))
            current_chunk = [line]
            current_length = len(line) + 1  # +1 for the newline character
        else:
            current_chunk.append(line)
            current_length += len(line) + 1  # +1 for the newline character
    
    # Add the last chunk
    # if current_chunk:
        # chunks.append('\n'.join(current_chunk))
    
    return chunks


In [73]:
def gen_chunk_entries(df: pd.DataFrame, ENTRIES = 3000, MAX_CHARS = 512) -> pd.DataFrame:
    status = {}
    bufs = {}
    
    for lang in list(TARGET):
        status[lang] = 0
    
    new_rows = []
    for _, row in df.iterrows():
        language = row.language
        
        if status[language] < ENTRIES:
            for chunk in split_code_into_chunks(row.source, MAX_CHARS):
                if MAX_CHARS * 0.8 >= float(len(chunk)) and status[language] < ENTRIES:
                    new_rows.append({'language': language, 'source': chunk})
                    status[language] += 1
    
    return pd.DataFrame(new_rows)


In [90]:
df_clean_chunks = gen_chunk_entries(df_clean, 500)
df_dirty_chunks = gen_chunk_entries(df_dirty, 500)

In [91]:
df_clean_chunks['tokens'] = df_clean_chunks['source'].progress_apply(tokenize)
df_dirty_chunks['tokens'] = df_dirty_chunks['source'].progress_apply(tokenize)

100%|████████████████████████| 3830/3830 [00:03<00:00, 1005.42it/s]
100%|█████████████████████████| 4186/4186 [00:04<00:00, 928.47it/s]


In [92]:
df_clean_chunks = df_clean_chunks[df_clean_chunks['tokens'].apply(lambda x: x.shape) != (1, 0)]
df_dirty_chunks = df_dirty_chunks[df_dirty_chunks['tokens'].apply(lambda x: x.shape) != (1, 0)]

In [89]:
df_clean_chunks['vector'] = df_clean_chunks['tokens'].progress_apply(vectorize)
df_dirty_chunks['vector'] = df_dirty_chunks['tokens'].progress_apply(vectorize)

  0%|                            | 1/3809 [00:00<00:05, 646.87it/s]


IndexError: index 1 is out of bounds for dimension 0 with size 1