<a href="https://colab.research.google.com/github/Agustin-Galarza/socksv5-proxy-protos/blob/main/tp_nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import data

### Imports

In [None]:
!pip install -U -q PyDrive
import json
from dataclasses import dataclass
from typing import Dict, List, Optional

from google.colab import auth
from oauth2client.client import GoogleCredentials
from pandas import DataFrame
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive

## Get credentials to access the Drive folder

In [None]:
# Authenticate the current Colab user, then build a PyDrive client whose
# credentials are the application-default Google credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE(review): `drive` is not referenced by any of the cells visible here;
# the dataset below is read from a local relative path.
drive = GoogleDrive(gauth)

## Read file and store all the parsed programs

In [None]:
# Limits for quick experiments; -1 disables the corresponding limit.
REPOS_TO_SCAN = -1  # maximum number of dataset lines (repositories) to read
FILES_PER_REPO = -1  # maximum number of files taken from each repository
filename = "Repos_TypeScriptJan2022_duplicates_removed3.jsonl"

# One entry per source file: the token list stored under its "tokens" key.
files: List[List[str]] = []
# The encoding is explicit so the read does not depend on the platform's
# default locale encoding.
with open(
    f"../corpus/ts_dataset/datafiles/raw_corpus/{filename}", encoding="utf-8"
) as dataset:
    for line_no, line in enumerate(dataset):
        if REPOS_TO_SCAN != -1 and line_no == REPOS_TO_SCAN:
            break
        if not line.strip():
            # A blank line (e.g. a trailing newline) is not a JSON document.
            continue
        repository = json.loads(line)
        # Repositories without a "filedata" entry contribute no files
        # instead of failing on `None.values()`.
        filesdata: Dict = repository.get("filedata") or {}
        for i, file in enumerate(filesdata.values()):
            if FILES_PER_REPO != -1 and i == FILES_PER_REPO:
                break
            tokens = file.get("tokens")
            if tokens is not None:
                # Entries without tokens are skipped: a None here would
                # break the later `len(file)` call in extract_functions.
                files.append(tokens)

## Function and type definitions

In [None]:
@dataclass
class FunctionData:
    """One function recovered from a token stream by extract_single_function."""

    # Token that follows the `function` keyword, or "anonymous" when the
    # keyword is immediately followed by "(".
    name: str
    # Tokens collected while scanning the parameter list.
    params: List[str]
    # Tokens that follow the parameter list, up to and including the brace
    # that closes the function body.
    block: List[str]
# Shared append-mode log that the extraction functions print their parse
# failures to. The encoding is explicit because the logged tokens come from
# arbitrary source files; with a non-UTF-8 locale default, printing a
# non-ASCII token would raise UnicodeEncodeError in the middle of parsing.
# NOTE(review): the handle stays open for the lifetime of the session.
err_file = open("./errors.log", "+a", encoding="utf-8")

def _log_parse_failure(
    header: str,
    raw_tokens: List[str],
    raw_limit: int,
    params: List[str],
    body_tokens: Optional[List[str]] = None,
    report_last_token: bool = False,
) -> None:
    """Append one parse-failure record to the shared ``err_file`` log.

    Args:
        header: First line of the record (the failure kind).
        raw_tokens: The token list that was being parsed.
        raw_limit: How many leading tokens of ``raw_tokens`` to print.
        params: Parameter tokens collected before the failure.
        body_tokens: Body tokens collected so far; the "body:" line is
            omitted when this is None.
        report_last_token: When true, also print the final input token.
    """
    print(header, file=err_file)
    if report_last_token:
        print("last token:", raw_tokens[-1] if raw_tokens else None, file=err_file)
    print("raw:", " ".join(raw_tokens[:raw_limit]), file=err_file)
    print("params:", params, file=err_file)
    if body_tokens is not None:
        print("body:", body_tokens, file=err_file)
    print("====================================================================================================================", file=err_file)


def extract_single_function(tokens: List[str]) -> Optional[FunctionData]:
    """Parse the function that starts at the beginning of ``tokens``.

    ``tokens[0]` is expected to be the ``function`` keyword. The list is only
    read through an index; it is never modified.

    Args:
        tokens: Token stream beginning at a ``function`` keyword. It may
            continue past the end of the function.

    Returns:
        A ``FunctionData`` with the function name ("anonymous" when the
        keyword is directly followed by "("), the parameter tokens, and every
        token after the parameter list up to and including the brace that
        closes the body. Returns None when the parameter list is followed by
        ";", when the parentheses or braces are unbalanced, or when the
        tokens run out before the function is complete. Failures other than
        the ";" case are reported to ``err_file``.
    """
    total = len(tokens)
    params: List[str] = []
    body_tokens: List[str] = []

    pos = 1  # index 0 holds the 'function' keyword
    if pos >= total:
        _log_parse_failure(
            "Bad Parsing", tokens, 200, params, body_tokens, report_last_token=True
        )
        return None

    fn_name = "anonymous"
    if tokens[pos] != "(":
        fn_name = tokens[pos]
        pos += 1

    # Skip a generic parameter list. The scan stops at the first ">" token,
    # so nested generics are not recognised here.
    if pos < total and tokens[pos] == "<":
        while pos < total and tokens[pos] != ">":
            pos += 1
        pos += 1  # step past the ">" itself

    # Parameter list. After a ":" the parser is inside a type annotation
    # (or default value) and ignores tokens until a "," at nesting level zero
    # or the ")" that closes the enclosing parameter-list parenthesis.
    in_type = False
    paren_depth = 0    # parameter-list parentheses currently open
    type_parens = 0    # parentheses opened inside the current annotation
    type_nesting = 0   # {}, <> and [] opened inside the current annotation
    while pos < total:
        token = tokens[pos]
        pos += 1

        if in_type:
            if token == ")" and type_parens == 0:
                # This ")" does not belong to the annotation; leave type mode
                # and let the parameter-list handling below process it.
                in_type = False
                type_nesting = 0
            else:
                if token == "(":
                    type_parens += 1
                elif token == ")":
                    type_parens -= 1
                elif token in ("{", "<", "["):
                    type_nesting += 1
                elif token in ("}", ">", "]"):
                    # Clamp at zero so a stray closer cannot raise.
                    type_nesting = max(type_nesting - 1, 0)
                elif token == "," and type_nesting == 0 and type_parens == 0:
                    in_type = False
                continue

        if token == "(":
            paren_depth += 1
        elif token == ")":
            if paren_depth == 0:
                _log_parse_failure("Bad Parenthesis", tokens, 100, params)
                return None
            paren_depth -= 1
            if paren_depth == 0:
                break
        elif token == ":":
            # Must be tested before the generic "collect" branch below,
            # otherwise ":" is collected as a parameter and type mode is
            # never entered.
            in_type = True
        elif token not in (",", "{", "}"):
            params.append(token)
    else:
        # Ran out of tokens before the parameter list was closed.
        _log_parse_failure(
            "Bad Parsing", tokens, 200, params, body_tokens, report_last_token=True
        )
        return None

    if pos >= total:
        _log_parse_failure(
            "Bad Parsing", tokens, 200, params, body_tokens, report_last_token=True
        )
        return None
    if tokens[pos] == ";":
        # A ";" right after the parameter list: there is no body to extract.
        return None

    # Body: collect tokens until the brace opened first is closed again.
    # Tokens that merely start with "{" or end with "}" count as braces too.
    brace_depth = 0
    for idx in range(pos, total):
        token = tokens[idx]
        body_tokens.append(token)

        if token.startswith("{"):
            brace_depth += 1
        elif token.endswith("}"):
            if brace_depth == 0:
                _log_parse_failure("Bad Program", tokens, 200, params, body_tokens)
                return None
            brace_depth -= 1
            if brace_depth == 0:
                return FunctionData(fn_name, params, body_tokens)

    # The token stream ended while the body was still open (or never opened).
    _log_parse_failure(
        "Bad Parsing", tokens, 200, params, body_tokens, report_last_token=True
    )
    return None

def extract_functions(file: List[str]) -> List[FunctionData]:
    """Extract every function that can be parsed from one file's tokens.

    Each occurrence of the ``function`` token is handed to
    ``extract_single_function`` together with all tokens that follow it.
    Occurrences that cannot be parsed are skipped.

    Args:
        file: The token list of a single source file.

    Returns:
        The successfully parsed functions, in order of appearance.
    """
    functions: List[FunctionData] = []

    for token_no, token in enumerate(file):
        if token != "function":
            continue
        try:
            # A slice (copy) is passed so the parser can never alter `file`.
            extracted_fn = extract_single_function(file[token_no:])
        except Exception as exc:
            # `Exception` rather than a bare `except` so KeyboardInterrupt and
            # SystemExit still stop a long extraction run.
            print("Unexpected error", file=err_file)
            print("error:", repr(exc), file=err_file)
            print(" ".join(file[token_no:token_no+200]), file=err_file)
            print("====================================================================================================================", file=err_file)
            continue
        if extracted_fn is not None:
            functions.append(extracted_fn)

    return functions


## Manipulate Data

In [None]:
# Flatten the functions extracted from every loaded file into one list.
functions: List[FunctionData] = [
    fn for file in files for fn in extract_functions(file)
]

# The log handle stays open for the whole session, so flush it here to make
# errors.log complete on disk once extraction has finished.
err_file.flush()

# Build a table from the FunctionData records and write it out as CSV.
df = DataFrame(functions)

df.to_csv("corpus.csv")