<a href="https://colab.research.google.com/github/Agustin-Galarza/tp_nlp/blob/main/tp_nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import data

### Imports

In [1]:
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
from dataclasses import dataclass
import json
from typing import Dict, List, Optional, Iterable
from pandas import DataFrame
from enum import Enum
import sys

## Get credentials to access the Drive folder

In [2]:
# Authenticate the Colab user, then hand the application-default credentials
# to PyDrive; the resulting `drive` client is used below to list and download
# the dataset file.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
drive = GoogleDrive(gauth)

## Read file and store all the parsed programs

In [3]:
drive_filename = "Repos_TypeScriptJan2022_duplicates_removed3.jsonl"

datadir = "./sample_data"
filename = "test_data.jsonl"

# Download the file from Drive; the title must match exactly one Drive file
results = drive.ListFile({'q': f'title = "{drive_filename}"'}).GetList()
if len(results) == 0:
  raise Exception("No file found")
if len(results) > 1:
  raise Exception("Too many results")
file = results[0]
file.GetContentFile(f'{datadir}/{filename}')

# Read the downloaded JSONL file and collect the token list of every source file

## Each line of the dataset is one GitHub repository, parsed as JSON; its
## "filedata" entry maps to per-file records that carry a "tokens" list
REPOS_TO_SCAN = -1 # -1 to scan all repos
FILES_PER_REPO = -1  # -1 to get all files from each selected repo

files: List[List[str]] = []
with open(f"{datadir}/{filename}", encoding="utf-8") as dataset:
    for line_no, line in enumerate(dataset):
        if REPOS_TO_SCAN != -1 and line_no == REPOS_TO_SCAN:
            break
        if not line.strip():
            # A blank line (e.g. a trailing newline) is not a repository record
            continue
        repository = json.loads(line)
        # A record without "filedata" contributes no files instead of crashing
        filesdata: Dict = repository.get("filedata") or {}
        # `file_entry` (not `file`) so the Drive handle above is not overwritten
        for i, file_entry in enumerate(filesdata.values()):
            if FILES_PER_REPO != -1 and i == FILES_PER_REPO:
                break
            tokens = file_entry.get("tokens")
            if tokens is None:
                # Skip files without tokens; a None entry would break the
                # tokenizer later, which calls len() on each token list
                continue
            files.append(tokens)

## Function and type definitions

In [4]:
@dataclass
class FunctionData:
    """Name, parameter tokens and body tokens of one parsed function."""

    # Function identifier; None while a placeholder has not been named yet
    name: Optional[str]
    # Tokens collected from the parameter list
    params: List[str]
    # Tokens collected from the function body
    block: List[str]

    def copy(self) -> "FunctionData":
        """Return a copy whose ``params`` and ``block`` lists are independent.

        Returns:
            A new ``FunctionData`` with the same name and shallow copies of
            both token lists.
        """
        # str is immutable, so the name can be shared as-is; slicing it (as
        # the previous version did) raised TypeError when the name was None.
        return FunctionData(self.name, list(self.params), list(self.block))


class ParametersParsingData:
    """Bracket-tracking state used while a parameter list is being parsed.

    All fields are per-instance. They used to be class-level lists, which
    every instance shared, so leftover brackets from one parameter list
    leaked into the next one.
    """

    def __init__(self) -> None:
        # True while the tokens being read belong to a parameter's type annotation
        self.is_type_def: bool = False
        # Stack of currently open "{" tokens
        self.braces: List[str] = []
        # Stack of currently open "<" tokens
        self.sharp_braces: List[str] = []
        # Stack of currently open "(" tokens
        self.parenthesis: List[str] = []


class BlockParsingData:
    """Brace-tracking state used while a function body is being parsed.

    The stack is per-instance. It used to be a class-level list shared by
    every instance, so braces left over from an aborted parse leaked into
    the next function body.
    """

    def __init__(self) -> None:
        # Stack of currently open "{" tokens
        self.braces: List[str] = []


class ReturnTypeData:
    """State used while a function's return type annotation is being skipped."""

    # Number of currently open containers; the parser raises it on
    # "{", "(", "[", "<" and lowers it on "}", ")", "]", ">"
    containers: int = 0


class Tokenizer:
    """Forward-only cursor over a token list, with one remembered marker position."""

    def __init__(self, tokens):
        self.tokens = tokens
        self.tokens_len = len(tokens)
        # Both the cursor and the marker start at the first token
        self.current_index = 0
        self.marker = 0

    def current_token(self) -> Optional[str]:
        """Returns the token under the cursor, or None once the cursor is past the end"""
        if self.current_index >= self.tokens_len:
            return None
        return self.tokens[self.current_index]

    def consume_token(self) -> Optional[str]:
        """Returns the token under the cursor and moves the cursor one step forward"""
        consumed = self.current_token()
        self.current_index += 1
        return consumed

    def is_current_token(self, token: str) -> bool:
        """Tells whether the token under the cursor equals the given one"""
        return self.current_token() == token

    def set_marker(self) -> None:
        """Remembers the current cursor position as the marker"""
        self.marker = self.current_index

    def from_marker_to(self, n: int) -> List[str]:
        """Returns the marked token plus up to n tokens after it; raises ValueError if n < 0"""
        if n < 0:
            raise ValueError(f"n must be positive but is {n}")
        start = self.marker
        stop = start + n + 1
        return self.tokens[start:stop]

    def from_marker_to_current(self) -> List[str]:
        """Returns the tokens from the marker through the cursor position, both included"""
        stop = self.current_index + 1
        return self.tokens[self.marker : stop]


class State(Enum):
    """States of the function-parsing state machine."""

    Start = "Start"
    End = "End"
    Error = "Error"
    FunctionName = "FunctionName"
    FunctionGeneric = "FunctionGeneric"
    Parameters = "Parameters"
    ParseParameter = "ParseParameter"
    ReturnType = "ReturnType"
    ParseReturnType = "ParseReturnType"
    FunctionBlock = "FunctionBlock"
    ParseBlockContent = "ParseBlockContent"

    def is_end_state(self, state: "State") -> bool:
        """Tell whether ``state`` is ``State.End`` or ``State.Error``.

        Only the ``state`` argument is inspected, not the member the method
        is called on.
        """
        # The operands were previously inverted (`[...] in state`), which
        # raised TypeError because an enum member is not a container.
        return state in (State.End, State.Error)


class FunctionParser:
    """Extracts function declarations from the token list of one source file.

    The parser is a hand-written state machine: it scans for the ``function``
    keyword, then reads the name, skips an optional generic list, collects the
    parameter tokens, skips an optional return type and finally collects the
    brace-delimited body. Parsing stops at the first error; the error report
    is printed to ``error_file``.
    """

    def __init__(self, file, error_file=sys.stderr):
        """
        Args:
            file: Token list of the file to parse.
            error_file: Writable text stream that receives the error report.
        """
        self.tokens = file
        self.functions: List[FunctionData] = []
        self.error_file = error_file
        # Last error report; None while no error has been recorded
        self.error_msg = None

    def __invalid_function_name(self, name: str) -> bool:
        """Tell whether ``name`` contains a bracket character, i.e. is not an identifier."""
        invalid_chars_in_name = set("{}()<>")
        return any((c in invalid_chars_in_name) for c in name)

    def __build_err_msg_for_function(
        self, title: str, tokenizer: Tokenizer, fn_data: FunctionData
    ) -> None:
        """Store an error report in ``self.error_msg``; returns nothing.

        The report holds the title, up to 151 tokens starting at the marker,
        the tokens from the marker to the cursor, and the data collected so far.
        """
        self.error_msg = f"""
            {title}
            raw: {" ".join(tokenizer.from_marker_to(150))}
            current: {" ".join(tokenizer.from_marker_to_current())}
            name: {fn_data.name}
            params: {fn_data.params}
            block: {fn_data.block}
            ====================================================================================================================
            """

    def parse(self) -> List[FunctionData]:
        """Run the state machine over the tokens and return the functions found.

        Returns:
            ``self.functions``, extended with one independent ``FunctionData``
            per function parsed before the end of input or the first error.
        """
        tokenizer = Tokenizer(self.tokens)
        state = State.Start
        current_function = FunctionData(None, [], [])
        self.block_state = BlockParsingData()
        self.error_msg = None

        while tokenizer.current_token() is not None and state is not State.Error:
            if state == State.Start:
                token = tokenizer.consume_token()
                if token == "function":
                    # The marker sits on the token after "function" so error
                    # reports start at the function name
                    tokenizer.set_marker()
                    state = State.FunctionName

            elif state == State.FunctionName:
                function_name = tokenizer.current_token()
                if function_name == "*":
                    # Presumably a generator declaration (`function* name`);
                    # it is not supported, so the function is discarded.
                    # `continue` is required: falling through would overwrite
                    # the End state and dereference the None function below.
                    current_function = None
                    state = State.End
                    continue
                if self.__invalid_function_name(function_name):
                    # No identifier after `function`: leave the token in place
                    # (it starts the generics or the parameters)
                    function_name = "anonymous"
                else:
                    tokenizer.consume_token()
                state = (
                    State.FunctionGeneric
                    if tokenizer.is_current_token("<")
                    else State.Parameters
                )
                # Start from a fresh object: clearing and reusing the previous
                # one would wipe the data of functions already appended to
                # self.functions, since they would all be the same object.
                current_function = FunctionData(function_name, [], [])

            elif state == State.FunctionGeneric:
                # Ignore function type declaration; also stop at end of input
                # so a missing ">" cannot loop forever
                while (
                    tokenizer.current_token() is not None
                    and not tokenizer.is_current_token(">")
                ):
                    tokenizer.consume_token()
                tokenizer.consume_token()
                state = State.Parameters

            elif state == State.Parameters:
                if not tokenizer.is_current_token("("):
                    # The builder stores the report itself and returns None,
                    # so its result must not be assigned to self.error_msg
                    self.__build_err_msg_for_function(
                        "Unrecongnized token for Parameters",
                        tokenizer,
                        current_function,
                    )
                    state = State.Error
                    continue
                self.params_state = ParametersParsingData()
                state = State.ParseParameter

            elif state == State.ParseParameter:
                token: str = tokenizer.consume_token()

                if token == "(":
                    self.params_state.parenthesis.append(token)
                elif token == ")":
                    if not self.params_state.parenthesis:
                        self.__build_err_msg_for_function(
                            "Bad Parenthesis", tokenizer, current_function
                        )
                        state = State.Error
                        continue
                    # The stack was just checked to be non-empty
                    self.params_state.parenthesis.pop()
                    if not self.params_state.parenthesis:
                        # The outermost ")" closes the parameter list
                        state = State.FunctionBlock
                        continue

                elif self.params_state.is_type_def:
                    # Inside a type annotation: only track nesting, so that a
                    # "," inside `{...}` or `<...>` does not end the annotation
                    if token == "{":
                        self.params_state.braces.append(token)
                    elif token == "}":
                        try:
                            self.params_state.braces.pop()
                        except IndexError:
                            # Unbalanced "}": discard this function
                            current_function = None
                            state = State.End
                    elif token == "<":
                        self.params_state.sharp_braces.append(token)
                    elif token == ">":
                        try:
                            self.params_state.sharp_braces.pop()
                        except IndexError:
                            # Unbalanced ">": discard this function
                            current_function = None
                            state = State.End
                    elif (
                        token == ","
                        and not self.params_state.braces
                        and not self.params_state.sharp_braces
                    ):
                        self.params_state.is_type_def = False
                else:
                    if token == ":":
                        self.params_state.is_type_def = True
                    elif token != "," and token != "{" and token != "}":
                        current_function.params.append(token)

            elif state == State.ReturnType:
                self.return_type_data = ReturnTypeData()
                if tokenizer.is_current_token("{"):
                    # A "{" right after ":" opens the return type itself, not
                    # the function body
                    self.return_type_data.containers += 1
                    tokenizer.consume_token()
                state = State.ParseReturnType

            elif state == State.ParseReturnType:
                while True:
                    token = tokenizer.current_token()
                    if token is None:
                        # End of input inside the return type: leave the loop
                        # (it used to spin forever); the unfinished function
                        # is reported after the main loop
                        break
                    if token == "{":
                        if self.return_type_data.containers == 0:
                            # A "{" outside any container starts the body
                            break
                        self.return_type_data.containers += 1
                    elif token in ["(", "[", "<"]:
                        self.return_type_data.containers += 1
                    elif token in [")", "]", ">", "}"]:
                        self.return_type_data.containers -= 1

                    tokenizer.consume_token()

                state = State.FunctionBlock

            elif state == State.FunctionBlock:
                if tokenizer.is_current_token(";"):
                    # Is a function call
                    current_function = None
                    state = State.End
                elif tokenizer.is_current_token(":"):
                    tokenizer.consume_token()
                    state = State.ReturnType
                elif not tokenizer.is_current_token("{"):
                    self.__build_err_msg_for_function(
                        "Unrecognized token for Function Block",
                        tokenizer,
                        current_function,
                    )
                    state = State.Error
                else:
                    self.block_state = BlockParsingData()
                    state = State.ParseBlockContent

            elif state == State.ParseBlockContent:
                token = tokenizer.consume_token()
                current_function.block.append(token)

                if token.startswith("{"):
                    self.block_state.braces.append(token)
                elif token.endswith("}"):
                    if not self.block_state.braces:
                        self.__build_err_msg_for_function(
                            "Bad Program", tokenizer, current_function
                        )
                        state = State.Error
                        continue
                    # The stack was just checked to be non-empty
                    self.block_state.braces.pop()
                    if not self.block_state.braces:
                        # The outermost "}" closes the function body
                        state = State.End

            elif state == State.End:
                if current_function is not None:
                    self.functions.append(current_function)
                else:
                    current_function = FunctionData(None, [], [])
                state = State.Start

        if state is State.End and current_function is not None:
            # The input ended on a function's closing brace, so the loop never
            # reached the End branch; without this the last function was lost
            self.functions.append(current_function)

        if state not in [State.Start, State.End, State.Error]:
            self.__build_err_msg_for_function(
                "Reached end of file wihtout completion", tokenizer, current_function
            )
            state = State.Error
        if state is State.Error:
            if self.error_msg is None:
                self.__build_err_msg_for_function(
                    "Unknown error", tokenizer, current_function
                )

            print(self.error_msg, file=self.error_file)

        return self.functions


def extract_functions(
    tokens_list: Iterable[List[str]],
) -> List[FunctionData]:
    """Parse every token list and return all functions found.

    Each token list is parsed with its own ``FunctionParser``. Parse errors
    are written to ``./errors.log``, which is emptied at the start of every
    call.

    Args:
        tokens_list: One token list per source file.

    Returns:
        The functions of all files, in parsing order.
    """
    functions: List[FunctionData] = []
    # "w" creates the log or empties an existing one, like the former
    # "a+" followed by truncate(0); the with-block closes the file even when
    # a parser raises.
    with open("./errors.log", "w", encoding="utf-8") as error_file:
        for tokens in tokens_list:
            parser = FunctionParser(tokens, error_file)
            functions.extend(parser.parse())
    return [fn for fn in functions if fn is not None]


## Manipulate Data

In [5]:
# Parse every token list collected in `files`; these are the correctly named
# functions. Parse errors are written to ./errors.log by extract_functions.
good_functions = extract_functions(files)


### Swap function names to get bad results

In [10]:
import random as rnd

def switch_function_name(fn1: FunctionData, fn2: FunctionData) -> FunctionData:
  """Build a function that has fn1's params and block but fn2's name.

  The params and block lists are shared with fn1, not copied.
  """
  borrowed_name = fn2.name
  return FunctionData(name=borrowed_name, params=fn1.params, block=fn1.block)

good_fns_amount = len(good_functions)

bad_functions: List[FunctionData] = []
comparisons = []

# A swap needs at least two functions. With exactly one, the former
# "redraw until the index differs" loop could never terminate.
if good_fns_amount > 1:
  for i, fn in enumerate(good_functions):
    # Pick uniformly among the other n-1 indices without retrying: draw from
    # [0, n-2] and shift draws at or past i up by one, so i is never selected.
    other_index = rnd.randrange(good_fns_amount - 1)
    if other_index >= i:
      other_index += 1

    other_fn = good_functions[other_index]

    # NOTE(review): other_fn can carry the same name as fn (e.g. two
    # "anonymous" functions); such a "bad" sample equals its good one.
    comparisons.append((fn.name, other_fn.name))
    bad_functions.append(switch_function_name(fn, other_fn))

# Dump the (original name, swapped-in name) pairs to check the swaps by eye
DataFrame(comparisons).to_csv('./comparisons.csv')

functions = good_functions + bad_functions


### Build Final Corpus

In [None]:
!pip install -U -q gensim
from gensim.models import Word2Vec
import gensim.downloader
import re

def parse_cammel_case(fn_name: str) -> List[str]:
  """Split an identifier into lowercase runs and capitalised words.

  Every uppercase letter starts a new piece; characters that are not ASCII
  letters (digits, underscores, ...) are dropped.
  """
  return [piece.group(0) for piece in re.finditer("[a-z]+|[A-Z][a-z]*", fn_name)]