In [2]:
import ast
import base64
import os

In [3]:
from tree_sitter import Language, Parser

In [4]:
from divergen.codebase import Codebase

In [53]:
# Load the base64-encoded OpenAI credentials and export them as environment variables.
with open("../secrets.json") as secrets:
    # ast.literal_eval only accepts plain literals (dicts, strings, numbers, ...),
    # so a tampered secrets file can no longer run arbitrary code as eval() allowed.
    secrets_dict = ast.literal_eval(secrets.read())

open_api_key = base64.b64decode(secrets_dict["openai_api_key"]).decode('ascii')
os.environ["OPENAI_API_KEY"] = open_api_key

# The organization id is optional. Bind the name up front so the `del` below
# does not raise NameError when the key is missing from the secrets file.
openai_organization = None
if "organization_id" in secrets_dict:
    openai_organization = base64.b64decode(secrets_dict["organization_id"]).decode('ascii')
    os.environ["OPENAI_ORGANIZATION"] = openai_organization

# Keep the decoded secrets out of the interactive namespace.
del open_api_key, openai_organization, secrets_dict

# Divergen Basic Use

In [11]:
os.chdir("../examples/ml_repo_s")

In [12]:
codebase_test = Codebase()

In [13]:
codebase_test.parse_modules()

In [17]:
processing_class = codebase_test.get_modules()[0].get_entities()[0]

In [26]:
processing_class.body_idx

5

In [25]:
processing_class.module.node.body

[<ast.Import at 0x1071e3ca0>,
 <ast.Import at 0x1071e3c40>,
 <ast.ImportFrom at 0x1071e3be0>,
 <ast.ImportFrom at 0x1071e3b80>,
 <ast.ImportFrom at 0x1071e3b20>,
 <ast.ClassDef at 0x1071e3a90>,
 <ast.ClassDef at 0x1071e16f0>,
 <ast.If at 0x1071eb070>]

In [29]:
processing_class.module.node.body[processing_class.body_idx]

<ast.ClassDef at 0x1071e3a90>

In [43]:
print(processing_class.module.code)

import pandas as pd
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

class ProcessingPipeline:

    def __init__(self):
        self.data = None

    def load_data(self, file_path):
        try:
            self.data = pd.read_csv(file_path)
            print('Data loaded successfully.')
        except Exception as e:
            print(f'Error loading data: {str(e)}')

    def explore_data(self):
        if self.data is not None:
            print('Data Exploration:')
            print(self.data.describe())
        else:
            print('No data loaded for exploration.')

    def clean_data(self):
        if self.data is not None:
            print('Data Cleaning:')
            self.data = self.data.drop_duplicates()
            self.data = self.data.dropna()
            print('Data cleaned.')
        else:
            print('No data loaded fo

In [44]:
# Drop the first top-level statement of the module AST (the duplicated
# `import pandas as pd` visible in the printed module code above).
# NOTE(review): not idempotent - every re-run of this cell removes one more statement.
processing_class.module.node.body = processing_class.module.node.body[1:]

In [47]:
processing_class.module.merge_entities("code")

In [111]:
processing_class.module.name

'ml_pipeline'

In [48]:
print(processing_class.module.code)

import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report

class ProcessingPipeline:

    def __init__(self):
        self.data = None

    def load_data(self, file_path):
        try:
            self.data = pd.read_csv(file_path)
            print('Data loaded successfully.')
        except Exception as e:
            print(f'Error loading data: {str(e)}')

    def explore_data(self):
        if self.data is not None:
            print('Data Exploration:')
            print(self.data.describe())
        else:
            print('No data loaded for exploration.')

    def clean_data(self):
        if self.data is not None:
            print('Data Cleaning:')
            self.data = self.data.drop_duplicates()
            self.data = self.data.dropna()
            print('Data cleaned.')
        else:
            print('No data loaded for cleaning.')

    d

# Tree Sitter

## Divergen-like

In [55]:
os.chdir("../../notebooks/")

In [4]:
# Run this just once to build the grammar library (uncomment it first)
# Language.build_library(
#   # Store the library in the `build` directory
#   'build/my-languages.so',

#   # Include one or more languages
#   [
#     '/Users/itortouch/GiTor/tree-sitter-java'
#   ]
# )

In [142]:
class ts_BaseEntity:
    """State and edit helpers shared by ``ts_Module`` and ``ts_Entity``.

    Args:
        node: Parsed node; must expose ``text`` as bytes (read by ``set_code``).
        parser: Object whose ``parse(bytes)`` returns a tree with a ``root_node``.
        code: Source text associated with ``node``.
        modified: Whether the entity has been changed through ``modify``.
    """

    def __init__(self, node, parser, code: str = "", modified: bool = False) -> None:
        self.node = node
        self.parser = parser
        self.code = code
        self.modified = modified

    def get(self, attr):
        """Return the attribute called ``attr`` (raises AttributeError if missing)."""
        return getattr(self, attr)

    def modify(self, attr, value):
        """Set ``attr`` to ``value`` and flag the entity as modified.

        When ``attr`` is ``"code"`` the new text is also re-parsed so that
        ``self.node`` matches it (see ``update_node``).
        """
        setattr(self, attr, value)
        self.modified = True
        if attr == "code":
            self.update_node(value)

    def update_node(self, code: str):
        """Re-parse ``code`` and replace ``self.node`` with the new root node.

        Only ``ts_Module`` and ``ts_Entity`` instances are re-parsed; for any
        other instance this is a no-op.
        """
        if not isinstance(self, (ts_Module, ts_Entity)):
            return

        # TODO: With this node update all start/end bytes references breaks. Review.
        self.node = self.parser.parse(bytes(code, "utf8")).root_node

        if isinstance(self, ts_Module):
            # NOTE(review): the module's ``body`` list is not refreshed here, so it
            # still holds the children of the previous root node - confirm intent.
            self.set_code()
        else:
            # Entities push their new node back into the owning module.
            self.update_module_node()

    def set_code(self):
        """Refresh ``self.code`` from the decoded text of ``self.node``."""
        # The text lives on the node; the entity itself has no ``text`` attribute.
        self.code = self.node.text.decode()


class ts_Entity(ts_BaseEntity):
    """A single node of a module body, tracked by its index in that body."""

    def __init__(
        self,
        node,
        parser,
        module,
        body_idx,
    ) -> None:
        # The entity's code is the decoded text of its own node.
        source_text = node.text.decode()
        super().__init__(node, parser, source_text)
        self.body_idx = body_idx
        self.module = module

    def update_module_node(self):
        """Store this entity's current node in the module body and rebuild the module code."""
        owner = self.module
        owner.body[self.body_idx] = self.node
        # TODO: Due to recursive call at modify an entity we're updating module code. Review.
        owner.merge_entities()


class ts_Module(ts_BaseEntity):
    """A parsed source file: its root node plus the list of the root's direct children."""

    def __init__(self, name: str, node, parser) -> None:
        super().__init__(node, parser, node.text.decode())
        self.name = name
        # Direct children of the root node, kept in source order.
        self.body = list(node.children)
        self._entities: list[ts_Entity] = []

    def parse_entities(self):
        """Append a ``ts_Entity`` to ``self._entities`` for each supported body node."""
        entity_types = (
            "package_declaration",
            "line_comment",
            "block_comment",
            "class_declaration",
        )
        for position, child in enumerate(self.body):
            if child.type not in entity_types:
                continue
            wrapped = ts_Entity(
                node=child, parser=self.parser, module=self, body_idx=position
            )
            self._entities.append(wrapped)

    def merge_entities(self):
        """Rebuild ``self.code`` by joining the text of every body node with newlines."""
        pieces = (piece.text.decode() for piece in self.body)
        self.code = "\n".join(pieces)


class ts_Codebase:
    """Collects a ``ts_Module`` for every ``.java`` file found under a directory tree."""

    def __init__(
        self, library_path: str = "build/my-languages.so", language: str = "java"
    ) -> None:
        """Create a parser for ``language`` loaded from the compiled grammar library.

        Args:
            library_path: Path to the compiled grammar library. The default is
                relative to the current working directory.
            language: Name of the grammar to load from that library.
        """
        grammar = Language(library_path, language)
        self.parser = Parser()
        self.parser.set_language(grammar)
        self._modules: list[ts_Module] = []

    @staticmethod
    def _module_name(filename: str, suffix: str = ".java") -> str:
        """Return ``filename`` without a trailing ``suffix`` (unchanged if absent)."""
        # str.strip(".java") would strip the *characters* '.', 'j', 'a', 'v' from
        # both ends ("Visa.java" -> "Vis"), so remove the suffix explicitly.
        if suffix and filename.endswith(suffix):
            return filename[: -len(suffix)]
        return filename

    def parse_module(self, dir_path: str, module_name: str, verbose: bool = False):
        """Parse one source file and return it as a ``ts_Module`` with its entities.

        Args:
            dir_path: Directory containing the file.
            module_name: File name, including the ``.java`` extension.
            verbose: Accepted for interface symmetry; currently unused.

        Returns:
            The ``ts_Module`` built from the file's root node.
        """
        module_path = os.path.join(dir_path, module_name)
        # Read as UTF-8 explicitly: the text is re-encoded as UTF-8 for the parser.
        with open(module_path, "r", encoding="utf-8") as file:
            code = file.read()

        tree = self.parser.parse(bytes(code, "utf8"))
        module = ts_Module(
            name=self._module_name(module_name),
            node=tree.root_node,
            parser=self.parser,
        )
        module.parse_entities()

        return module

    def parse_modules(self, codebase, verbose=False):
        """Walk ``codebase`` recursively and parse every ``.java`` file into ``self._modules``.

        Args:
            codebase: Root directory to walk.
            verbose: When true, print each file name before parsing it.
        """
        for dirpath, dirnames, files in os.walk(codebase):
            # Sort in place so the traversal (and therefore positional access
            # into self._modules) does not depend on filesystem listing order.
            dirnames.sort()
            for filename in sorted(files):
                if filename.endswith(".java"):
                    if verbose:
                        print(f"**** PARSING {filename} ****")
                    self._modules.append(
                        self.parse_module(dirpath, filename, verbose=verbose)
                    )

In [136]:
# JAVA_LANGUAGE = Language("build/my-languages.so", "java")
# parser = Parser()
# parser.set_language(JAVA_LANGUAGE)

In [143]:
# Root directory that is walked recursively for .java files
codebase_path = '../examples/java_repo_s/src/'
# NOTE: ts_Codebase() loads "build/my-languages.so" relative to the current
# working directory, so this cell depends on the os.chdir() call made earlier.
codebase_modules = ts_Codebase()
# Fills codebase_modules._modules with one ts_Module per .java file found.
codebase_modules.parse_modules(codebase_path)

In [144]:
[m.name for m in codebase_modules._modules]

['MainApplication', 'StringUtil', 'UserEntity']

In [146]:
print(codebase_modules._modules[2].code)

package com.example.module;

// Random inline comment
public class UserEntity {
    private String name;
    private int age;

    public UserEntity(String name, int age) {
        this.name = name;
        this.age = age;
    }

    public String getName() {
        return name;
    }

    public int getAge() {
        return age;
    }
}

/**
 * Test JavaDoc.
 */
class AdminEntity extends UserEntity {
    private boolean isAdmin;

    public AdminEntity(String name, int age, boolean isAdmin) {
        super(name, age);
        this.isAdmin = isAdmin;
    }

    public boolean getIsAdmin(){
        return this.isAdmin;
    }
}


In [148]:
print(codebase_modules._modules[2]._entities[2].code)

public class UserEntity {
    private String name;
    private int age;

    public UserEntity(String name, int age) {
        this.name = name;
        this.age = age;
    }

    public String getName() {
        return name;
    }

    public int getAge() {
        return age;
    }
}


In [149]:
edited_entity = """public class UserEntity {
    private String name;
    private int age;
    // Test edit comment
    public UserEntity(String name, int age) {
        this.name = name;
        this.age = age;
    }

    public String getName() {
        return name;
    }

    public int getAge() {
        return age;
    }
}"""

In [150]:
# modify("code", ...) stores the new text, re-parses it into a fresh node, swaps that
# node into the owning module's body and rebuilds the module code via merge_entities().
codebase_modules._modules[2]._entities[2].modify("code", edited_entity)

In [151]:
print(codebase_modules._modules[2]._entities[2].code)

public class UserEntity {
    private String name;
    private int age;
    // Test edit comment
    public UserEntity(String name, int age) {
        this.name = name;
        this.age = age;
    }

    public String getName() {
        return name;
    }

    public int getAge() {
        return age;
    }
}


In [152]:
print(codebase_modules._modules[2].code)

package com.example.module;
// Random inline comment
public class UserEntity {
    private String name;
    private int age;
    // Test edit comment
    public UserEntity(String name, int age) {
        this.name = name;
        this.age = age;
    }

    public String getName() {
        return name;
    }

    public int getAge() {
        return age;
    }
}
/**
 * Test JavaDoc.
 */
class AdminEntity extends UserEntity {
    private boolean isAdmin;

    public AdminEntity(String name, int age, boolean isAdmin) {
        super(name, age);
        this.isAdmin = isAdmin;
    }

    public boolean getIsAdmin(){
        return this.isAdmin;
    }
}


## Playing with ts_nodes

In [80]:
def get_child_types(node):
    """Return the ``type`` of each direct child of ``node``, in child order."""
    kinds = []
    for kid in node.children:
        kinds.append(kid.type)
    return kinds
def get_child_names(node):
    """Return the field name of each child position of ``node`` (by index)."""
    names = []
    for position in range(node.child_count):
        names.append(node.field_name_for_child(position))
    return names
def count_child_sons(node):
    """Return the ``child_count`` of each direct child of ``node``, in child order."""
    counts = []
    for kid in node.children:
        counts.append(kid.child_count)
    return counts
def get_child_codes(node):
    """Return the decoded ``text`` of each direct child of ``node``, in child order."""
    snippets = []
    for kid in node.children:
        raw = kid.text
        snippets.append(raw.decode())
    return snippets

In [78]:
print(user_entity_node.sexp())

(program (package_declaration (scoped_identifier scope: (scoped_identifier scope: (identifier) name: (identifier)) name: (identifier))) (line_comment) (class_declaration (modifiers) name: (identifier) body: (class_body (field_declaration (modifiers) type: (type_identifier) declarator: (variable_declarator name: (identifier))) (field_declaration (modifiers) type: (integral_type) declarator: (variable_declarator name: (identifier))) (constructor_declaration (modifiers) name: (identifier) parameters: (formal_parameters (formal_parameter type: (type_identifier) name: (identifier)) (formal_parameter type: (integral_type) name: (identifier))) body: (constructor_body (expression_statement (assignment_expression left: (field_access object: (this) field: (identifier)) right: (identifier))) (expression_statement (assignment_expression left: (field_access object: (this) field: (identifier)) right: (identifier))))) (method_declaration (modifiers) type: (type_identifier) name: (identifier) paramete

In [79]:
print(get_child_types(user_entity_node))
print(get_child_names(user_entity_node))
print(count_child_sons(user_entity_node))

['package_declaration', 'line_comment', 'class_declaration', 'block_comment', 'class_declaration']
[None, None, None, None, None]
[3, 0, 4, 0, 4]


In [86]:
print("\n".join(get_child_codes(user_entity_node)))

package com.example.module;
// Random inline comment
public class UserEntity {
    private String name;
    private int age;

    public UserEntity(String name, int age) {
        this.name = name;
        this.age = age;
    }

    public String getName() {
        return name;
    }

    public int getAge() {
        return age;
    }
}
/**
 * Test JavaDoc.
 */
class AdminEntity extends UserEntity {
    private boolean isAdmin;

    public AdminEntity(String name, int age, boolean isAdmin) {
        super(name, age);
        this.isAdmin = isAdmin;
    }

    public boolean getIsAdmin(){
        return this.isAdmin;
    }
}


In [87]:
print(user_entity_node.text.decode())

package com.example.module;

// Random inline comment
public class UserEntity {
    private String name;
    private int age;

    public UserEntity(String name, int age) {
        this.name = name;
        this.age = age;
    }

    public String getName() {
        return name;
    }

    public int getAge() {
        return age;
    }
}

/**
 * Test JavaDoc.
 */
class AdminEntity extends UserEntity {
    private boolean isAdmin;

    public AdminEntity(String name, int age, boolean isAdmin) {
        super(name, age);
        this.isAdmin = isAdmin;
    }

    public boolean getIsAdmin(){
        return this.isAdmin;
    }
}


In [93]:
print(get_child_types(root_node.child(3)))
print(get_child_names(root_node.child(3)))
print(count_child_sons(root_node.child(3)))

['modifiers', 'class', 'identifier', 'class_body']
[None, None, 'name', 'body']
[1, 0, 0, 3]


In [96]:
print(root_node.child(3).text.decode())

public class MainApplication {
    public static void main(String[] args) {
        UserEntity user = new UserEntity("John Doe", 30);
        StringUtil stringUtil = new StringUtil();
        String message = stringUtil.capitalize(user.getName());

        System.out.println("Hello, " + message + ". You are " + user.getAge() + " years old.");
    }
}


In [108]:
root_node.children[3].has_changes

False

In [106]:
print(root_node.child(3).child(3).text.decode())

{
    public static void main(String[] args) {
        UserEntity user = new UserEntity("John Doe", 30);
        StringUtil stringUtil = new StringUtil();
        String message = stringUtil.capitalize(user.getName());

        System.out.println("Hello, " + message + ". You are " + user.getAge() + " years old.");
    }
}


In [97]:
print(get_child_types(root_node.child(3).child(3)))
print(get_child_names(root_node.child(3).child(3)))
print(count_child_sons(root_node.child(3).child(3)))

['{', 'method_declaration', '}']
[None, None, None]
[0, 5, 0]


In [8]:
class_name = root_node.children[0].child_by_field_name("name")
class_name

<Node type=identifier, start_point=(0, 13), end_point=(0, 20)>

In [None]:
def process_node(node, code, replacement, target=b'java'):
    """Replace, in place, every identifier whose bytes equal ``target``.

    Args:
        node: Node exposing ``type``, ``start_byte``, ``end_byte`` and ``children``.
        code: Mutable byte buffer (e.g. ``bytearray``) the node offsets refer to;
            it is edited in place via slice assignment.
        replacement: Text written in place of each match (encoded as UTF-8).
        target: Identifier to look for, as bytes or str. Defaults to ``b'java'``.
    """
    wanted = target.encode('utf-8') if isinstance(target, str) else bytes(target)

    # Replace "java" by "py" in the start of the code for example
    if node.type == 'identifier' and code[node.start_byte:node.end_byte] == wanted:
        code[node.start_byte:node.end_byte] = bytes(replacement, 'utf-8')

    # Visit children right-to-left: a replacement of a different length shifts
    # every byte after it, so editing the rightmost match first keeps the
    # offsets of the nodes still to be visited valid.
    for child in reversed(node.children):
        process_node(child, code, replacement, wanted)

def parse_file(file_path, replacement, language=None):
    """Parse a source file and return its text with matching identifiers replaced.

    Args:
        file_path: Path of the source file to read.
        replacement: Text passed to ``process_node`` as the replacement.
        language: Grammar object for ``Parser.set_language``. When omitted, the
            Java grammar is loaded from ``build/my-languages.so`` (relative to
            the current working directory), the same library ``ts_Codebase`` uses.

    Returns:
        The rewritten source as a ``str``.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        source = file.read()

    if language is None:
        language = Language("build/my-languages.so", "java")

    parser = Parser()
    parser.set_language(language)

    # process_node edits a byte buffer in place, so it needs a mutable bytearray
    # (a str can neither match bytes nor accept slice assignment).
    code = bytearray(source, "utf8")
    # Parse an immutable copy so the tree's byte offsets refer to the original text.
    tree = parser.parse(bytes(code))

    process_node(tree.root_node, code, replacement)
    return code.decode("utf8")