**Initialization**
- *obs. gitpython and pyvis packages installed in SA anaconda environment*
- Cloning Zeeguu-API

In [40]:
# For new environement, install the following packages
# install GitPython, and installs pyvis using the system's Python executable.
#gitpyhon is used to interact with git repositories, and pyvis is used to visualize the graph.
import sys
sys.version
!{sys.executable} -m pip install gitpython
!{sys.executable} -m pip install pyvis
!{sys.executable} -m pip install ast
!{sys.executable} -m pip install matplotlib
!{sys.executable} -m pip install scipy

Collecting ast
  Using cached AST-0.0.2.tar.gz (19 kB)
  Preparing metadata (setup.py): started
  Preparing metadata (setup.py): finished with status 'error'


  error: subprocess-exited-with-error
  
  × python setup.py egg_info did not run successfully.
  │ exit code: 1
  ╰─> [8 lines of output]
      Traceback (most recent call last):
        File "<string>", line 2, in <module>
        File "<pip-setuptools-caller>", line 34, in <module>
        File "C:\Users\carl9\AppData\Local\Temp\pip-install-qo62wqu_\ast_936866e84f574afd819bf29e123bf963\setup.py", line 6, in <module>
          README = codecs.open(os.path.join(here, 'AST/README'), encoding='utf8').read()
                   ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
        File "<frozen codecs>", line 918, in open
      FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\carl9\\AppData\\Local\\Temp\\pip-install-qo62wqu_\\ast_936866e84f574afd819bf29e123bf963\\AST/README'
      [end of output]
  
  note: This error originates from a subprocess, and is likely not a problem with pip.
error: metadata-generation-failed

× Encountered error while generati

Collecting scipy
  Downloading scipy-1.13.0-cp312-cp312-win_amd64.whl.metadata (60 kB)
     ---------------------------------------- 0.0/60.6 kB ? eta -:--:--
     ------ --------------------------------- 10.2/60.6 kB ? eta -:--:--
     ------------------- ------------------ 30.7/60.6 kB 330.3 kB/s eta 0:00:01
     -------------------------------------- 60.6/60.6 kB 402.7 kB/s eta 0:00:00
Downloading scipy-1.13.0-cp312-cp312-win_amd64.whl (45.9 MB)
   ---------------------------------------- 0.0/45.9 MB ? eta -:--:--
   ---------------------------------------- 0.3/45.9 MB 7.7 MB/s eta 0:00:06
    --------------------------------------- 0.7/45.9 MB 9.1 MB/s eta 0:00:05
    --------------------------------------- 0.9/45.9 MB 7.2 MB/s eta 0:00:07
   - -------------------------------------- 1.6/45.9 MB 9.8 MB/s eta 0:00:05
   -- ------------------------------------- 2.4/45.9 MB 10.7 MB/s eta 0:00:05
   -- ------------------------------------- 2.9/45.9 MB 10.9 MB/s eta 0:00:04
   --- ------

In [10]:
from git import Repo
import os
import ast

# GitPython is a library that allows us to work easily with git from Python
# https://gitpython.readthedocs.io/en/stable/tutorial.html

# Let's declare a var for the path where we're going to download a repository
# Warning: this must end in /
CODE_ROOT_FOLDER="C:/Users/carl9/OneDrive/Skrivebord/SA_Individual_ZeeguuAPI/reconstruction/zeeguu-api/"

# If the file exists, it means we've already downloaded
if not os.path.exists(CODE_ROOT_FOLDER):
  Repo.clone_from("https://github.com/zeeguu/api", CODE_ROOT_FOLDER)

**Helper Functions**

- **module_name_from_file_path()**  - *Extracts module from file path (.py files) e.g. ../core/model/user.py -> zeeguu.core.model.user*
- **get_all_files_in_repo()** - *returns list of all .py file paths from root folder*

In [24]:
# extracting a module name from a file name
def module_name_from_file_path(full_path):
    file_name = full_path[len(CODE_ROOT_FOLDER):]
    file_name = file_name.replace("/__init__.py","")
    file_name = file_name.replace("/",".")
    file_name = file_name.replace(".py","")
    return file_name

# All .py files in root folder
def get_all_files_in_repo(repo_path):
    all_files = []
    for root, dirs, files in os.walk(repo_path):
        for file in files:
            if file.endswith(".py"):
                all_files.append(os.path.join(root, file))
    return all_files

**Data Gathering**

Utilize a defined grammar to parse and extract dependencies efficiently. Employ Abstract Syntax Trees (AST) to overcome the limitations posed by regex-based methods. This includes:
1. Parse a file and generate AST (read file, then use ast package to generate AST from content)
2. Define the grammar rules for the dependency extraction
3. Traverse AST and collect dependencies based on grammar
4. *define abstractions on different kinds of AST nodes*
5. store dependencies, make em' ready for visualization!

In [44]:

# Parse a Python file and return an AST. e.g. parse_file('example.py')
def file_to_ast_parser(f_path):
    with open(f_path, 'r') as file:
        code = file.read()
    return ast.parse(code, filename=f_path)

# Define "Grammar" rules for AST traversal of a single file
# could also do some counting here for more advanced analysis  / metric aggregati
def collect_dependencies(abstract_syntax_tree):
    dependencies = set()
    module_attributes = dict()
    used_dependencies = set()

    #Define node types/grammar rules for the AST traversal:
    def visit_Import(node):
        for alias in node.names:
            #collect the module names from the import statements (also handles the case of "import os" and "import os as cool_os alias" woop)
            dependencies.add(alias.asname if alias.asname else alias.name)

    def visit_ImportFrom(node):
        if node.module:
            dependencies.add(node.module)
        # Collects the methods as well as all other attributes from the module
        for alias in node.names:
            module_attributes[alias.name] = node.module
    
    def visit_Call(node):
        if isinstance(node.func, ast.Name): #and node.func.id in dependencies:
                if node.func.id in module_attributes:
                    #print("ast Call: node.func.id: ", node.func.id)
                    used_dependencies.add(module_attributes[node.func.id])
                    
        if isinstance(node.func, ast.Attribute) and isinstance(node.func.value, ast.Name) and node.func.value.id in dependencies:
            #print("ast Call: node.func.value.id: ", node.func.value.id)
            used_dependencies.add(node.func.value.id)


    # Recursively traverse the AST from the root node to all its children
    def traverse(node):
        if isinstance(node, ast.Import):
            visit_Import(node)
        elif isinstance(node, ast.ImportFrom):
            visit_ImportFrom(node)
        elif isinstance(node, ast.Call):
            visit_Call(node)
        for child in ast.iter_child_nodes(node):
            traverse(child)
        
        
    traverse(abstract_syntax_tree)
    #print("all module attributes: ", module_attributes)
    #used_dependencies = filter_dependencies(abstract_syntax_tree, dependencies, module_attributes)
    return dependencies, used_dependencies

# Collect dependencies for all files in root folder
def collect_all_dependencies():
    all_files = get_all_files_in_repo(CODE_ROOT_FOLDER)
    all_dependencies = dict()
    all_unused_dependencies = dict()
    all_used_dependencies = dict()
    for file in all_files:
        print("Collecting dependencies for file: ", file)
        abstract_syntax_tree = file_to_ast_parser(file)
        dependencies, used_dependencies = collect_dependencies(abstract_syntax_tree)
        all_dependencies[module_name_from_file_path(file)] = dependencies
        all_unused_dependencies[module_name_from_file_path(file)] = dependencies - used_dependencies
        all_used_dependencies[module_name_from_file_path(file)] = used_dependencies
    return all_dependencies, all_unused_dependencies, all_used_dependencies




    
#We don't need this function anymore, since we traverse from top. Then it should only be if imports come from other places than the top of the file.
""" def filter_dependencies(abstract_syntax_tree, dependencies, module_attributes):
    used_dependencies = set()

    def filter_traverse(node, dependencies):
        #ast.Name include variable names, function names, class names, module names, and other names used within the code.
        
        if isinstance(node, ast.Call):
            if isinstance(node.func, ast.Name): #and node.func.id in dependencies:
                if node.func.id in module_attributes:
                    print("ast Call: node.func.id: ", node.func.id)
                    used_dependencies.add(module_attributes[node.func.id])
                    
            if isinstance(node.func, ast.Attribute) and isinstance(node.func.value, ast.Name) and node.func.value.id in dependencies:
                print("ast Call: node.func.value.id: ", node.func.value.id)
                used_dependencies.add(node.func.value.id)

        for child in ast.iter_child_nodes(node):
            filter_traverse(child, dependencies)


    filter_traverse(abstract_syntax_tree, dependencies)
    return used_dependencies """



' def filter_dependencies(abstract_syntax_tree, dependencies, module_attributes):\n    used_dependencies = set()\n\n    def filter_traverse(node, dependencies):\n        #ast.Name include variable names, function names, class names, module names, and other names used within the code.\n        \n        if isinstance(node, ast.Call):\n            if isinstance(node.func, ast.Name): #and node.func.id in dependencies:\n                if node.func.id in module_attributes:\n                    print("ast Call: node.func.id: ", node.func.id)\n                    used_dependencies.add(module_attributes[node.func.id])\n                    \n            if isinstance(node.func, ast.Attribute) and isinstance(node.func.value, ast.Name) and node.func.value.id in dependencies:\n                print("ast Call: node.func.value.id: ", node.func.value.id)\n                used_dependencies.add(node.func.value.id)\n\n        for child in ast.iter_child_nodes(node):\n            filter_traverse(child, 

I dont think the above will work, i can't seem to manage the limitations using the ast module. It is much easier to make my own tokens...
I can check if the nodes of the AST are imports etc.. but then this is just the same as gathering information with reggex.. im still bottlenecked at 
filtering unused and uncommented dependencies. 
To do it well enough, i would have to go through all code and look at which imported methods are called (import from) and which imported modules are called. This is difficult with just ast as i fx. dont catch functions calls in other calls (eg. print(func()))
Also my current approach would only catch modules e.g. X.method(), not "from .. import method -> method()" as the method itself is not recorded as a dependency but only the module from which it is imported hmm.


Okay, now i also collect the functions that has been imported from modules


In [28]:
#Testing the traversal of the AST

ast_tree = file_to_ast_parser("C:/Users/carl9/OneDrive/Skrivebord/SA_Individual_ZeeguuAPI/test.py")
dependencies, used_dependencies = collect_dependencies(ast_tree)
print("all dep", dependencies)
print("used dep", used_dependencies)
print("unused dep: ", dependencies - used_dependencies)

all dep {'zeeguu.config.loader', 'urllib', 'os', 'np'}
used dep {'zeeguu.config.loader', 'np'}
unused dep:  {'urllib', 'os'}


In [45]:
#Testing the traversal of the AST for the whole repo
all_dependencies, all_unused_dependencies, all_unused_dependencies = collect_all_dependencies()

#print("all files", all_files)
print("all dep", all_dependencies)
print("unused dep: ", all_unused_dependencies)

Collecting dependencies for file:  C:/Users/carl9/OneDrive/Skrivebord/SA_Individual_ZeeguuAPI/reconstruction/zeeguu-api/env_var_defs_default.py
Collecting dependencies for file:  C:/Users/carl9/OneDrive/Skrivebord/SA_Individual_ZeeguuAPI/reconstruction/zeeguu-api/setup.py
Collecting dependencies for file:  C:/Users/carl9/OneDrive/Skrivebord/SA_Individual_ZeeguuAPI/reconstruction/zeeguu-api/start.py
Collecting dependencies for file:  C:/Users/carl9/OneDrive/Skrivebord/SA_Individual_ZeeguuAPI/reconstruction/zeeguu-api/tools\activity_monitor.py
Collecting dependencies for file:  C:/Users/carl9/OneDrive/Skrivebord/SA_Individual_ZeeguuAPI/reconstruction/zeeguu-api/tools\add_feed.py
Collecting dependencies for file:  C:/Users/carl9/OneDrive/Skrivebord/SA_Individual_ZeeguuAPI/reconstruction/zeeguu-api/tools\anonymize_users.py
Collecting dependencies for file:  C:/Users/carl9/OneDrive/Skrivebord/SA_Individual_ZeeguuAPI/reconstruction/zeeguu-api/tools\article_crawler.py
Collecting dependencies 

  words = [w for w in words if re.search("\d", w) == None]
  MULTIPLE_NEWLINES = re.compile("\n\s*\n")
  """


Collecting dependencies for file:  C:/Users/carl9/OneDrive/Skrivebord/SA_Individual_ZeeguuAPI/reconstruction/zeeguu-api/zeeguu\core\test\tests_difficulty_estimator_strategies\__init__.py
Collecting dependencies for file:  C:/Users/carl9/OneDrive/Skrivebord/SA_Individual_ZeeguuAPI/reconstruction/zeeguu-api/zeeguu\core\user_activity_hooks\article_interaction_hooks.py
Collecting dependencies for file:  C:/Users/carl9/OneDrive/Skrivebord/SA_Individual_ZeeguuAPI/reconstruction/zeeguu-api/zeeguu\core\user_activity_hooks\__init__.py
Collecting dependencies for file:  C:/Users/carl9/OneDrive/Skrivebord/SA_Individual_ZeeguuAPI/reconstruction/zeeguu-api/zeeguu\core\user_statistics\activity.py
Collecting dependencies for file:  C:/Users/carl9/OneDrive/Skrivebord/SA_Individual_ZeeguuAPI/reconstruction/zeeguu-api/zeeguu\core\user_statistics\exercise_corectness.py
Collecting dependencies for file:  C:/Users/carl9/OneDrive/Skrivebord/SA_Individual_ZeeguuAPI/reconstruction/zeeguu-api/zeeguu\core\user_

**Visualization**
- polymetrics views?


In [48]:
#Visualize with networkx and matplotlib!
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib.colors as mcolors

def nx_dependencies_graph(code_root_folder, show_unused_dependencies=False):
    G = nx.Graph()
    all_files = get_all_files_in_repo(code_root_folder)
    all_dependencies, all_unused_dependencies, all_used_dependencies = collect_all_dependencies()
    
    
    for file in all_files:
        module_name = module_name_from_file_path(file)
        if module_name not in G.nodes:
            G.add_node(module_name)
        if show_unused_dependencies:
            for each in all_dependencies[module_name]:
                G.add_edge(module_name, each)
        else:
            for each in all_used_dependencies[module_name]:
                G.add_edge(module_name, each)
    return G

def nx_dependencies_digraph(code_root_folder, show_unused_dependencies=False):
    G = nx.DiGraph()
    all_files = get_all_files_in_repo(code_root_folder)
    all_dependencies, all_unused_dependencies, all_used_dependencies = collect_all_dependencies()
    
    
    for file in all_files:
        module_name = module_name_from_file_path(file)
        if module_name not in G.nodes:
            G.add_node(module_name)
        if show_unused_dependencies:
            for each in all_dependencies[module_name]:
                G.add_edge(module_name, each)
        else:
            for each in all_used_dependencies[module_name]:
                G.add_edge(module_name, each)
    return G

# a function to draw a graph
def draw_graph(G, size, **args):
    plt.figure(figsize=size)
    nx.draw_kamada_kawai(G, **args)
    plt.show()

In [None]:
#First Graph
G = nx_dependencies_graph(CODE_ROOT_FOLDER)
draw_graph(G, (40,40), with_labels=True)

In [None]:
DG = nx_dependencies_digraph(CODE_ROOT_FOLDER)
draw_graph(DG, (40,40), with_labels=True)