# Case Study: Zeeguu/API
- Backend of a web application that supports [free reading in foreign languages](https://zeeguu.org)
- Open source [repository on GH](https://github.com/zeeguu/API/)

## Table of conentents
1. [Basic Data Gathering](#Basic Data Gathering)
    1. [Extract dependencies](#Extract dependencies)
    2. [Visualize](#Visualize)
2. [Abstraction](#Abstraction)
3. [Evolution](#Evolution)



## Basic Data Gathering

- extracting basic dependencies between python modules
- every .py file is called a module in Python
- direct relationship between file name and module name
  - file: `./zeeguu_core/model/user.py` <==>
  - module: `zeeguu_core.model.User`


In [1]:
# Credit: https://colab.research.google.com/drive/1oe_TV7936Zmmzbbgq8rzqFpxYPX7SQHP#scrollTo=Njkjj4fzUV2E
# Installing Required Dependencies
import sys
sys.version
!{sys.executable} -m pip install gitpython
!{sys.executable} -m pip install pyvis

Collecting gitpython
  Downloading GitPython-3.1.43-py3-none-any.whl.metadata (13 kB)
Collecting gitdb<5,>=4.0.1 (from gitpython)
  Downloading gitdb-4.0.11-py3-none-any.whl.metadata (1.2 kB)
Collecting smmap<6,>=3.0.1 (from gitdb<5,>=4.0.1->gitpython)
  Downloading smmap-5.0.1-py3-none-any.whl.metadata (4.3 kB)
Downloading GitPython-3.1.43-py3-none-any.whl (207 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m207.3/207.3 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading gitdb-4.0.11-py3-none-any.whl (62 kB)
[2K   [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.7/62.7 kB[0m [31m5.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading smmap-5.0.1-py3-none-any.whl (24 kB)
Installing collected packages: smmap, gitdb, gitpython
Successfully installed gitdb-4.0.11 gitpython-3.1.43 smmap-5.0.1
Collecting pyvis
  Downloading pyvis-0.3.2-py3-none-any.whl.metadata (1.7 kB)
Collecting jsonpickle>=1.4.1 (from pyvis)
  Downloading js

In [25]:
# Adopted from: https://colab.research.google.com/drive/1oe_TV7936Zmmzbbgq8rzqFpxYPX7SQHP#scrollTo=Njkjj4fzUV2E
import os
from git import Repo

# Current Working Directory
cwd = os.getcwd()
print(cwd)

# Code location
CODE_ROOT_FOLDER=f"{cwd}/data/zeeguu-api/"

# Clone the repository
if not os.path.exists(CODE_ROOT_FOLDER):
  Repo.clone_from("https://github.com/zeeguu/api", CODE_ROOT_FOLDER)



/Users/andreaskongstad/Developer/PycharmProjects/architectural-reconstruction
   21206 total


     278


In [26]:
# Count absolute lines of code and number of files 
!cd {CODE_ROOT_FOLDER} && git ls-files | grep '\.py$' | xargs wc -l | grep total
!cd {CODE_ROOT_FOLDER} && git ls-files | grep "\.py$" | wc -l

   21206 total
     278


In [3]:
# helpers
def file_path(file_name):
    return f"{CODE_ROOT_FOLDER}{file_name}"

def module_name_from_file_path(full_path):
    """
    ../core/model/user.py -> zeeguu.core.model.user
    """
    file_name = full_path[len(CODE_ROOT_FOLDER):]
    file_name = file_name.replace("/__init__.py","")
    file_name = file_name.replace("/",".")
    file_name = file_name.replace(".py","")
    return file_name
  
# Test  
File_Name = "zeeguu/core/model/user.py"
assert file_path(File_Name) == "/Users/andreaskongstad/Developer/PycharmProjects/architectural-reconstruction/data/zeeguu-api/zeeguu/core/model/user.py"
assert module_name_from_file_path(file_path(File_Name)) == "zeeguu.core.model.user"

### Naïve way of extracting imports using regular expressions
we assume that imports are always at the
- TODO for you: add full support for imports; this is not complete...

regex patterns used
- ^ beginning of line
- \S anything that is not space
- \+ at least one occurrence of previous
- ( ) capture group (read more at: https://pynative.com/python-regex-capturing-groups/)

In [7]:
import re

def import_from_line(line):
    try:
      y = re.search("^from (\S+)", line)
      if not y:
        y = re.search("^import (\S+)", line)
      return y.group(1)
    except:
      return None
  
# extracts all the imported modules from a file
# returns a module of the form zeeguu_core.model.bookmark, e.g.

def imports_from_file(file):

    all_imports = []

    lines = [line for line in open(file)]

    for line in lines:
        imp = import_from_line(line)

        if imp:
            all_imports.append(imp)

    return all_imports


assert imports_from_file(file_path('zeeguu/core/model/bookmark.py')) == ['datetime', 'sqlalchemy', 'sqlalchemy', 'sqlalchemy.orm', 'sqlalchemy.orm.exc', 'wordstats', 'zeeguu.logging', 'zeeguu.core.bookmark_quality.fit_for_study', 'zeeguu.core.definition_of_learned', 'zeeguu.core.model', 'zeeguu.core.model.sorted_exercise_log', 'zeeguu.core.model.exercise', 'zeeguu.core.model.exercise_outcome', 'zeeguu.core.model.exercise_source', 'zeeguu.core.model.language', 'zeeguu.core.model.text', 'zeeguu.core.model.user', 'zeeguu.core.model.user_word', 'zeeguu.core.util.encoding', 'zeeguu.core.model.learning_cycle', 'zeeguu', 'zeeguu.core.model']
assert imports_from_file(file_path('zeeguu/core/model/unique_code.py')) == ['datetime', 'random', 'zeeguu.core', 'sqlalchemy', 'zeeguu.core.model']

  y = re.search("^from (\S+)", line)
  y = re.search("^import (\S+)", line)


### Extract dependencies
To do that we iterate over all the python files with the help of the Path.rglob function from pathlib
And we create a network with the help of the networkx package.

In [None]:
# TODO use pyvis instead of networkx
from pathlib import Path
import networkx as nx

def dependencies_graph(code_root_folder):
    files = Path(code_root_folder).rglob("*.py")

    G = nx.Graph()

    for file in files:
        file_path = str(file)

        module_name = module_name_from_file_path(file_path)

        if module_name not in G.nodes:
            G.add_node(module_name)

        for each in imports_from_file(file_path):
            G.add_edge(module_name, each)

    return G

### Visualize

In [27]:
import matplotlib.pyplot as plt

# a function to draw a graph
def draw_graph(G, size, **args):
    plt.figure(figsize=size)
    nx.draw(G, **args)
    plt.show()
    

def dependencies_digraph(code_root_folder):
    files = Path(code_root_folder).rglob("*.py")

    G = nx.DiGraph()

    for file in files:
        file_path = str(file)

        source_module = module_name_from_file_path(file_path)

        if source_module not in G.nodes:
            G.add_node(source_module)

        for target_module in imports_from_file(file_path):

            G.add_edge(source_module, target_module)
            # print(module_name + "=>" + each + ".")

    return G

In [None]:
# Looking at the directed graph
DG = dependencies_digraph(CODE_ROOT_FOLDER)
draw_graph(DG, (40,40), with_labels=True)

## Abstraction
What do we have now:
- System: zeeguu/api
- Source View: Modules & Dependencies
- Entities: .py files in the project
- Relationships: import statements between .py files

Plan: Abstraction methods
1. Folder hierarchy
2. Aggregate dependencies using metrics. (Sum of calls)
    - Total count of explicit low-level dependencies
    - Number of distinct explicit low-level dependencies
    - Network analysis to detect rank packages: Note (It should not be that hard, the networkx package supports various methods of network analysis, e.g. centrality, HITS, pagerank.)
3. Create different level graphs and pass them to OpenAI vision model

### Filter relevant modules

In [None]:

def relevant_module(module_name):
    """
    Define relevant modules
    """
    if "test" in module_name:
        return False


    if module_name.startswith("zeeguu"):
        return True


    return False

def dependencies_digraph(code_root_folder):
    files = Path(code_root_folder).rglob("*.py")

    G = nx.DiGraph()

    for file in files:
        file_path = str(file)

        source_module = module_name_from_file_path(file_path)
        if not relevant_module(source_module):
          continue

        if source_module not in G.nodes:
            G.add_node(source_module)

        for target_module in imports_from_file(file_path):

            if relevant_module(target_module):
              G.add_edge(source_module, target_module)


    return G

# Looking at the directed graph
DG = dependencies_digraph(CODE_ROOT_FOLDER)
draw_graph(DG, (40,40), with_labels=True)

### Basic Abstraction Using Hierarchical Module Structure & Naming Conventions

- abstracting the imports between the modules along the module hierarchy
- also taking into account naming conventions to filter out external modules

In [2]:
def top_level_package(module_name, depth=1):
    """Extract parent of module at depth"""
    components = module_name.split(".")
    return ".".join(components[:depth])

assert (top_level_package("zeeguu.core.model.util") == "zeeguu")
assert (top_level_package("zeeguu.core.model.util", 2) == "zeeguu.core")


def abstracted_to_top_level(G, depth=1):
    aG = nx.DiGraph()
    for each in G.edges():
        src = top_level_package(each[0], depth)
        dst = top_level_package(each[1], depth)

        if src != dst:
          aG.add_edge(src, dst)

    return aG

ADG = abstracted_to_top_level(DG, 3)
draw_graph(ADG, (8,8), with_labels=True)

## Evolution
Plan:
1.  Churn Find hot code -- Most changed/imporant regions

In [None]:
!{sys.executable} -m pip install pydriller

In [None]:
from pydriller import Repository
REPO_DIR = 'https://github.com/zeeguu/api'

In [None]:
# for PyDriller to work we need to change directory to our local clone of the repo
%cd /data/zeeguu-api