# Case Study: Zeeguu/API
- Backend of a web application that supports [free reading in foreign languages](https://zeeguu.org)
- Open source [repository on GH](https://github.com/zeeguu/API/)

## Table of contents
1. [Basic Data Gathering](#basic-data-gathering)
    1. [Extract dependencies](#extract-dependencies)
    2. [Visualize](#visualize)
2. [Abstraction](#abstraction)
3. [Evolution](#evolution)
4. [Dynamic Analysis](#dynamic-analysis)



## Basic Data Gathering

- extracting basic dependencies between python modules
- every .py file is called a module in Python
- direct relationship between file name and module name
  - file: `./zeeguu/core/model/user.py` <==>
  - module: `zeeguu.core.model.user`


In [16]:
# Credit: https://colab.research.google.com/drive/1oe_TV7936Zmmzbbgq8rzqFpxYPX7SQHP#scrollTo=Njkjj4fzUV2E
# Install the third-party packages used by the following cells.
import sys
sys.version  # bare expression: evaluated, but neither printed nor stored
# `!{sys.executable} -m pip` runs pip with the same interpreter that backs this
# kernel, so the packages are installed where the notebook imports them from.
!{sys.executable} -m pip install gitpython
!{sys.executable} -m pip install pyvis
# NOTE(review): networkx and matplotlib are imported by later cells but are not
# installed here -- presumably they are already available; confirm.



In [7]:
# Adapted from: https://colab.research.google.com/drive/1oe_TV7936Zmmzbbgq8rzqFpxYPX7SQHP#scrollTo=Njkjj4fzUV2E
import os
from git import Repo

# Current working directory of the notebook kernel at the time this cell runs.
# NOTE: a later cell does `%cd {CODE_ROOT_FOLDER}`; re-running this cell after
# that one would compute a CODE_ROOT_FOLDER nested inside the clone.
cwd = os.getcwd()
print(cwd)

# Code location: absolute path of the local clone. The trailing slash matters,
# because the helper functions below concatenate / slice with this prefix.
CODE_ROOT_FOLDER=f"{cwd}/data/zeeguu-api/"

# Clone the repository only when the target folder does not exist yet, so
# re-running the cell does not clone again.
if not os.path.exists(CODE_ROOT_FOLDER):
  Repo.clone_from("https://github.com/zeeguu/api", CODE_ROOT_FOLDER)



/Users/andreaskongstad/Developer/PycharmProjects/architectural-reconstruction


In [26]:
# Count the total lines of code and the number of files, restricted to the
# git-tracked *.py files of the clone.
# First command: `wc -l` over every tracked .py file, keeping only the "total" line.
# Second command: number of tracked .py files.
!cd {CODE_ROOT_FOLDER} && git ls-files | grep '\.py$' | xargs wc -l | grep total
!cd {CODE_ROOT_FOLDER} && git ls-files | grep "\.py$" | wc -l

   21206 total
     278


In [23]:
# helpers

def file_path(file_name):
    """Return the location of *file_name* inside the local clone.

    The result is CODE_ROOT_FOLDER immediately followed by *file_name*; no
    path separator is inserted, so CODE_ROOT_FOLDER must end with one.
    """
    return "{}{}".format(CODE_ROOT_FOLDER, file_name)


def module_name_from_file_path(full_path):
    """
    Convert the path of a file inside the clone into a dotted module name.

    ../core/model/user.py -> zeeguu.core.model.user
    ../api/__init__.py    -> zeeguu.api

    full_path is expected to start with CODE_ROOT_FOLDER; that many leading
    characters are dropped before the conversion.
    """
    file_name = full_path[len(CODE_ROOT_FOLDER):]

    # Only strip the suffix: a plain str.replace(".py", "") would also eat
    # ".py" in the middle of a path (e.g. a folder called "copy.pyramid").
    if file_name.endswith("/__init__.py"):
        file_name = file_name[:-len("/__init__.py")]
    elif file_name.endswith(".py"):
        file_name = file_name[:-len(".py")]

    return file_name.replace("/", ".")

File_Name = "zeeguu/core/model/user.py"
# Compare against CODE_ROOT_FOLDER instead of one developer's absolute path,
# so the sanity checks also pass on other machines.
assert file_path(File_Name) == CODE_ROOT_FOLDER + File_Name
assert module_name_from_file_path(file_path(File_Name)) == "zeeguu.core.model.user"


def module_name_from_rel_path(full_path):
    """
    Convert a repository-relative file path into a dotted module name.

    zeeguu/core/model/user.py -> zeeguu.core.model.user
    zeeguu/api/__init__.py    -> zeeguu.api

    Paths that do not end in ".py" only get their "/" replaced by ".".
    """
    file_name = full_path

    # Strip the suffix only; str.replace(".py", "") would also remove ".py"
    # from the middle of a path (e.g. "tools/copy.pyramid/run.py").
    if file_name.endswith("/__init__.py"):
        file_name = file_name[:-len("/__init__.py")]
    elif file_name.endswith(".py"):
        file_name = file_name[:-len(".py")]

    return file_name.replace("/", ".")

# Sanity checks: one regular module and one package __init__ file.
assert module_name_from_rel_path(
    "tools/migrations/teacher_dashboard_migration_1/upgrade.py"
) == "tools.migrations.teacher_dashboard_migration_1.upgrade"
assert module_name_from_rel_path("zeeguu/api/__init__.py") == "zeeguu.api"
  

### Naïve way of extracting imports using regular expressions
We assume that imports always start at the beginning of a line (no leading whitespace).
- TODO for you: add full support for imports; this is not complete...

regex patterns used
- ^ beginning of line
- \S anything that is not space
- `+` at least one occurrence of the previous token
- ( ) capture group (read more at: https://pynative.com/python-regex-capturing-groups/)

In [7]:
import re

def import_from_line(line):
    """Return the module named by an import statement at the start of *line*.

    Handles "from X import ..." (returns X) and "import X" (returns X).
    Only the first whitespace-delimited token after the keyword is returned,
    and the statement has to start in column 0.

    Returns None when the line is not such an import, or is not a string.
    """
    if not isinstance(line, str):
        return None

    # Raw strings: "\S" in a normal string literal is an invalid escape
    # sequence and triggers a warning on recent Python versions.
    match = re.search(r"^from (\S+)", line) or re.search(r"^import (\S+)", line)
    return match.group(1) if match else None
  
# extracts all the imported modules from a file
# returns a list of module names of the form zeeguu.core.model.bookmark

def imports_from_file(file):
    """Return the modules imported by the Python source file at path *file*.

    Every line is passed to import_from_line; the non-empty results are
    returned in file order, duplicates included.
    """
    # The context manager closes the file again (the handle used to leak).
    # UTF-8 is the default encoding of Python source files.
    with open(file, encoding="utf-8") as source:
        candidates = (import_from_line(line) for line in source)
        return [imported for imported in candidates if imported]


# Sanity checks against two files of the cloned repository; the expected
# lists mirror the import statements of those files in order.
assert imports_from_file(file_path('zeeguu/core/model/bookmark.py')) == [
    'datetime',
    'sqlalchemy',
    'sqlalchemy',
    'sqlalchemy.orm',
    'sqlalchemy.orm.exc',
    'wordstats',
    'zeeguu.logging',
    'zeeguu.core.bookmark_quality.fit_for_study',
    'zeeguu.core.definition_of_learned',
    'zeeguu.core.model',
    'zeeguu.core.model.sorted_exercise_log',
    'zeeguu.core.model.exercise',
    'zeeguu.core.model.exercise_outcome',
    'zeeguu.core.model.exercise_source',
    'zeeguu.core.model.language',
    'zeeguu.core.model.text',
    'zeeguu.core.model.user',
    'zeeguu.core.model.user_word',
    'zeeguu.core.util.encoding',
    'zeeguu.core.model.learning_cycle',
    'zeeguu',
    'zeeguu.core.model',
]
assert imports_from_file(file_path('zeeguu/core/model/unique_code.py')) == [
    'datetime',
    'random',
    'zeeguu.core',
    'sqlalchemy',
    'zeeguu.core.model',
]

  y = re.search("^from (\S+)", line)
  y = re.search("^import (\S+)", line)


### Extract dependencies
To do that, we iterate over all the Python files with the help of the `Path.rglob` function from `pathlib`,
and we create a network with the help of the `networkx` package.

In [None]:
# TODO use pyvis instead of networkx
from pathlib import Path
import networkx as nx

def dependencies_graph(code_root_folder):
    """Build an undirected import graph for all *.py files below a folder.

    Nodes are dotted module names; an edge connects a module with every
    module name that imports_from_file finds in its source. Imported modules
    are added as nodes even when they live outside the folder.

    Module names are derived relative to *code_root_folder* itself, so the
    function also works for folders other than the global CODE_ROOT_FOLDER.
    """
    root = Path(code_root_folder)

    G = nx.Graph()

    for file in root.rglob("*.py"):
        # as_posix() keeps "/" as the separator that module_name_from_rel_path expects
        module_name = module_name_from_rel_path(file.relative_to(root).as_posix())

        # add_node is a no-op for existing nodes; this keeps import-free modules in the graph
        G.add_node(module_name)

        for each in imports_from_file(str(file)):
            G.add_edge(module_name, each)

    return G

### Visualize

In [29]:
import matplotlib.pyplot as plt

# a function to draw a graph
def draw_graph(G, size, **args):
    """Draw graph G in a new matplotlib figure and show it.

    size is passed to plt.figure as figsize; any further keyword arguments
    are forwarded unchanged to nx.draw (e.g. with_labels=True).
    """
    plt.figure(figsize=size)
    nx.draw(G, **args)
    plt.show()
    

def dependencies_digraph(code_root_folder):
    """Build a directed import graph for all *.py files below a folder.

    An edge source -> target means that the file of module `source` contains
    an import of `target`. Imported modules are added as nodes even when they
    live outside the folder.

    Module names are derived relative to *code_root_folder* itself, so the
    function also works for folders other than the global CODE_ROOT_FOLDER.
    """
    root = Path(code_root_folder)

    G = nx.DiGraph()

    for file in root.rglob("*.py"):
        # as_posix() keeps "/" as the separator that module_name_from_rel_path expects
        source_module = module_name_from_rel_path(file.relative_to(root).as_posix())

        # add_node is a no-op for existing nodes; this keeps import-free modules in the graph
        G.add_node(source_module)

        for target_module in imports_from_file(str(file)):
            G.add_edge(source_module, target_module)

    return G

In [None]:
# Looking at the directed graph: build it for the whole clone and draw every
# node with its label in a 40x40 figure.
DG = dependencies_digraph(CODE_ROOT_FOLDER)
draw_graph(DG, (40,40), with_labels=True)

## Abstraction
What do we have now:
- System: zeeguu/api
- Source View: Modules & Dependencies
- Entities: .py files in the project
- Relationships: import statements between .py files

Plan: Abstraction methods
1. Folder hierarchy
2. Aggregate dependencies using metrics. (Sum of calls)
    - Total count of explicit low-level dependencies
    - Number of distinct explicit low-level dependencies
    - Network analysis to rank packages. Note: this should not be hard, since the networkx package supports various methods of network analysis, e.g. centrality, HITS, PageRank.
3. Create different level graphs and pass them to OpenAI vision model

### Filter relevant modules

In [None]:

def relevant_module(module_name):
    """
    Tell whether a module belongs in the dependency view.

    A module is relevant when its dotted name starts with "zeeguu" and does
    not contain the substring "test" anywhere.
    """
    looks_like_test_code = "test" in module_name
    return (not looks_like_test_code) and module_name.startswith("zeeguu")

def dependencies_digraph(code_root_folder):
    """Build a directed import graph restricted to relevant modules.

    Works like the unfiltered version, but files whose module name is not
    accepted by relevant_module are skipped, and only imports of relevant
    modules become edges.

    Module names are derived relative to *code_root_folder* itself, so the
    function also works for folders other than the global CODE_ROOT_FOLDER.
    """
    root = Path(code_root_folder)

    G = nx.DiGraph()

    for file in root.rglob("*.py"):
        # as_posix() keeps "/" as the separator that module_name_from_rel_path expects
        source_module = module_name_from_rel_path(file.relative_to(root).as_posix())
        if not relevant_module(source_module):
            continue

        # add_node is a no-op for existing nodes; this keeps import-free modules in the graph
        G.add_node(source_module)

        for target_module in imports_from_file(str(file)):
            if relevant_module(target_module):
                G.add_edge(source_module, target_module)

    return G

# Looking at the directed graph again, now built with the filtered
# dependencies_digraph defined in this cell (relevant modules only).
DG = dependencies_digraph(CODE_ROOT_FOLDER)
draw_graph(DG, (40,40), with_labels=True)

### Basic Abstraction Using Hierarchical Module Structure & Naming Conventions

- abstracting the imports between the modules along the module hierarchy
- also taking into account naming conventions to filter out external modules

In [11]:
def top_level_package(module_name, depth=1):
    """Return the first *depth* dot-separated components of a module name.

    Names with fewer than *depth* components are returned unchanged.
    """
    leading_parts = module_name.split(".")[:depth]
    return ".".join(leading_parts)

# Sanity checks: default depth and an explicit depth of two.
assert top_level_package("zeeguu.core.model.util") == "zeeguu"
assert top_level_package("zeeguu.core.model.util", 2) == "zeeguu.core"


def abstracted_to_top_level(G, depth=1):
    """Lift the edges of G to the packages found at the given depth.

    Both endpoints of every edge are replaced by their top_level_package at
    *depth*. Edges whose endpoints end up in the same package are dropped,
    so the resulting DiGraph only contains packages that take part in at
    least one cross-package dependency.
    """
    abstracted = nx.DiGraph()

    for source, target in G.edges():
        source_package = top_level_package(source, depth)
        target_package = top_level_package(target, depth)

        if source_package == target_package:
            continue

        abstracted.add_edge(source_package, target_package)

    return abstracted

# Abstract the module graph to packages three levels deep and draw it.
# Needs DG from the cell above; the NameError recorded below this cell shows
# it was run in a session where DG had not been created.
ADG = abstracted_to_top_level(DG, 3)
draw_graph(ADG, (8,8), with_labels=True)

NameError: name 'DG' is not defined

## Evolution
Plan:
1.  Churn: find hot code -- the most changed/important regions

2. Extract multiple complementary module views from your case study system
3. Ensure that your layouts are readable - limit the number of nodes in a view, use a different layout in networkx, or use a different library than networkx
4. Augment each of the previously obtained module views by mapping the above-computed churn metric on the color of a given node

In [17]:
# Install PyDriller with the kernel's own interpreter; `sys` comes from the
# first install cell.
!{sys.executable} -m pip install pydriller

Collecting pydriller
  Downloading PyDriller-2.6-py3-none-any.whl.metadata (1.3 kB)
Collecting types-pytz (from pydriller)
  Downloading types_pytz-2024.1.0.20240417-py3-none-any.whl.metadata (1.5 kB)
Collecting lizard (from pydriller)
  Downloading lizard-1.17.10-py2.py3-none-any.whl.metadata (15 kB)
Downloading PyDriller-2.6-py3-none-any.whl (33 kB)
Downloading lizard-1.17.10-py2.py3-none-any.whl (66 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m66.0/66.0 kB[0m [31m6.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading types_pytz-2024.1.0.20240417-py3-none-any.whl (5.2 kB)
Installing collected packages: lizard, types-pytz, pydriller
Successfully installed lizard-1.17.10 pydriller-2.6 types-pytz-2024.1.0.20240417


In [18]:
from pydriller import Repository
# NOTE(review): despite its name this is the remote URL, not a directory.
# PyDriller's Repository accepts remote URLs as well as local paths --
# confirm whether the local clone in CODE_ROOT_FOLDER was meant here.
REPO_DIR = 'https://github.com/zeeguu/api'

In [19]:
# for PyDriller to work we need to change directory to our local clone of the repo
# NOTE(review): REPO_DIR is a remote URL, so this may not be required -- confirm.
# The change of directory lasts for the rest of the session and therefore also
# changes what os.getcwd() returns if the clone cell is run again.
%cd {CODE_ROOT_FOLDER}

/Users/andreaskongstad/Developer/PycharmProjects/architectural-reconstruction/data/zeeguu-api


In [25]:
from collections import defaultdict
from pydriller import ModificationType

# Materialize the commit history once, so the functions below can iterate it
# several times without traversing the repository again.
all_commits = list(Repository(REPO_DIR).traverse_commits())

def print_out_commit_details(commits):
    """Print every commit followed by one entry per file it modified.

    Each entry shows the author name, the change type and the file name,
    with the old and the new path on the two lines below it.

    Usage: print_out_commit_details(all_commits[0:1])
    """
    for commit in commits:
        print(commit)
        for changed in commit.modified_files:
            headline = f"{commit.author.name} {changed.change_type} {changed.filename}"
            paths = f" -{changed.old_path}\n -{changed.new_path}"
            print(headline + "\n" + paths)


def commit_counts(all_commits):
    """Count how many times each path appears as the new path of a change.

    Returns a defaultdict(int) keyed by new_path. Renames and deletions are
    not tracked, so the new_path value None is counted like any other key.
    """
    touches = defaultdict(int)

    touched_paths = (
        modified.new_path
        for commit in all_commits
        for modified in commit.modified_files
    )
    for path in touched_paths:
        touches[path] += 1

    return touches


def commit_counts_better(all_commits):
    """Count the changes per file over all commits, following renames.

    Rules per modified file:
    - RENAME: the counter moves from the old path to the new path and is
      increased by one.
    - DELETE: the counter of the old path is dropped.
    - ADD: the counter of the new path starts at 1.
    - anything else: the counter of the old path is increased by one.

    Paths that were never seen before (a rename or modification without a
    prior ADD) start from zero instead of raising.

    Returns a plain dict mapping path -> number of changes; an empty dict
    when there are no commits.
    """
    counts = {}

    for commit in all_commits:
        for modification in commit.modified_files:

            new_path = modification.new_path
            old_path = modification.old_path
            change_type = modification.change_type

            if change_type == ModificationType.RENAME:
                # carry the history of the old name over to the new one
                counts[new_path] = counts.pop(old_path, 0) + 1

            elif change_type == ModificationType.DELETE:
                counts.pop(old_path, None)

            elif change_type == ModificationType.ADD:
                counts[new_path] = 1

            else:  # modification to existing file
                counts[old_path] = counts.get(old_path, 0) + 1

    # Return only after ALL commits were processed; the return used to sit
    # inside the outer loop and ended the count after the first commit.
    return counts
        
        

# sort by number of commits in decreasing order and keep the 42 most changed paths
# NOTE: this rebinds the name `commit_counts` from the function defined above
# to the resulting dict; package_activity() below reads this dict.
commit_counts = commit_counts_better(all_commits)
sorted_commits = sorted(commit_counts.items(), key=lambda x: x[1], reverse=True)[:42]
# discussion: What is ("None", 103) ?

In [26]:
def package_activity():
    """Sum the per-file change counts per package two levels deep.

    Reads the module-level `commit_counts` mapping (path -> count), keeps the
    Python files, and adds each count to the package made of the first two
    components of the file's module name.

    Returns a defaultdict(int) mapping package name -> summed count.
    """
    activity = defaultdict(int)

    for path, count in commit_counts.items():
        # endswith instead of a substring test, so ".pyc", ".pyx" or
        # "x.py.bak" are not counted; str() guards against non-string keys
        # such as None.
        if str(path).endswith(".py"):
            l2_module = top_level_package(module_name_from_rel_path(path), 2)
            activity[l2_module] += count

    return activity

# NOTE: this rebinds `package_activity` from the function to its result, so
# running this cell a second time fails unless the function is redefined first.
package_activity = package_activity()
# packages ordered by summed change count, most active first
sorted_sizes = sorted(package_activity.items(), key=lambda x: x[1], reverse=True)

In [33]:
# Draw the abstracted package graph with per-node sizes.
# NOTE(review): `sizes` is not assigned in any visible cell -- presumably it
# was meant to be derived from package_activity for the nodes of ADG; note that
# ADG was abstracted at depth 3 while package_activity uses depth 2. Confirm.
plt.figure(figsize=(7,7))
nx.draw_networkx(ADG, with_labels=True, node_size = sizes, node_color='r')
plt.show()


NameError: name 'nx' is not defined

<Figure size 700x700 with 0 Axes>

## Dynamic Analysis
Not as relevant for this project.
Plan: I don't know yet whether I will do this.

In [32]:
import inspect

def methods_in_class(cls):
    """Return the callables defined directly on a class.

    Looks at cls.__dict__ only, so inherited members are not included.
    Returns a list of (name, member) tuples in definition order.
    """
    return [
        (name, member)
        for name, member in cls.__dict__.items()
        if callable(member)
    ]
    
def log_decorator( function ):
    """A decorator that logs the function on call.

    The returned wrapper prints the wrapped function, then calls it with the
    given arguments and returns its result.
    """
    # local import: keeps this cell runnable without touching the imports above
    from functools import wraps

    # wraps() copies __name__, __doc__ etc., so the decorated callable still
    # identifies itself as the original one
    @wraps(function)
    def decorated( *args, **kwargs ):
        print (f'I have been called: {function}')
        return function( *args,**kwargs )
    return decorated

def decorate_methods( cls, decorator ):
    """Replace every method found by methods_in_class with its decorated version.

    The class is modified in place and also returned.
    """
    for method_name, original in methods_in_class(cls):
        setattr(cls, method_name, decorator(original))
    return cls


def caller():
    """Call callee(), which then reports this function as its caller."""
    callee()

def callee():
    """Print the name of the function that called this one."""
    # entry 0 of the stack is callee itself, entry 1 is its direct caller
    calling_frame = inspect.stack()[1]
    print(calling_frame.function)

# Demo: callee() prints the name of its direct caller.
caller()



In [None]:
# Decorate the user class: every callable defined directly on User is wrapped
# so that each call gets printed.
# NOTE(review): presumably needs the zeeguu package to be importable and a
# configured database -- confirm before running.
from zeeguu.core.model import User
decorate_methods(User, log_decorator)

# Calls made from here on print the instrumented methods that are reached.
u= User.find_by_id(534)
u.bookmark_count()

# to see even further one can instrument also third party libraries!
from sqlalchemy.orm.query import Query
decorate_methods(Query, log_decorator)
