## Objective

This notebook aimes to automate the process of extracting and analyzing import statements from multiple Jupyter Notebooks within a GitHub repository. The main idea is to collect all the import statements from the code cells of these notebooks to check for dependency breakdowns and version conflicts upon an update in the virtual environment.

### Outline

- **GitHub authentication and repository access**

- **Extracting import statements**

- **Processing import statements and validation**:
 

In [1]:
import os
import json
from github import Github

%load_ext lab_black

### GitHub Authenticaton

Accessed the specified repository and retrieved all notebook files within it

In [None]:
# retrieve the GitHub token from environment variable
token = os.getenv("GITHUB_TOKEN_prj_pkg")

# authenticate to github
g = Github(token)

# get the authenticated user
user = g.get_user()

In [None]:
# print the authenticated user's login
print(f"Authenticated as: {user.login}")

# define the organization and repository name
org_name = "epfl-exts"  # Replace with the actual organization name
repo_name = "adsml-ibex"  # Replace with the actual repository name

# check if you find the repository
repo = g.get_repo(f"{org_name}/{repo_name}")
print(f"Found repository: {repo.full_name}")

Authenticated as: AmirKhalilzadeh
Found repository: epfl-exts/adsml-ibex


### Extracting import statements

Define functions to recursively get all notebook files and extract import statements from the code cells. Then collect all import statements from the notebooks and stored them in a list.

In [None]:
# recursively get all notebook files in a repository
def get_notebook_files(repo, path=""):
    contents = repo.get_contents(path)
    notebooks = []
    for content_file in contents:
        if content_file.type == "dir":
            notebooks.extend(get_notebook_files(repo, content_file.path))
        elif content_file.name == "notebook.ipynb":
            notebooks.append(content_file.path)
    return notebooks


# extract import statements from a notebook
def extract_import_statements(notebook_content):
    import_statements = []
    for cell in notebook_content["cells"]:
        if cell["cell_type"] == "code":
            for line in cell["source"]:
                if line.startswith("import") or line.startswith("from"):
                    import_statements.append(line.strip())
    return import_statements

In [5]:
# Get all notebook files in the repository
notebook_files = get_notebook_files(repo)

# Extract the base names of the notebook files
notebook_names = [os.path.basename(file) for file in notebook_files]

# Print the unique names
display(set(notebook_names))
len(notebook_names)

{'notebook.ipynb'}

346

### Processing import statements

In [6]:
# Collect all import statements
all_import_statements = []
for notebook_file in notebook_files:
    file_content = repo.get_contents(notebook_file).decoded_content.decode("utf-8")
    notebook_content = json.loads(file_content)
    all_import_statements.extend(extract_import_statements(notebook_content))

set(all_import_statements)

{'from IPython.display import display',
 'from PIL import Image',
 'from bs4 import BeautifulSoup',
 'from collections import Counter',
 'from components import QuizzComponent',
 'from gensim.models import Phrases',
 'from itertools import product',
 'from matplotlib import pyplot as plt',
 'from matplotlib.patches import Ellipse',
 'from mpl_toolkits.mplot3d import Axes3D',
 'from nltk.corpus import stopwords',
 'from nltk.stem.porter import PorterStemmer',
 'from nltk.tokenize import TreebankWordTokenizer',
 'from nltk.tokenize import sent_tokenize',
 'from pandas import json_normalize',
 'from pandas.plotting import autocorrelation_plot, lag_plot',
 'from pandas.plotting import lag_plot',
 'from pandas.tseries.offsets import *',
 'from scipy import stats',
 'from scipy.io import wavfile',
 'from scipy.linalg import lstsq',
 'from scipy.signal import hilbert',
 'from scipy.stats import skew, kurtosis',
 'from scipy.stats import zscore',
 'from sklearn import datasets',
 'from sklearn

In [7]:
display(len(set(all_import_statements)))

# Remove specific import statements
all_import_statements = [
    stmt
    for stmt in all_import_statements
    if stmt != "from components import QuizzComponent" and stmt != "import w, sys"
]

# Display the updated list
len(set(all_import_statements))

150

148

In [8]:
# Convert the list to a set to get unique import statements
unique_import_statements = set(all_import_statements)

# Save the unique import statements to a file
with open("unique_import_statements.txt", "w") as file:
    for statement in unique_import_statements:
        file.write(statement + "\n")

### Validation

Activate the environment and check whether the import statements are fine

In [1]:
# load unique_import_statements.txt file
with open("unique_import_statements.txt", "r") as file:
    unique_import_statements = file.readlines()

len(unique_import_statements)

146

In [2]:
# check if import statements work
def check_imports(import_statements):
	for statement in import_statements:
		try:
			exec(statement)
			print(f"Successfully imported: {statement}")
		except Exception as e:
			print(f"Failed to import: {statement} - Error: {e}")

# check all collected import statements
check_imports(set(unique_import_statements))




Successfully imported: from sklearn.preprocessing import PolynomialFeatures

Successfully imported: from nltk.corpus import stopwords

Successfully imported: from scipy.linalg import lstsq

Successfully imported: from sklearn.metrics import r2_score

Successfully imported: import random

Successfully imported: import pickle

Successfully imported: from sklearn.preprocessing import scale

Successfully imported: from sklearn.metrics import confusion_matrix

Successfully imported: from sklearn.metrics import roc_curve

Successfully imported: import tensorflow as tf

Failed to import: import noisereduce as nr
 - Error: No module named 'torch'
Successfully imported: from tensorflow import keras

Successfully imported: import IPython.display as ipd

Successfully imported: from sklearn.tree import DecisionTreeClassifier

Successfully imported: import os

Successfully imported: import numpy as np

Successfully imported: from sklearn.ensemble import RandomForestRegressor

Successfully imported:

In [None]:
# there are two failures:
# Failed to import: import tensorflow_text
#  - Error: No module named 'tensorflow_text'

#  Failed to import: from tensorflow.keras.layers import (Conv2D, BatchNormalization, Dropout,
#  - Error: unexpected EOF while parsing (<string>, line 1)

# under the current env we have the following failure too:
# Failed to import: import noisereduce as nr
#  - Error: No module named 'torch'