# Finding relevant code changes in frameworks and packages

The goal of this notebook is to track evolving code bases by first extracting changes made via the git log. These can then be filtered for the correct timeframe and relevant functions. The next step would then be to analyse the changes and decide whether they are relevant to a developer that uses that part of the code for differential testing or not.

## Imports

In [14]:
import os
import inspect
import pandas as pd
from datetime import date, timedelta
import sys
import subprocess
from IPython.display import display, HTML
from tqdm import tqdm

#import numpy as np
#from scipy import stats

## Setup: User Input

* The user inputs the package that they would like to update and the Deep Learning Library. 
* They then input the current version of the package that the DLL is using and the one that they would like to upgrade to (default: most recent version). The version is here simplified to release dates for now, since this is easier to handle for git diff.
* If the Github Link for that package is not stored, they then input the Github Link for that package.


In [2]:
# Set root folder for all libraries:
dl_library_root = "/Users/Alex/Desktop/BachelorThesis/DLL_Testing_Tool/DL_Libraries/"

In [3]:
# Input 1: Package name
package_name = 'tensorflow_1.12.0'

# Input 2: Deep Learning Library name and directory
dll_name = 'tensorflow_1.12.0'
dll_directory = dl_library_root + 'Tensorflow/tensorflow-1.12.0/tensorflow/python/'

# Input 3: Current version(i.e. date for simplicity) of the package (and optionally the desired version)
# Format: date(Year, month, day)
current_version_date = date(2018,11,6) # release date of TF 1.12.0
desired_version_date = date(2019,2,25) # release date of TF 1.13.1

# Input 4: Github Link of package (if not stored by the tool)
git_url = "https://github.com/tensorflow/tensorflow.git"
#git_url = 'https://github.com/keras-team/keras.git'

In [None]:
# Input 1: Package name
package_name = 'keras'

# Input 2: Deep Learning Library name and directory
dll_name = 'tensorflow_1.12.0'
dll_directory = dl_library_root + 'Tensorflow/tensorflow-1.12.0/tensorflow/python/'

# Input 3: Current version (i.e. date for simplicity) of the package (and optionally the desired version)
current_version_date = date(2021,1,1)
desired_version_date = date.today()

# Input 4: Github Link of package (if not stored by the tool)
#git_url = "https://github.com/tensorflow/tensorflow.git"
git_url = 'https://github.com/keras-team/keras.git'

In [None]:
# Input 1: Package name
package_name = 'scipy'

# Input 2: Deep Learning Library name and directory
dll_name = 'theano'
dll_directory = 'A:/BachelorThesis/DLL_Testing_Tool/DL_Libraries/Theano-rel-1.0.3/theano/'

# Input 3: Current version (i.e. date for simplicity) of the package (and optionally the desired version)
current_version_date = date(2018,1,1)
desired_version_date = date.today()

# Input 4: Github Link of package (if not stored by the tool)
git_url = 'https://github.com/scipy/scipy.git'

In [None]:
# Input 1: Package name
package_name = 'scipy'

# Input 2: Deep Learning Library name and directory
dll_name = 'tensorflow'
dll_directory = dl_library_root + 'Tensorflow/tensorflow-2.6.0/tensorflow/python/'

# Input 3: Current version (i.e. date for simplicity) of the package (and optionally the desired version)
current_version_date = date(2018,1,1)
desired_version_date = date.today()

# Input 4: Github Link of package (if not stored by the tool)
git_url = 'https://github.com/scipy/scipy.git'

In [None]:
# Input 1: Package name
package_name = 'np'

# Input 2: Deep Learning Library name and directory
dll_name = 'pytorch'
dll_directory = 'A:/BachelorThesis/DLL_Testing_Tool/DL_Libraries/PyTorch/pytorch-master/'

# Input 3: Current version (i.e. date for simplicity) of the package (and optionally the desired version)
current_version_date = date(2021,6,1)
desired_version_date = date.today()

# Input 4: Github Link of package (if not stored by the tool)
git_url = 'https://github.com/numpy/numpy.git'

In [None]:
# Input 1: Package name
package_name = 'scipy'

# Input 2: Deep Learning Library name and directory
dll_name = 'pytorch'
dll_directory = 'A:/BachelorThesis/DLL_Testing_Tool/DL_Libraries/PyTorch/pytorch-master/'

# Input 3: Current version (i.e. date for simplicity) of the package (and optionally the desired version)
current_version_date = date(2021,6,1)
desired_version_date = date.today()

# Input 4: Github Link of package (if not stored by the tool)
git_url = 'https://github.com/scipy/scipy.git'

In [None]:
# Input 1: Package name
package_name = 'stats'

# Input 2: Deep Learning Library name and directory
dll_name = 'numpy'
dll_directory = 'A:/BachelorThesis/DLL_Testing_Tool/DL_Libraries/Numpy/numpy-main/'

# Input 3: Current version (i.e. date for simplicity) of the package (and optionally the desired version)
current_version_date = date(2021,6,1)
desired_version_date = date.today()

# Input 4: Github Link of package (if not stored by the tool)
git_url = 'https://github.com/scipy/scipy.git'

In [4]:
# Import the package that should be upgraded (used to find the files where extracted functions are defined)
from tensorflow import keras
#import keras
#import scipy
#import numpy as np
#from scipy import stats

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


In [None]:
#!{sys.executable} -m pip install scipy

### Tools internal processing of the inputs

In [5]:
# TODO Check inputs for validity (i.e. does dll directory exist, is date in the correct format, is package known (for git url))

# Setup folder names
clone_folder_name = 'temp_bare_clone_' + package_name

## Create a bare clone of the library, which only includes repository data

In this way, we do not have to download the code, but still get access to the commit log.

In [22]:
%cd ..

/Users/Alex/Desktop/BachelorThesis/DLL_Testing_Tool/Code/2_Commit_Extraction_and_Analysis


In [23]:
# create a temporary directory for a bare clone of a give library
try:
    os.mkdir(clone_folder_name)
except:
    pass

In [17]:
# Only execute this if the clone was not yet created
if len(os.listdir(clone_folder_name)) == 0:

    # create the bare clone
    !git clone --bare {git_url} {clone_folder_name}

Cloning into bare repository 'temp_bare_clone_tensorflow_1.12.0'...
remote: Enumerating objects: 1225175, done.[K
remote: Counting objects: 100% (16/16), done.[K
remote: Compressing objects: 100% (11/11), done.[K
remote: Total 1225175 (delta 5), reused 14 (delta 5), pack-reused 1225159[K
Receiving objects: 100% (1225175/1225175), 753.82 MiB | 10.83 MiB/s, done.
Resolving deltas: 100% (996869/996869), done.


In [24]:
%cd {clone_folder_name}

/Users/Alex/Desktop/BachelorThesis/DLL_Testing_Tool/Code/2_Commit_Extraction_and_Analysis/temp_bare_clone_tensorflow_1.12.0


## Import the extraction data 

In [25]:
# import extracted test case data
df = pd.read_csv('../../1_Test_Case_Extraction_and_Analysis/extracted_data/{}_data.csv'.format(dll_name))

#for funcs in df.Differential_Test_Function.unique():
#    print(funcs)
#    if 'stats' in str(funcs):
#        print(funcs)

## Filter for only functions of the package




In [26]:
# For tensorflow 1.12.0, comment this line in:
package_name = 'keras'

# As a temporary solution, we will filter these for functions that contain 'package_name.' specifically
column_to_filter = 'Differential_Test_Function'
filter_keyword = package_name + '\.'

relevant_test_cases = df[df[column_to_filter].str.contains(filter_keyword, na=False)]
relevant_test_cases_unique = relevant_test_cases.Differential_Test_Function.unique()

# For demonstration: Test cases found in rnn_test.py (TF 1.12.0):
# Windows:
demo_test_cases = relevant_test_cases[relevant_test_cases.File_Path.str.contains(os.sep + 'rnn_test.py', regex=False)]

# Mac:
demo_test_cases = relevant_test_cases[relevant_test_cases.File_Path.str.contains('\\rnn_test.py', regex=False)]
#demo_extracted_functions = demo_test_cases.Differential_Test_Function.unique()
relevant_test_cases = demo_test_cases

#package_name = 'tensorflow'

relevant_test_cases

Unnamed: 0,File_Path,Line_Number,Found_in_Function,Function_Definition_Line_Number,Assert_Statement_Type,Oracle_Argument_ Position,Differential_Function_Line_Number,Differential_Test_Function
26161,kernel_tests\rnn_test.py,344,testRNNWithKerasSimpleRNNCell,323,assertEqual,1,335,keras.layers.SimpleRNNCell
26164,kernel_tests\rnn_test.py,345,testRNNWithKerasSimpleRNNCell,323,assertEqual,1,335,keras.layers.SimpleRNNCell
26168,kernel_tests\rnn_test.py,353,testRNNWithKerasSimpleRNNCell,323,assertEqual,1,335,keras.layers.SimpleRNNCell
26176,kernel_tests\rnn_test.py,354,testRNNWithKerasSimpleRNNCell,323,assertEqual,1,335,keras.layers.SimpleRNNCell
26184,kernel_tests\rnn_test.py,377,testRNNWithKerasGRUCell,356,assertEqual,1,368,keras.layers.GRUCell
26187,kernel_tests\rnn_test.py,378,testRNNWithKerasGRUCell,356,assertEqual,1,368,keras.layers.GRUCell
26191,kernel_tests\rnn_test.py,386,testRNNWithKerasGRUCell,356,assertEqual,1,368,keras.layers.GRUCell
26199,kernel_tests\rnn_test.py,387,testRNNWithKerasGRUCell,356,assertEqual,1,368,keras.layers.GRUCell
26207,kernel_tests\rnn_test.py,410,testRNNWithKerasLSTMCell,389,assertEqual,1,401,keras.layers.LSTMCell
26210,kernel_tests\rnn_test.py,411,testRNNWithKerasLSTMCell,389,assertEqual,1,401,keras.layers.LSTMCell


In [None]:
filter_keyword

## Getting a git diff of the current version of the extracted function and the desired version.

Procedure:
1. For a single extracted function, get the file it is defined in
2. Use git log to extract the commit id of the current version and the desired version
3. Perform a git diff, comparing the extracted file in those two commits  
4. (Selecting only the parts of the git diff that concern the extracted function)

In [34]:
# this is to correctly remove the user specific part of the file path, e.g. /Users/Alex/Desktop etc. 
# from the extracted functions source file location
package_name_in_root = 'tensorflow'
doc_name_ending = '_test'

def get_function_file_location(extracted_function, _package_name='tensorflow'):
    """For step 1. Find where the function is defined."""
    
    # use the extracted_function string as if it were code, since 'inspect' can't deal with strings
    str_to_execute = 'extracted_function_file_location = inspect.getsourcefile({})'.format(extracted_function)
    
    # get local scope (necessary since exec does not work well inside of function definition scopes)
    lcls = locals()
    
    # execute the string as if it were code, setting the file location variable in the local scope
    exec(str_to_execute, globals(), lcls)
    
    # getting the variable from the local scope
    extracted_function_file_location = lcls["extracted_function_file_location"]
    
    print(extracted_function_file_location)
    
    # remove the package root to get the relative file path 
    package_root_index = extracted_function_file_location.index(_package_name)
    extracted_function_file_location = extracted_function_file_location[package_root_index:]
    
    return extracted_function_file_location


def get_nearest_commit(version_date):
    """For step 2. Return commit ID and message of the nearest commit on or before version_date."""
    git_log_output = ''
    days = 1
    while git_log_output == '':
        git_log_command = ["git", "log", "--since", (version_date-timedelta(days=days)).strftime("%m-%d-%Y"), "--until", version_date.strftime("%d-%m-%Y")]
        #, "--", extracted_function_file_location]
        git_log_output = subprocess.run(git_log_command, stdout=subprocess.PIPE).stdout.decode('utf-8')
        
        #print("-" + str(days) + " " + git_log_output)
        
        days += 1
        
        # exit condition for when search takes too long
        if days > 100:
            return 'ERROR', 'No commit within 100 days of the entered date.', version_date
            

    commit_id = git_log_output[7:].splitlines()[0]
    
    commit_message_command = ["git", "log", "--format=%B", "-n", "1", commit_id]
    commit_message = subprocess.run(commit_message_command, stdout=subprocess.PIPE).stdout.decode('utf-8')
    
    commit_date = version_date-timedelta(days=days-2)
    
    return commit_id, commit_message, commit_date


def format_line_beginning(line):
    line_beginning = []
    for char in line:
        if char == ' ':
            line_beginning.append('&nbsp')
        else:
            break

    separator = ' '
    formatted_line = separator.join(line_beginning)
    formatted_line += line.lstrip()
    
    return formatted_line


def get_git_diff_output_formatted(commit_id_current, commit_id_desired, extracted_function_file_location):
    git_diff_command = ["git", "diff", commit_id_current, commit_id_desired, "--", extracted_function_file_location]

    git_diff_output = subprocess.run(git_diff_command, stdout=subprocess.PIPE).stdout.decode('utf-8')
    
    git_diff_processed = ''
    for line in git_diff_output.splitlines():
        if line.startswith('-'):
            line = line[1:]
            git_diff_processed += "<span style=\"color:red\">- " + format_line_beginning(line) + "</span>\n"
        
        elif line.startswith('+'):
            line = line[1:]
            git_diff_processed += "<span style=\"color:green\">+" + format_line_beginning(line) + "</span>\n"
        
        elif line.startswith(' '):
            git_diff_processed += format_line_beginning(line) + "\n"
            
        else:
            git_diff_processed += line + "\n"
    
    # formatting for html
    git_diff_processed = git_diff_processed.replace('\n', '\n<br>')#.replace(' ', '&nbsp ')
    
    return git_diff_processed



tool_output_destination = "../tool_output/tool_output_{}_{}{}.html".format(dll_name, package_name, doc_name_ending)
tool_output = open(tool_output_destination, "w+", encoding='utf-8')
tool_output.write("""
    <!DOCTYPE html>
    <html>
    <head>
    <style>
    .collapsible {
      background-color: #777;
      color: white;
      cursor: pointer;
      padding: 18px;
      width: 100%;
      border: none;
      text-align: left;
      outline: none;
      font-size: 15px;
    }

    .active, .collapsible:hover {
      background-color: #555;
    }

    .content {
      padding: 0 18px;
      display: none;
      overflow: hidden;
      background-color: #f1f1f1;
    }
    </style>
    </head>
    <body>\n
""")

error_list = []
extr_func_file_location_list = []

for extracted_function in relevant_test_cases.Differential_Test_Function:

    # 1:   
    try:
        extracted_function_file_location = get_function_file_location(extracted_function, _package_name=package_name_in_root)
        #print(extracted_function_file_location)
    except Exception as exc:
        error_list.append(extracted_function + " : " + str(exc))
        extr_func_file_location_list.append("ERROR")
        continue
    
    extr_func_file_location_list.append(extracted_function_file_location)
    
relevant_test_cases.loc[:, 'Extracted_Function_File_Location'] = extr_func_file_location_list
#.insert(-1, 'Extracted_Function_File_Location', extr_func_file_location_list)

#display(relevant_test_cases)


# 2:, commit_date_current
#commit_id_current, commit_message_current = get_nearest_commit(current_version_date)
commit_id_current, commit_message_current, commit_date_current = get_nearest_commit(current_version_date)
tool_output.write("\n <br>Commit id closest to current version: " + commit_id_current + "\n<br>Date: " + commit_date_current.strftime("%d-%b-%Y") + "\n")
tool_output.write("\n <br>Commit message: " + commit_message_current.replace('\n', '<br>') + "\n")

commit_id_desired, commit_message_desired, commit_date_desired = get_nearest_commit(desired_version_date)
tool_output.write("<br>Commit id closest to desired version: " + commit_id_desired + "\n<br>Date: " + commit_date_desired.strftime("%d-%b-%Y") + "\n")
tool_output.write("\n <br>Commit message: " + commit_message_desired.replace('\n', '<br>') + "\n<br>")


for extracted_function_file_location in tqdm(relevant_test_cases.Extracted_Function_File_Location.unique()):
    
    tool_output.write("_____________________________________" + extracted_function_file_location + "_________________________________________\n")
    
    tool_output.write(relevant_test_cases[relevant_test_cases['Extracted_Function_File_Location'] == extracted_function_file_location].to_html())
    tool_output.write("\n<br>")
    
    
    # 3:
    git_diff_processed = get_git_diff_output_formatted(commit_id_current, commit_id_desired, extracted_function_file_location)
    
    # also include the git diff of the file that the test case was found in:
    #git_diff_processed += "\n<br>" + get_git_diff_output_formatted(commit_id_current, commit_id_desired, 'tensorflow/python/kernel_tests/rnn_test.py')
    
    # add git diff as collapsible section
    tool_output.write("<button type=\"button\" class=\"collapsible\">Git Diff</button>\n<div class=\"content\">\n<p>" + git_diff_processed + "</p>\n</div>\n<br><br><br>")

# Add script to html to make git diff collapsible
tool_output.write("""
<br>
<script>
var coll = document.getElementsByClassName("collapsible");
var i;

for (i = 0; i < coll.length; i++) {
  coll[i].addEventListener("click", function() {
    this.classList.toggle("active");
    var content = this.nextElementSibling;
    if (content.style.display === "block") {
      content.style.display = "none";
    } else {
      content.style.display = "block";
    }
  });
}
</script>
</body>
</html>""")
tool_output.close()
print(str(len(error_list)) + " errors: " + str(error_list))
print("Tool output saved to " + tool_output_destination)

/opt/miniconda3/envs/python36_env/lib/python3.6/site-packages/tensorflow/python/keras/layers/recurrent.py
/opt/miniconda3/envs/python36_env/lib/python3.6/site-packages/tensorflow/python/keras/layers/recurrent.py
/opt/miniconda3/envs/python36_env/lib/python3.6/site-packages/tensorflow/python/keras/layers/recurrent.py
/opt/miniconda3/envs/python36_env/lib/python3.6/site-packages/tensorflow/python/keras/layers/recurrent.py
/opt/miniconda3/envs/python36_env/lib/python3.6/site-packages/tensorflow/python/keras/layers/recurrent.py
/opt/miniconda3/envs/python36_env/lib/python3.6/site-packages/tensorflow/python/keras/layers/recurrent.py
/opt/miniconda3/envs/python36_env/lib/python3.6/site-packages/tensorflow/python/keras/layers/recurrent.py
/opt/miniconda3/envs/python36_env/lib/python3.6/site-packages/tensorflow/python/keras/layers/recurrent.py
/opt/miniconda3/envs/python36_env/lib/python3.6/site-packages/tensorflow/python/keras/layers/recurrent.py
/opt/miniconda3/envs/python36_env/lib/python3.

100%|██████████| 3/3 [00:00<00:00, 15.79it/s]

0 errors: []
Tool output saved to ../tool_output/tool_output_tensorflow_1.12.0_keras_test.html





In [35]:
# open tool output
!open {tool_output_destination}

In [33]:
current_version_date = date(2019,10,7)#date(2018,11,6) # release date of TF 1.12.0 is 2018, 11, 6
desired_version_date = date(2019,10,18)

## Install section (for testing)

In [None]:
!python --version

In [None]:
# install the package (TODO if not already installed)
!{sys.executable} -m pip install {package_name}==2.2.4

In [None]:
!{sys.executable} -m pip install theano==1.0.3

In [None]:
!{sys.executable} -m pip install tensorflow==1.12.0

In [None]:
%pip install tensorflow==1.12.0

In [None]:
!{sys.executable} -m pip show keras

In [None]:
!{sys.executable} --version

In [None]:
sys.executable

In [None]:
!python -V

In [None]:
inspect.getsourcefile(scipy.linalg.cholesky)

## TESTING SECTION:

In [None]:
from scipy import *

In [None]:
# OLD CODE GIT LOG SECTION

# 1:

# get extracted function as string
#extracted_function = relevant_test_cases.iloc[case_id]['Differential_Test_Function']

# get the package root and remove it from the file path. This relative file path is necessary for a git diff
#package_root = ''
#exec('package_root = inspect.getsourcefile({})'.format(package_name))
# remove the init.py part from the path
#package_root = package_root.replace('__init__.py', '')
#print(package_root)


In [None]:
!start .
#os.system("git log --oneline -- {extracted_function_file_location}")
# --since "20-06-2021 00:00:00" -p

In [None]:
# helper for finding where functions are defined
print(inspect.getsourcefile(np.sum) + "\n")
#print(inspect.getsource(np.array))

In [None]:
# Different git urls:
#git_url = "https://github.com/pytorch/pytorch.git"
#git_url = "https://github.com/scipy/scipy.git"
#git_url = "https://github.com/keras-team/keras.git"

### Testing git log functions

-p shows the diffs

Hunks of differences are in the format @@ from-file-range to-file-range @@ [header].  
The from-file-range is in the form -\<start line\>,\<number of lines\>, and to-file-range is +\<start line\>,\<number of lines\>

In [None]:
#command = ["git", "log", "--oneline", "--name-only", "--since", current_version_date, "--until", desired_version_date, "--", extracted_function_file_location]
#command = ["git", "log", "--oneline", "--since", current_version_date, "--until", desired_version_date, "--", extracted_function_file_location]
#command = ["git", "log", "--oneline", "--since", current_version_date-timedelta(days=1), "--until", current_version_date, "--", extracted_function_file_location]
#command = ["git", "log", "--oneline", "--", extracted_function_file_location]

In [None]:
!git log --oneline -- tensorflow\\python\\keras\\layers\\recurrent.py

In [None]:
!git log --since 10-6-2019 --until 10-7-2019

In [None]:
!git diff 47c368bcc8d717ec6624d33152784338b99f6dea 8a05bdf333f34603b33c0f3a029e023deb27ae04 -- tensorflow/python/keras/layers/recurrent.py

In [None]:
extracted_function_file_location

In [None]:
!git log --since="3 hours ago" --pretty=oneline

In [None]:
!git log --name-only --date=local --since "20-06-2021 00:00:00" 

In [None]:
!git log --name-only --oneline --since "20-06-2021 00:00:00"

In [None]:
!git log --name-only --oneline --since "20-06-2021 00:00:00"
#--since "20-06-2021 00:00:00" -p -- scipy/special/_basic.py