# Finding relevant code changes in frameworks and packages

The goal of this notebook is to track evolving code bases by first extracting changes made via the git log. These can then be filtered for the correct timeframe and relevant functions. The next step would then be to analyse the changes and decide whether they are relevant to a developer that uses that part of the code for differential testing or not.

## Imports

In [2]:
import os
import inspect
import pandas as pd
from datetime import date, timedelta
import sys
import subprocess
from IPython.display import display, HTML

import numpy as np
#from scipy import stats

## Setup: User Input

* The user inputs the package that they would like to update and the Deep Learning Library. 
* They then inputs the current version of the package that the DLL is using and the one that they would like to upgrade to (default: most recent version). The version is here simplified to release dates for now, since this is easier to handle for git diff.
* If the Github Link for that package is not stored, they then input the Github Link for that package.


In [3]:
# Input 1: Package name
package_name = 'tensorflow'

# Input 2: Deep Learning Library name and directory
dll_name = 'tensorflow'
dll_directory = 'A:/BachelorThesis/DLL_Testing_Tool/DL_Libraries/Tensorflow/tensorflow-1.12.0/tensorflow/python/'

# Input 3: Current version (i.e. date for simplicity) of the package (and optionally the desired version)
current_version_date = date(2021,1,1)
desired_version_date = date.today()

# Input 4: Github Link (if not stored by the tool)
git_url = "https://github.com/tensorflow/tensorflow.git"
#git_url = 'https://github.com/keras-team/keras.git'

In [4]:
# Import the package that should be upgraded (used to find the files where extracted functions are defined)
from tensorflow import keras
#import keras

  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])


### Tools internal processing of the inputs

In [5]:
# TODO Check inputs for validity (i.e. does dll directory exist, is date in the correct format, is package known (for git url))

# Setup folder names
clone_folder_name = 'temp_bare_clone_' + package_name

## Create a bare clone of the library, which only includes repository data

In this way, we do not have to download the code, but still get access to the commit log.

In [6]:
# create a temporary directory for a bare clone of a give library
try:
    os.mkdir(clone_folder_name)
except:
    pass

In [8]:
# Only execute this if the clone was not yet created
if len(os.listdir(clone_folder_name)) == 0:

    # create the bare clone
    !git clone --bare {git_url} {clone_folder_name}

In [9]:
%cd {clone_folder_name}

A:\BachelorThesis\DLL_Testing_Tool\Code\2_Commit_Extraction_and_Analysis\temp_bare_clone_tensorflow


## Import the extraction data 

In [10]:
# import tensorflow 1.12.0 data
df_tensorflow_1_12_0 = pd.read_csv('../../1_Test_Case_Extraction_and_Analysis/extracted_data/tensorflow_1.12.0_data.csv')

## Filter for only functions of the package




In [12]:
package_name = 'keras'
# As a temporary solution, we will filter these for functions that contain 'package_name.' specifically
column_to_filter = 'Differential_Test_Function'
filter_keyword = package_name + '\.'

relevant_test_cases = df_tensorflow_1_12_0[df_tensorflow_1_12_0[column_to_filter].str.contains(filter_keyword, na=False)]
relevant_test_cases_unique = relevant_test_cases.Differential_Test_Function.unique()

# For demonstration: Test cases found in rnn_test.py (TF 1.12.0):
demo_test_cases = relevant_test_cases[relevant_test_cases.File_Path.str.contains(os.sep + 'rnn_test.py', regex=False)]
demo_extracted_functions = demo_test_cases.Differential_Test_Function.unique()

#relevant_test_cases
demo_test_cases

Unnamed: 0,File_Path,Line_Number,Found_in_Function,Function_Definition_Line_Number,Assert_Statement_Type,Oracle_Argument_ Position,Differential_Function_Line_Number,Differential_Test_Function
26161,kernel_tests\rnn_test.py,344,testRNNWithKerasSimpleRNNCell,323,assertEqual,1,335,keras.layers.SimpleRNNCell
26164,kernel_tests\rnn_test.py,345,testRNNWithKerasSimpleRNNCell,323,assertEqual,1,335,keras.layers.SimpleRNNCell
26168,kernel_tests\rnn_test.py,353,testRNNWithKerasSimpleRNNCell,323,assertEqual,1,335,keras.layers.SimpleRNNCell
26176,kernel_tests\rnn_test.py,354,testRNNWithKerasSimpleRNNCell,323,assertEqual,1,335,keras.layers.SimpleRNNCell
26184,kernel_tests\rnn_test.py,377,testRNNWithKerasGRUCell,356,assertEqual,1,368,keras.layers.GRUCell
26187,kernel_tests\rnn_test.py,378,testRNNWithKerasGRUCell,356,assertEqual,1,368,keras.layers.GRUCell
26191,kernel_tests\rnn_test.py,386,testRNNWithKerasGRUCell,356,assertEqual,1,368,keras.layers.GRUCell
26199,kernel_tests\rnn_test.py,387,testRNNWithKerasGRUCell,356,assertEqual,1,368,keras.layers.GRUCell
26207,kernel_tests\rnn_test.py,410,testRNNWithKerasLSTMCell,389,assertEqual,1,401,keras.layers.LSTMCell
26210,kernel_tests\rnn_test.py,411,testRNNWithKerasLSTMCell,389,assertEqual,1,401,keras.layers.LSTMCell


## Getting a git diff of the current version of the extracted function and the desired version.

Procedure:
1. For a single extracted function, get the file it is defined in
2. Use git log to extract the commit id of the current version and the desired version
3. Perform a git diff, comparing the extracted file in those two commits  
4. (Selecting only the parts of the git diff that concern the extracted function)

In [13]:
def get_function_file_location(extracted_function, _package_name='tensorflow'):
    """For step 1. Find where the function is defined."""
    
    # use the extracted_function string as if it were code, since 'inspect' can't deal with strings
    str_to_execute = 'extracted_function_file_location = inspect.getsourcefile({})'.format(extracted_function)
    
    # get local scope (necessary since exec does not work well inside of function definition scopes)
    lcls = locals()
    
    # execute the string as if it were code, setting the file location variable in the local scope
    exec(str_to_execute, globals(), lcls)
    
    # getting the variable from the local scope
    extracted_function_file_location = lcls["extracted_function_file_location"]
    
    # remove the package root to get the relative file path 
    package_root_index = extracted_function_file_location.index(_package_name)
    extracted_function_file_location = extracted_function_file_location[package_root_index:]
    
    return extracted_function_file_location


def get_nearest_commit(version_date, extracted_function_file_location):
    """For step 2. Return commit ID and message of the nearest commit on or before version_date."""
    git_log_output = ''
    days = 1
    while git_log_output == '':
        git_log_command = ["git", "log", "--since", (version_date-timedelta(days=days)).strftime("%d-%m-%Y"), "--until", version_date.strftime("%d-%m-%Y"), "--", extracted_function_file_location]
        git_log_output = subprocess.run(git_log_command, stdout=subprocess.PIPE).stdout.decode('utf-8')

        days += 1

    commit_id = git_log_output[7:].splitlines()[0]
    
    commit_message_command = ["git", "log", "--format=%B", "-n", "1", commit_id]
    commit_message = subprocess.run(commit_message_command, stdout=subprocess.PIPE).stdout.decode('utf-8')
    
    return commit_id, commit_message


def format_line_beginning(line):
    line_beginning = []
    for char in line:
        if char == ' ':
            line_beginning.append('&nbsp')
        else:
            break

    separator = ' '
    formatted_line = separator.join(line_beginning)
    formatted_line += line.lstrip()
    
    return formatted_line


tool_output = open("../tool_output/tool_output.html", "w+", encoding='utf-8')
tool_output.write("<!DOCTYPE html>\n<html>\n<body>\n")

error_list = []
extr_func_file_location_list = []

for extracted_function in demo_test_cases.Differential_Test_Function:

    # 1:   
    try:
        extracted_function_file_location = get_function_file_location(extracted_function)
        
    except:
        error_list.append(extracted_function)
        continue
    
    extr_func_file_location_list.append(extracted_function_file_location)
    
demo_test_cases.loc[:, 'Extracted_Function_File_Location'] = extr_func_file_location_list
#.insert(-1, 'Extracted_Function_File_Location', extr_func_file_location_list)

#display(demo_test_cases)


for extracted_function_file_location in demo_test_cases.Extracted_Function_File_Location.unique():
    
    tool_output.write("_____________________________________" + extracted_function_file_location + "_________________________________________\n")
    
    tool_output.write(demo_test_cases[demo_test_cases['Extracted_Function_File_Location'] == extracted_function_file_location].to_html())
    
    
    # 2:
    commit_id_current, commit_message_current = get_nearest_commit(current_version_date, extracted_function_file_location)
    tool_output.write("\n <br>Commit id closest to current version: " + commit_id_current + "\n")
    tool_output.write("\n <br>Commit message: " + commit_message_current.replace('\n', '<br>') + "\n")
    
    commit_id_desired, commit_message_desired = get_nearest_commit(desired_version_date, extracted_function_file_location)
    tool_output.write("<br>Commit id closest to desired version: " + commit_id_desired + "\n")
    tool_output.write("\n <br>Commit message: " + commit_message_desired.replace('\n', '<br>') + "\n<br>")
    

    # 3:
    git_diff_command = ["git", "diff", commit_id_current, commit_id_desired, "--", extracted_function_file_location]

    git_diff_output = subprocess.run(git_diff_command, stdout=subprocess.PIPE).stdout.decode('utf-8')
    
    git_diff_processed = ''
    for line in git_diff_output.splitlines():
        if line.startswith('-'):
            line = line[1:]
            git_diff_processed += "<span style=\"color:red\">- " + format_line_beginning(line) + "</span>\n"
        
        elif line.startswith('+'):
            line = line[1:]
            git_diff_processed += "<span style=\"color:green\">+" + format_line_beginning(line) + "</span>\n"
        
        elif line.startswith(' '):
            git_diff_processed += format_line_beginning(line) + "\n"
            
        else:
            git_diff_processed += line + "\n"
    
    # formatting for html
    git_diff_processed = git_diff_processed.replace('\n', '\n<br>')#.replace(' ', '&nbsp ')
    
    tool_output.write("<p>" + git_diff_processed + "</p>")

tool_output.write("<br>\n</body>\n</html>")
tool_output.close()
print(str(len(error_list)) + " errors: " + str(error_list))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[key] = _infer_fill_value(value)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  isetter(ilocs[0], value)


0 errors: []


## Install section (for testing)

In [None]:
!python --version

In [None]:
# install the package (TODO if not already installed)
!{sys.executable} -m pip install {package_name}==2.2.4

In [None]:
!{sys.executable} -m pip install tensorflow

In [None]:
%pip install tensorflow==1.12.0

In [None]:
!{sys.executable} -m pip show keras

In [None]:
!{sys.executable} --version

In [None]:
sys.executable

## TESTING SECTION:

In [None]:
# OLD CODE GIT LOG SECTION

# 1:

# get extracted function as string
#extracted_function = relevant_test_cases.iloc[case_id]['Differential_Test_Function']

# get the package root and remove it from the file path. This relative file path is necessary for a git diff
#package_root = ''
#exec('package_root = inspect.getsourcefile({})'.format(package_name))
# remove the init.py part from the path
#package_root = package_root.replace('__init__.py', '')
#print(package_root)


In [83]:
!start .
#os.system("git log --oneline -- {extracted_function_file_location}")
# --since "20-06-2021 00:00:00" -p

In [None]:
# helper for finding where functions are defined
print(inspect.getsourcefile(stats.t.logpdf) + "\n")
print(inspect.getsource(stats.t.logpdf))

In [None]:
# Different git urls:
#git_url = "https://github.com/pytorch/pytorch.git"
#git_url = "https://github.com/scipy/scipy.git"
#git_url = "https://github.com/keras-team/keras.git"

### Testing git log functions

-p shows the diffs

Hunks of differences are in the format @@ from-file-range to-file-range @@ [header].  
The from-file-range is in the form -\<start line\>,\<number of lines\>, and to-file-range is +\<start line\>,\<number of lines\>

In [None]:
#command = ["git", "log", "--oneline", "--name-only", "--since", current_version_date, "--until", desired_version_date, "--", extracted_function_file_location]
#command = ["git", "log", "--oneline", "--since", current_version_date, "--until", desired_version_date, "--", extracted_function_file_location]
#command = ["git", "log", "--oneline", "--since", current_version_date-timedelta(days=1), "--until", current_version_date, "--", extracted_function_file_location]
#command = ["git", "log", "--oneline", "--", extracted_function_file_location]

In [None]:
!git log --oneline -- {extracted_function_file_location}

In [None]:
!git log --since="3 hours ago" --pretty=oneline

In [None]:
!git log --name-only --date=local --since "20-06-2021 00:00:00" 

In [None]:
!git log --name-only --oneline --since "20-06-2021 00:00:00"

In [None]:
!git log --name-only --oneline --since "20-06-2021 00:00:00"
#--since "20-06-2021 00:00:00" -p -- scipy/special/_basic.py