## Preprocessing on the Method2test Datasets:

#### 1. preparing the dataset for src2abs
#### 2. running src2abs on dataset for abstraction
#### 3. selecting the samples for training the model

In [1]:
# Load the raw corpus: the focal methods (input) and the test cases (output),
# one sample per line in each text file.

with open("input.methods.txt", "r", encoding='utf-8') as f_read_in:
    input_methods = list(f_read_in)

with open("output.tests.txt", "r", encoding='utf-8') as f_output_tests:
    output_tests = list(f_output_tests)


for label, samples in (('length input is  : ', input_methods),
                       ('length output is : ', output_tests)):
    print(label, len(samples))

length input is  :  462093
length output is :  462093


In [2]:
# Keep only a prefix of both lists so the rest of the notebook can be checked quickly.

input_methods, output_tests = input_methods[:5000], output_tests[:5000]

for label, samples in (('length input is  : ', input_methods),
                       ('length output is : ', output_tests)):
    print(label, len(samples))

length input is  :  5000
length output is :  5000


## 1. preparing the dataset for src2abs


In [3]:
# Removing annotations and trailing garbage:
# (src2abs can't abstract code that contains annotations (@...), so they are
# found and removed from every method and every test case.)

import re

# Simple annotations followed by a space (like @Test, @Deprecated, ...).
_SIMPLE_ANNOTATION = re.compile(r'@\w+ ')
# Start of an annotation with arguments (like @SuppressWarnings("unchecked"), @Path("paths"), ...).
_CALL_ANNOTATION = re.compile(r'@\w+\(')


def _strip_call_annotations(code):
    """Remove every annotation that takes arguments from `code`.

    The closing parenthesis is found by counting nesting depth, so an
    annotation such as @Foo(bar(1)) is removed as a whole instead of being cut
    at the first ')'. An annotation whose parentheses never balance is left in
    place.
    """
    search_from = 0
    while True:
        match = _CALL_ANNOTATION.search(code, search_from)
        if match is None:
            return code

        depth = 0
        closing = -1
        # match.end() - 1 is the position of the annotation's opening '('.
        for pos in range(match.end() - 1, len(code)):
            if code[pos] == '(':
                depth += 1
            elif code[pos] == ')':
                depth -= 1
                if depth == 0:
                    closing = pos
                    break

        if closing == -1:
            # Unbalanced: skip this occurrence and keep scanning after it.
            search_from = match.end()
        else:
            code = code[:match.start()] + code[closing + 1:]
            search_from = match.start()


def _drop_trailing_garbage(code):
    """Replace everything between the last ';' and the next '}' with '; '.

    `code` is returned unchanged when it has no ';' or no '}' after the last ';'.
    """
    last_semicolon = code.rfind(';')
    if last_semicolon == -1:
        return code
    closing_brace = code.find('}', last_semicolon)
    if closing_brace == -1:
        return code
    return code[:last_semicolon] + "; " + code[closing_brace:]


def _clean_sample(code):
    """Apply the three clean-up steps to one sample and return the result."""
    code = _SIMPLE_ANNOTATION.sub('', code)
    code = _strip_call_annotations(code)
    return _drop_trailing_garbage(code)


# Slice assignment updates the existing lists in place, as the original loops did.
input_methods[:] = [_clean_sample(sample) for sample in input_methods]
output_tests[:] = [_clean_sample(sample) for sample in output_tests]

In [4]:
# Find whichever of public / private / protected appears first in each sample
# and remove everything before it. Samples without any of them are left
# untouched and their indexes are recorded.

import re

# \b stops identifiers such as `publicKey` or `privateField` from being
# mistaken for an access modifier.
_ACCESS_MODIFIER = re.compile(r'\b(?:public|private|protected)\b')


def _trim_to_first_modifier(samples):
    """Cut every sample in `samples` (in place) so it starts at its first access modifier.

    Returns the list of indexes of the samples that contain no access modifier.
    """
    missing = []
    for idx, code in enumerate(samples):
        match = _ACCESS_MODIFIER.search(code)
        if match is None:
            missing.append(idx)
        else:
            samples[idx] = code[match.start():]
    return missing


num_samples = len(input_methods)

# Each list is walked over its own length, so a length mismatch between the
# two lists can no longer raise IndexError or skip samples.
nothing_index_input = _trim_to_first_modifier(input_methods)
nothing_index_output = _trim_to_first_modifier(output_tests)


In [6]:
#print ('Number of the samples that do not contain public or private or protected (', len(nothing_index_input), '):')
#print (nothing_index_input, '\n')

#for i in nothing_index_input:
#    print(input_methods[i][:-1])

#print ('Number of the samples that do not contain public or private or protected (', len(nothing_index_output), '):')
#print (nothing_index_output, '\n')

#for i in nothing_index_output:
#    print(output_tests[i][:-1])

In [7]:
# Saving the preprocessed inputs and outputs in new text files.
# Samples are written exactly as they are held in memory (no separator is
# added), and the `with` block closes each file, so no explicit close() is needed.

with open("input.methods.preprocessed.txt", "w", encoding='utf-8') as f_input_preprocessed:
    f_input_preprocessed.writelines(input_methods)


with open("output.tests.preprocessed.txt", "w", encoding='utf-8') as f_output_preprocessed:
    f_output_preprocessed.writelines(output_tests)

## 2. running src2abs on dataset for abstraction

In [None]:
# Download src2abs, the tool used below to abstract the methods and the test cases:

! git clone https://github.com/micheletufano/src2abs.git

#### Attention Please:
Several syntax errors are not resolved by the previous steps.
So we need to save the indexes of all buggy samples in a file in order to remove them from the dataset.
To achieve this, we need to change the src2abs source code.

##### 1- find and open the AbstractorManager.java file.

##### 2- add the below imports:

import java.io.BufferedWriter;

import java.io.BufferedReader;

import java.nio.file.Path;

import java.io.FileWriter;

import java.io.FileReader;

import java.io.IOException;

##### 3- search for both occurrences of "Parsing ERROR!"

##### 4- add the code below inside the catch block, before the first "Parsing ERROR!" (src2abs for single):
    ////////-- My Code: --/////////////////////////////
        // Append a marker to parsError_single.txt for every sample that fails to parse.
        // try-with-resources closes the writer even when write() throws.
        //String log_single = e.toString().split("\\r?\\n")[0];
        try (BufferedWriter writer_single = new BufferedWriter(new FileWriter("parsError_single.txt", true))) {
            //BufferedReader reader_single = new BufferedReader(new FileReader(inputCodePath));
            //String buggy_sample_single = reader_single.readLine();
            //reader_single.close();

            //writer_single.write(buggy_sample_single + "\n");
            writer_single.write("Buggy Sample");
        } catch (IOException ee_single) {
            System.out.println("exception occoured" + ee_single);
        }
    ///////////////////////////////////////////////////
            
##### 5- add the code below inside the catch block, before the second "Parsing ERROR!" (src2abs for pair):
           
    ////////-- My Code: --/////////////////////////////
        // Append a marker to parsError_pair.txt for every pair that fails to parse;
        // the notebook looks for this marker after each src2abs run.
        // try-with-resources closes the writer even when write() throws.
        //String log_pair = e.toString().split("\\r?\\n")[0];
        try (BufferedWriter writer_pair = new BufferedWriter(new FileWriter("parsError_pair.txt", true))) {
            //BufferedReader reader_pair = new BufferedReader(new FileReader(inputCodePath1));
            //String buggy_sample_pair = reader_pair.readLine();
            //reader_pair.close();

            //writer_pair.write(buggy_sample_pair + "\n");
            writer_pair.write("Buggy Sample");
        } catch (IOException ee_pair) {
            System.out.println("exception occoured" + ee_pair);
        }
    ///////////////////////////////////////////////////
        

In [10]:
# Build src2abs. The working directory changes to src2abs/ here, and the later
# cells use paths relative to it (./target/..., ./idioms/...).
# NOTE(review): the *.preprocessed.txt files were written before this `cd`;
# confirm they are available inside src2abs/ before the cells below read them.
cd src2abs

# Clean, register the bundled lib/javalexer.jar with Maven under the
# coordinates given on the command line, then package the project.
! mvn clean
! mvn install:install-file -Dfile="lib/javalexer.jar" -DgroupId="edu.wm.cs" -DartifactId="javalexer" -Dversion="1" -Dpackaging="jar"
! mvn package

[INFO] Scanning for projects...
[INFO] 
[INFO] -------------------------< edu.wm.cs:src2abs >--------------------------
[INFO] Building src2abs 0.1
[INFO] --------------------------------[ jar ]---------------------------------
[INFO] 
[INFO] --- maven-clean-plugin:2.5:clean (default-clean) @ src2abs ---
[INFO] Deleting D:\3. PhD\2. Independent Study - Test Evaluation (Mona Rahimi)\Codes\Test_Local\src2abs\target
[INFO] ------------------------------------------------------------------------
[INFO] BUILD SUCCESS
[INFO] ------------------------------------------------------------------------
[INFO] Total time:  0.562 s
[INFO] Finished at: 2021-06-23T17:36:54+04:30
[INFO] ------------------------------------------------------------------------
[INFO] Scanning for projects...
[INFO] 
[INFO] -------------------------< edu.wm.cs:src2abs >--------------------------
[INFO] Building src2abs 0.1
[INFO] --------------------------------[ jar ]---------------------------------
[INFO] 
[INFO] --- m

#### Here you can run src2abs on a file containing just one sample (for checking):


In [None]:
# removing the previous generated file by src2abs:

import os
from pathlib import Path

if Path("input_methods_abstracted.txt").is_file():          
    os.remove("input_methods_abstracted.txt")
    
if Path("output_tests_abstracted.txt").is_file():          
    os.remove("output_tests_abstracted.txt")     

if Path("input_methods_abstracted.txt.map").is_file():          
    os.remove("input_methods_abstracted.txt.map")

    
# Run Single mode of src2abs:
#! java -jar target/src2abs-0.1-jar-with-dependencies.jar single method ./outputTEST.txt ./input_methods_abstracted.txt ./idioms/idioms-review.csv

# Run Method mode of src2abs:
! java -jar ./target/src2abs-0.1-jar-with-dependencies.jar pair method ./input.methods.testing.txt ./output.tests.testing.txt ./input_methods_abstracted.txt ./output_tests_abstracted.txt ./idioms/idioms-review.csv

In [356]:
# Display the result of the single-sample check: the raw and abstracted
# method, the raw and abstracted test, and the generated mapping file.

# IPython.display is the public import location for HTML and display.
from IPython.display import HTML, display

# Enabling the horizontal scrollbar:
display(HTML("<style>pre { white-space: pre !important; }</style>"))


def _first_line(path):
    """Return the first line of the UTF-8 text file `path`, newline included.

    Only that line is read; an empty file yields ''.
    """
    with open(path, "r", encoding='utf-8') as f:
        return f.readline()


# Prints results:
# rstrip("\n") removes the line ending only, so a sample without a trailing
# newline does not lose its last character.
raw_method = _first_line("input.methods.testing.txt")
print("Input Method:")
print(raw_method.rstrip("\n"), "\n")

abstracted_method = _first_line("input_methods_abstracted.txt")
print("Abstracted:")
print(abstracted_method, "\n\n")


raw_test = _first_line("output.tests.testing.txt")
print("Output Test:")
print(raw_test.rstrip("\n"), "\n")

abstracted_test = _first_line("output_tests_abstracted.txt")
print("Abstracted:")
print(abstracted_test)

print("\n--------------------------------------------------------------------------------------------------------------\n")

# Print results in map file:

with open("input_methods_abstracted.txt.map", "r", encoding='utf-8') as f:
    print("Map:\n")
    for i, line in enumerate(f):
        print(i,"=>",line)

Input Method:
public Stream<OwnerDTO> listUsersOwners(  String username, Principal principal) { UserInfo user = this.fetchUserByUsername(username); Collection<? extends OwnerInfo> owners = this.userService.getAccessibleOwners(username); if (owners != null) { return owners.stream() .map(this::resolveOwner) .map(this.modelTranslator.getStreamMapper(Owner.class, OwnerDTO.class)); } return null; } 

Abstracted:
public TYPE_1 < TYPE_2 > METHOD_1 ( String VAR_1 , TYPE_3 VAR_2 ) { TYPE_4 user = this . METHOD_2 ( VAR_1 ) ; TYPE_5 < ? extends TYPE_6 > VAR_3 = this . VAR_4 . METHOD_3 ( VAR_1 ) ; if ( VAR_3 != null ) { return VAR_3 . METHOD_4 ( ) . METHOD_5 ( this : : METHOD_6 ) . METHOD_5 ( this . VAR_5 . METHOD_7 ( VAR_6 class , VAR_7 class ) ) ; } return null ; } 


Output Test:
public void testListAllOwners() { User user = new User(); user.setUsername("dummyuser" + TestUtil.randomInt()); user.setPassword("password"); this.userCurator.create(user); Owner owner1 = this.createOwner(); Owner owne

### Running src2abs on the dataset:

In [None]:
import os
from pathlib import Path
# Enabling the horizontal scrollbar:
from IPython.core.display import HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))


# Read preprocessed input(methods) and output(tests) files:
with open("input.methods.preprocessed.txt", "r", encoding='utf-8') as f_read_in:
    all_input = f_read_in.readlines()
with open("output.tests.preprocessed.txt", "r", encoding='utf-8') as f_read_out:
    all_output = f_read_out.readlines()    

# decreasing the number of samples to test the functionality:
all_input  = all_input[:1300]    
all_output = all_output[:1300]

# Since, the reslut of src2abs for all samples are store in just one line, We have to calulate 
# the length of the abstracted version of each file to extract them from the resut file.

# So First we have to run src2abs seperately on each sample and calculate the number of words
# in result. Then after runing src2abs on the file which contains all sample, we will extract
# each the result of each sample based on its length. 

#why we have to run again src2abs on whole sample? because of the map file. 

len_inputs  = []    # the length of each sample in input file
len_outputs = []    # the length of each sample in output file
buggy_samples = []  # the indexes of samples with Parsing Error.
 
number_all_input = len(all_input)

for i in range(number_all_input):
    
    # we extract each sample one by one to run src2abs on it separately to calculate their length:
    with open("temp_in.txt", "w", encoding='utf-8') as f_write_in: 
        f_write_in.write(all_input[i])
    f_write_in.close()
    with open("temp_out.txt", "w", encoding='utf-8') as f_write_out:
        f_write_out.write(all_output[i])
    f_write_out.close()    
    
    # removing the previous generated file by src2abs:
    if Path("temp_input_methods_abstracted.txt").is_file():          
        os.remove("temp_input_methods_abstracted.txt")

    if Path("temp_output_tests_abstracted.txt").is_file():          
        os.remove("temp_output_tests_abstracted.txt")     

    if Path("temp_input_methods_abstracted.txt.map").is_file():          
        os.remove("temp_input_methods_abstracted.txt.map")    
    
    # run src2abs:
    print ('\nSample ', i, ':') # for debuging
    ! java -jar ./target/src2abs-0.1-jar-with-dependencies.jar pair method ./temp_in.txt ./temp_out.txt ./temp_input_methods_abstracted.txt ./temp_output_tests_abstracted.txt ./idioms/idioms-review.csv
    
    # counts number of words:
    # we didn't use of len method, since here most of the file has small number in map file like VAR_1 (len = 4) but in next step it can be VAR_78 (len = 6)
    with open("temp_input_methods_abstracted.txt", "r", encoding='utf-8') as f_read_in_abs:
        temp_input = f_read_in_abs.readlines()[0]
        len_inputs.append(len(temp_input.split()))
    f_read_in_abs.close()
    
    with open("temp_output_tests_abstracted.txt", "r", encoding='utf-8') as f_read_out_abs:
        temp_output = f_read_out_abs.readlines()[0]
        len_outputs.append(len(temp_output.split()))
    f_read_out_abs.close()
    
      
    # if the sample faced Parsing Error store the index:   
    f_pars_eroro = open('parsError_pair.txt', 'r')
    str_pars_eroro = f_pars_eroro.read();
    f_pars_eroro.close()
    
    if (str_pars_eroro.find("Buggy Sample") != -1):
        buggy_samples.append(i) 

    open('parsError_pair.txt', 'w').close()
     
    

In [12]:
# Results of the per-sample src2abs pass.

_SEPARATOR = "------------------------------------------------------------------------"


def _report(title, values):
    """Print the number of items in `values` and its first item.

    "<none>" is printed as the example when `values` is empty (for instance
    when no sample had a parsing error), instead of raising IndexError.
    """
    print(title + ":\n", len(values))
    print("Example:\n", values[0] if values else "<none>")


# samples with Parsing Error:
_report("Number of Parsing Error", buggy_samples)
print(_SEPARATOR)

# length of each sample in input file:
_report("Number of lengths calculated for the input samples", len_inputs)
print(_SEPARATOR)

# length of each sample in output file:
_report("Number of lengths calculated for the output samples", len_outputs)
print(_SEPARATOR)

# all input samples:
_report("Number of input samples", all_input)
print(_SEPARATOR)

# all output samples:
_report("Number of output samples", all_output)

Number of Parsing Error:
 64
Example:
 46
------------------------------------------------------------------------
Number of lengths calculated for the input samples:
 1300
Example:
 99
------------------------------------------------------------------------
Number of lengths calculated for the output samples:
 1300
Example:
 114
------------------------------------------------------------------------
Number of input samples:
 1300
Example:
 public boolean equals(Object obj) { if (this == obj) { return true; } if (obj == null) return false; if (getClass() != obj.getClass()) return false; SortedIntArray other = (SortedIntArray) obj; if (length != other.length) return false; for (int i = 0; i < length; i++) { if (a[i] != other.a[i]) return false; } return true; }

------------------------------------------------------------------------
Number of output samples:
 1300
Example:
 public void testEquals() { SortedIntArray s1 = new SortedIntArray(1, 3, 7); SortedIntArray s2 = new SortedIntArr

In [13]:
# removing samples with Parsing ERROR! from the dataset:

# Delete from the highest index down so that a deletion never shifts a
# position that still has to be removed; sorted(set(...)) makes this hold even
# if the list is unordered or contains an index twice.
for i in sorted(set(buggy_samples), reverse=True):
    del len_inputs[i]
    del len_outputs[i]
    del all_input[i]
    del all_output[i]

# The indexes are now consumed. Emptying the list makes a re-run of this cell
# a no-op; otherwise it would delete the same positions again and silently
# drop valid samples.
buggy_samples.clear()

In [14]:
# saving the preprocessed inputs and outputs without the Parsing Error.
# This overwrites the *.preprocessed.txt files that the per-sample cell read.
# The `with` block closes each file, so no explicit close() is needed.

with open("input.methods.preprocessed.txt", "w", encoding='utf-8') as f_input_preprocessed_p_e:
    f_input_preprocessed_p_e.writelines(all_input)


with open("output.tests.preprocessed.txt", "w", encoding='utf-8') as f_output_preprocessed_p_e:
    f_output_preprocessed_p_e.writelines(all_output)

In [15]:
# runing src2abs on whole samples:

# removing the previous generated file by src2abs:
if Path("input_methods_abstracted.txt").is_file():          
    os.remove("input_methods_abstracted.txt")

if Path("output_tests_abstracted.txt").is_file():          
    os.remove("output_tests_abstracted.txt")     

if Path("output_tests_abstracted.txt.map").is_file():          
    os.remove("output_tests_abstracted.txt.map")    
    

! java -jar ./target/src2abs-0.1-jar-with-dependencies.jar pair method ./input.methods.preprocessed.txt ./output.tests.preprocessed.txt ./input_methods_abstracted.txt ./output_tests_abstracted.txt ./idioms/idioms-review.csv

Source Code Abstracted successfully!
Abstracted Code: ./input_methods_abstracted.txt and ./output_tests_abstracted.txt
Mapping: ./input_methods_abstracted.txt.map


In [16]:
# Load the first line of each file that src2abs produced for the whole dataset:

with open("input_methods_abstracted.txt", "r", encoding='utf-8') as f_read_in_final:
    all_input_final = list(f_read_in_final)[0]

with open("output_tests_abstracted.txt", "r", encoding='utf-8') as f_read_out_final:
    all_output_final = list(f_read_out_final)[0]

In [18]:
# extracting the result of each sample from the generated result for all samples:

splited_input = all_input_final.split()
splited_output = all_output_final.split()


def _write_samples(path, tokens, lengths):
    """Write one sample per line to `path`.

    `tokens` is cut into consecutive runs, the k-th run holding `lengths[k]`
    tokens; each run is joined with single spaces. A warning is printed when
    the lengths do not add up to the number of tokens, because every sample
    after the first mismatch would then be cut at the wrong place.
    """
    expected = sum(lengths)
    if expected != len(tokens):
        print("WARNING:", path, "- per-sample lengths add up to", expected,
              "tokens but the abstracted result has", len(tokens))

    with open(path, "w", encoding='utf-8') as f_abs_final:
        start = 0
        for length in lengths:
            f_abs_final.write(' '.join(tokens[start:start + length]) + '\n')
            start += length


# generating new text file contain the abstracted version of the samples: (each sample in one line)
_write_samples("input_abs_final.txt", splited_input, len_inputs)
_write_samples("output_abs_final.txt", splited_output, len_outputs)

In [19]:
# Load the per-sample abstracted files back, one list entry per line:

with open("input_abs_final.txt", "r", encoding='utf-8') as f_show_in_final:
    show_all_input_final = list(f_show_in_final)

with open("output_abs_final.txt", "r", encoding='utf-8') as f_show_out_final:
    show_all_output_final = list(f_show_out_final)

In [24]:
# Preview the first 10 abstracted methods:
show_all_input_final[:10]

['public boolean equals ( TYPE_1 VAR_1 ) { if ( this == VAR_1 ) { return true ; } if ( VAR_1 == null ) return false ; if ( METHOD_1 ( ) != VAR_1 . METHOD_1 ( ) ) return false ; TYPE_2 VAR_2 = ( TYPE_2 ) VAR_1 ; if ( length != VAR_2 . length ) return false ; for ( int i = 0 ; i < length ; i ++ ) { if ( a [ i ] != VAR_2 . a [ i ] ) return false ; } return true ; }\n',
 'public TYPE_3 METHOD_2 ( TYPE_4 VAR_3 , TYPE_5 VAR_4 ) { METHOD_3 ( VAR_3 ) ; VAR_5 = VAR_4 ; final TYPE_3 VAR_6 = new TYPE_3 ( VAR_7 ) ; TYPE_6 VAR_8 = METHOD_4 ( ) ; try ( TYPE_7 VAR_9 = VAR_5 . METHOD_5 ( ) ) { if ( VAR_10 ) { VAR_5 . METHOD_6 ( true ) ; } METHOD_7 ( VAR_4 ) ; VAR_11 = VAR_5 . METHOD_8 ( VAR_5 . METHOD_9 ( VAR_12 ) . METHOD_10 ( ) ) ; VAR_13 = VAR_14 . METHOD_11 ( ) ; VAR_15 = new TYPE_8 ( VAR_5 , VAR_16 , VAR_17 , VAR_18 , VAR_19 , VAR_20 , VAR_21 , VAR_22 , VAR_23 , VAR_24 , VAR_3 . METHOD_12 ( ) , VAR_3 . METHOD_13 ( ) , VAR_25 , VAR_26 , VAR_27 , VAR_28 , VAR_8 , VAR_29 , VAR_30 , VAR_31 , VAR_32 ,

In [22]:
# Preview the first 10 abstracted test cases:
show_all_output_final[:10]

['public void METHOD_4463 ( ) { TYPE_2 VAR_1437 = new TYPE_2 ( 1 , INT_8 , INT_43 ) ; TYPE_2 VAR_1438 = new TYPE_2 ( INT_8 , 1 , INT_43 ) ; TYPE_2 VAR_680 = new TYPE_2 ( 1 , INT_8 ) ; assertEquals ( VAR_1437 . METHOD_2002 ( ) , VAR_1438 . METHOD_2002 ( ) ) ; METHOD_4464 ( VAR_1437 . equals ( VAR_1438 ) ) ; METHOD_4464 ( VAR_1438 . equals ( VAR_1437 ) ) ; METHOD_4465 ( VAR_1437 . equals ( VAR_680 ) ) ; METHOD_4465 ( VAR_680 . equals ( VAR_1437 ) ) ; METHOD_4465 ( VAR_1437 . METHOD_2002 ( ) == VAR_680 . METHOD_2002 ( ) ) ; }\n',
 'public void METHOD_4466 ( ) throws Exception { TYPE_2832 . METHOD_4467 ( ) . METHOD_1432 ( VAR_4565 ) ; METHOD_4468 ( VAR_4566 ) ; final TYPE_221 VAR_4567 = new TYPE_223 ( TYPE_2833 . METHOD_3232 ( STRING_1261 ) ) ; METHOD_4469 ( VAR_4568 . METHOD_89 ( METHOD_4470 ( ) , METHOD_1137 ( STRING_1262 ) ) ) . METHOD_4471 ( TYPE_36 . status ( VAR_4569 ) . METHOD_1862 ( VAR_4567 ) . METHOD_256 ( ) ) ; VAR_5 . METHOD_4472 ( out ) ; final TYPE_3 VAR_79 = VAR_512 . METHOD

## 3. selecting the samples for training the model

In [25]:
# Imports for the sample-selection part of the notebook.
# NOTE(review): none of the keras, numpy, matplotlib or sys names is used by a
# cell visible in this section; confirm they are needed further on.
import os, sys
from keras.models import Model
from keras.layers import Input, LSTM, GRU, Dense, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
#from tensorflow.keras.utils import to_categorical
from keras.utils import to_categorical
import numpy as np
import matplotlib.pyplot as plt
import nltk
# Fetch the 'punkt' data package for NLTK (the cells below call nltk.word_tokenize).
nltk.download('punkt')

# Enabling the horizontal scrollbar:
from IPython.core.display import HTML
display(HTML("<style>pre { white-space: pre !important; }</style>"))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Totoro\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [26]:
# Load the abstracted methods and tests and set up the (still empty) lists
# that hold the target sentences with their start / end markers.

input_sentences = []
output_sentences = []
output_sentences_inputs = []

with open('input_abs_final.txt', 'r', encoding='utf-8') as f:
    input_lines = f.read().split('\n')
input_sentences = input_lines[:-1]  # last element is ''


with open('output_abs_final.txt', 'r', encoding='utf-8') as f:
    target_lines = f.read().split('\n')
target_texts = target_lines[:-1]  # last element is ''


In [27]:
# adding the start and the end words for training the model:
# 'eeooss' is appended to every target sentence and 'ssooss' is put in front
# of it. Both lists are rebuilt from target_texts instead of being appended
# to, so running this cell again does not duplicate the samples.

output_sentences = [s + ' eeooss' for s in target_texts]
output_sentences_inputs = ['ssooss ' + s for s in target_texts]

print("num samples input:", len(input_sentences))
print("num samples output:", len(output_sentences))
print("num samples output input:", len(output_sentences_inputs))

num samples input: 1236
num samples output: 1236
num samples output input: 1236


In [28]:
# Show the first sample of each of the three lists:

for label, sentences in (('input_sentences         = ', input_sentences),
                         ('output_sentences        = ', output_sentences),
                         ('output_sentences_inputs = ', output_sentences_inputs)):
    print(label, sentences[0])

input_sentences         =  public boolean equals ( TYPE_1 VAR_1 ) { if ( this == VAR_1 ) { return true ; } if ( VAR_1 == null ) return false ; if ( METHOD_1 ( ) != VAR_1 . METHOD_1 ( ) ) return false ; TYPE_2 VAR_2 = ( TYPE_2 ) VAR_1 ; if ( length != VAR_2 . length ) return false ; for ( int i = 0 ; i < length ; i ++ ) { if ( a [ i ] != VAR_2 . a [ i ] ) return false ; } return true ; }
output_sentences        =  public void METHOD_4463 ( ) { TYPE_2 VAR_1437 = new TYPE_2 ( 1 , INT_8 , INT_43 ) ; TYPE_2 VAR_1438 = new TYPE_2 ( INT_8 , 1 , INT_43 ) ; TYPE_2 VAR_680 = new TYPE_2 ( 1 , INT_8 ) ; assertEquals ( VAR_1437 . METHOD_2002 ( ) , VAR_1438 . METHOD_2002 ( ) ) ; METHOD_4464 ( VAR_1437 . equals ( VAR_1438 ) ) ; METHOD_4464 ( VAR_1438 . equals ( VAR_1437 ) ) ; METHOD_4465 ( VAR_1437 . equals ( VAR_680 ) ) ; METHOD_4465 ( VAR_680 . equals ( VAR_1437 ) ) ; METHOD_4465 ( VAR_1437 . METHOD_2002 ( ) == VAR_680 . METHOD_2002 ( ) ) ; } eeooss
output_sentences_inputs =  ssooss public void MET

In [29]:
# tokenizing the samples to calculate the number of words per sample.
# Every sentence is tokenized once and the tokens are reused for both the
# vocabulary and the integer sequences.


def _tokenize_all(sentences):
    """Return the nltk.word_tokenize() result for every sentence, in order."""
    return [nltk.word_tokenize(sentence) for sentence in sentences]


def _build_vocab(tokenized_sentences):
    """Map every distinct word to an integer id.

    Ids start at 1 and are handed out in order of first appearance.
    """
    vocab = {}
    for words in tokenized_sentences:
        for word in words:
            if word not in vocab:
                vocab[word] = len(vocab) + 1
    return vocab


def _encode(tokenized_sentences, vocab):
    """Replace every word by its id from `vocab`, sentence by sentence."""
    return [[vocab[word] for word in words] for words in tokenized_sentences]


# --- inputs (methods) ---
tokenized_inputs = _tokenize_all(input_sentences)
word2idx_inputs = _build_vocab(tokenized_inputs)
input_integer_seq = _encode(tokenized_inputs, word2idx_inputs)

print('Total unique words in the input: %s' % len(word2idx_inputs))

max_input_len = max(len(sen) for sen in input_integer_seq)
print("Length of longest sentence in input: %g" % max_input_len)

# --- outputs (tests) ---
# One shared vocabulary covers both variants of the target sentences; the
# words of output_sentences are numbered first, as before.
tokenized_outputs = _tokenize_all(output_sentences)
tokenized_outputs_inputs = _tokenize_all(output_sentences_inputs)
word2idx_outputs = _build_vocab(tokenized_outputs + tokenized_outputs_inputs)

#word2idx_outputs["<sos>"] = counter + 1
#word2idx_outputs["<eos>"] = counter + 2

#  s + ' <eos>' -> (eeooss)
output_integer_seq = _encode(tokenized_outputs, word2idx_outputs)

# '<sos> ' + s -> (ssooss)
output_input_integer_seq = _encode(tokenized_outputs_inputs, word2idx_outputs)

print('Total unique words in the output: %s' % len(word2idx_outputs))

num_words_output = len(word2idx_outputs) + 1
max_out_len = max(len(sen) for sen in output_integer_seq)
print("Length of longest sentence in the output: %g" % max_out_len)

Total unique words in the input: 13457
Length of longest sentence in input: 1669
Total unique words in the output: 13856
Length of longest sentence in the output: 928


In [30]:
# checking the results: one encoded sample, its text, and a preview of the vocabulary.

print(input_integer_seq[10])
print("-----------------------------------------------------------")
print(input_sentences[10])
print("-----------------------------------------------------------")
# The vocabulary has thousands of entries; printing all of it floods the
# output, so only the first entries (in insertion order) are shown.
VOCAB_PREVIEW_SIZE = 20
print(dict(list(word2idx_inputs.items())[:VOCAB_PREVIEW_SIZE]))

[1, 317, 367, 4, 317, 368, 7, 8, 25, 4, 26, 27, 20, 28, 14, 27, 29, 368, 21, 122, 4, 7, 14, 27, 30, 7, 8, 5, 369, 20, 368, 21, 320, 4, 27, 7, 14, 9, 4, 369, 281, 150, 7, 8, 368, 21, 93, 4, 27, 38, 10, 21, 370, 21, 371, 4, 4, 150, 7, 369, 7, 7, 14, 15, 157, 9, 4, 369, 281, 5, 32, 33, 7, 8, 5, 32, 33, 372, 20, 4, 5, 32, 33, 7, 369, 14, 9, 4, 372, 21, 24, 123, 28, 288, 288, 372, 32, 28, 33, 281, 150, 7, 8, 372, 32, 28, 33, 20, 10, 21, 370, 21, 371, 4, 4, 150, 7, 372, 32, 28, 33, 7, 14, 15, 15, 15, 12, 368, 14, 15]
-----------------------------------------------------------
public List METHOD_92 ( List VAR_104 ) { for ( int i = 0 ; i < VAR_104 . size ( ) ; i ++ ) { TYPE_1 result = VAR_104 . get ( i ) ; if ( result instanceof String ) { VAR_104 . set ( i , this . VAR_105 . METHOD_93 ( ( String ) result ) ) ; } else if ( result instanceof TYPE_1 [ ] ) { TYPE_1 [ ] row = ( TYPE_1 [ ] ) result ; if ( row . length > 0 && row [ 0 ] instanceof String ) { row [ 0 ] = this . VAR_105 . METHOD_93 ( (

### Plotting the results: 

In [31]:
# Histograms of the number of tokens per method and per test:

import pandas as pd
import altair as alt

# One row per sample: token count of the method and of its test.
length_of_sentences = pd.DataFrame({
    'Method': [len(sen) for sen in input_integer_seq ],
    'Test'  : [len(sen) for sen in output_integer_seq],
})


alt.data_transformers.disable_max_rows()


def _length_histogram(column, step):
    """Build a 700x300 bar chart of the row counts of `column`, binned with the given step."""
    binned_length = alt.X(column + ':Q', bin=alt.Bin(maxbins=200, step=step))
    row_count = alt.Y('count()', stack=None)
    return alt.Chart(length_of_sentences).mark_bar().encode(
        binned_length,
        row_count,
    ).properties(
        width=700,
        height=300
    )


Method = _length_histogram('Method', 35)
display(Method)

Test = _length_histogram('Test', 30)
display(Test)

In [32]:
# Binned 2-D chart relating the length of each method to the length of its test:

method_axis = alt.X('Method', bin=alt.Bin(maxbins=200))
test_axis = alt.Y('Test', bin=alt.Bin(maxbins=200))
sample_count = alt.Color('count()')

alt.Chart(length_of_sentences).mark_rect().encode(
    method_axis,
    test_axis,
    sample_count
).interactive().properties(
    width=400,
    height=400
)

In [34]:
# results: the table of token counts per sample (columns 'Method' and 'Test').

length_of_sentences

Unnamed: 0,Method,Test
0,102,115
1,1481,108
2,60,150
3,36,83
4,91,28
...,...,...
1231,19,116
1232,154,122
1233,119,36
1234,468,191


In [35]:
# selecting short samples:
# A sample is kept only when both its method and its test are shorter than
# the limits below (in tokens).

MAX_METHOD_LEN = 210
MAX_TEST_LEN = 280

is_short = (
    (length_of_sentences['Method'] < MAX_METHOD_LEN)
    & (length_of_sentences['Test'] < MAX_TEST_LEN)
)
small_sentences = length_of_sentences[is_short]
small_sentences

Unnamed: 0,Method,Test
0,102,115
2,60,150
3,36,83
4,91,28
6,10,46
...,...,...
1230,150,278
1231,19,116
1232,154,122
1233,119,36


In [36]:
# Store the row labels of the selected (short) samples, one per line:

small_sentences_index = small_sentences.index.values

with open("small_sentences_index.txt", "w") as index_file:
    index_file.writelines(str(row) + '\n' for row in small_sentences_index)

### Removing long sentences: 

In [37]:
# Load the abstracted methods and tests again, one sample per line:

with open('input_abs_final.txt', 'r', encoding='utf-8') as f:
    input_lines = f.read().split('\n')
input_sentences = input_lines[:-1]  # last element is ''


with open('output_abs_final.txt', 'r', encoding='utf-8') as f:
    target_lines = f.read().split('\n')
target_sentences = target_lines[:-1]  # last element is ''


In [38]:
# Load the file with the indexes of the short samples (kept as strings here):

small_sentences_index = []

with open("small_sentences_index.txt", "r", encoding='utf-8') as f:
    index_lines = f.read().split('\n')
small_sentences_index = index_lines[:-1]  # last element is ''

In [39]:
# Keep only the samples whose index is listed in small_sentences_index:

selected_pairs = [
    (input_sentences[int(i)], target_sentences[int(i)])
    for i in small_sentences_index
]

input_sentences = [method for method, _ in selected_pairs]
target_sentences = [test for _, test in selected_pairs]

del selected_pairs

print("num samples input:", len(input_sentences))
print("num samples output:", len(target_sentences))


num samples input: 1038
num samples output: 1038


In [40]:
# Finally saving the last version of the method and test samples, one per line.
# The `with` block closes each file, so no explicit close() is needed.

with open("input_sentences_final.txt", "w", encoding='utf-8') as f_input_sentences:
    f_input_sentences.writelines(str(row) + '\n' for row in input_sentences)

with open("target_sentences_final.txt", "w", encoding='utf-8') as f_target_sentences:
    f_target_sentences.writelines(str(row) + '\n' for row in target_sentences)

In [48]:
# Final version of input samples: (input_sentences_final.txt)

print("Number of input samples (methods):\n", len(input_sentences), "\n")
# Slicing shows up to 10 samples and does not fail when fewer are left.
for sentence in input_sentences[:10]:
    print(sentence)

Number of input samples (methods):
 1038 

public boolean equals ( TYPE_1 VAR_1 ) { if ( this == VAR_1 ) { return true ; } if ( VAR_1 == null ) return false ; if ( METHOD_1 ( ) != VAR_1 . METHOD_1 ( ) ) return false ; TYPE_2 VAR_2 = ( TYPE_2 ) VAR_1 ; if ( length != VAR_2 . length ) return false ; for ( int i = 0 ; i < length ; i ++ ) { if ( a [ i ] != VAR_2 . a [ i ] ) return false ; } return true ; }
public long METHOD_52 ( ) { METHOD_53 ( VAR_73 . METHOD_54 ( ) ) ; long VAR_74 = VAR_75 . METHOD_52 ( ) ; if ( VAR_74 == 0 ) { VAR_74 = VAR_76 . METHOD_52 ( ) ; } double VAR_77 = VAR_76 . METHOD_55 ( ) ; return TYPE_34 . METHOD_56 ( VAR_74 * VAR_77 ) ; }
public void METHOD_57 ( TYPE_35 < ? > VAR_78 , TYPE_36 < ? > VAR_79 , Exception e ) { TYPE_37 event = METHOD_58 ( VAR_78 , e ) ; METHOD_59 ( event ) ; }
protected void METHOD_60 ( TYPE_38 token ) { if ( ! METHOD_61 ( token . METHOD_62 ( ) , VAR_80 . METHOD_62 ( ) ) || ! METHOD_61 ( token . METHOD_63 ( ) , VAR_80 . METHOD_63 ( ) ) || ! ME

In [49]:
# Final version of output samples: (target_sentences_final.txt)

print("Number of output samples (test cases):\n", len(target_sentences), "\n")
# Slicing shows up to 10 samples and does not fail when fewer are left.
for sentence in target_sentences[:10]:
    print(sentence)

Number of output samples (test cases):
 1038 

public void METHOD_4463 ( ) { TYPE_2 VAR_1437 = new TYPE_2 ( 1 , INT_8 , INT_43 ) ; TYPE_2 VAR_1438 = new TYPE_2 ( INT_8 , 1 , INT_43 ) ; TYPE_2 VAR_680 = new TYPE_2 ( 1 , INT_8 ) ; assertEquals ( VAR_1437 . METHOD_2002 ( ) , VAR_1438 . METHOD_2002 ( ) ) ; METHOD_4464 ( VAR_1437 . equals ( VAR_1438 ) ) ; METHOD_4464 ( VAR_1438 . equals ( VAR_1437 ) ) ; METHOD_4465 ( VAR_1437 . equals ( VAR_680 ) ) ; METHOD_4465 ( VAR_680 . equals ( VAR_1437 ) ) ; METHOD_4465 ( VAR_1437 . METHOD_2002 ( ) == VAR_680 . METHOD_2002 ( ) ) ; }
public void METHOD_4474 ( ) throws Exception { String query = STRING_1263 ; TYPE_2834 VAR_4570 = new TYPE_2834 ( query ) ; TYPE_2835 VAR_4571 = new TYPE_2836 ( null , STRING_1264 , VAR_4570 . METHOD_4475 ( ) . METHOD_4476 ( STRING_1264 ) , TYPE_1081 . METHOD_1720 ( ) ) ; TYPE_2835 VAR_4572 = new TYPE_2836 ( null , STRING_1265 , VAR_4570 . METHOD_4475 ( ) . METHOD_4476 ( STRING_1265 ) , TYPE_1081 . METHOD_1720 ( ) ) ; TYPE_

## Now you can train your model (^_^)