In [1]:
import os
import numpy as np
import torch
import pandas as pd
from transformers import AutoTokenizer, RobertaTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
device = torch.device("cpu")

In [3]:
d_path = os.path.join("..", "..", "dataset", "token-classes-dataset")

In [4]:
# list of partial dataframes
dfs = []
# collects partial dataframes from oracles
for file_name in os.listdir(d_path):
    print(file_name)
    df = pd.read_json(os.path.join(d_path,  file_name))
    dfs.append(df)
# concat partial dataframes
df_dataset = pd.concat(dfs)
# drop column id (it is not relevant for training the model)
df_dataset = df_dataset.drop(['id'], axis=1)
# map empty cells to empty strings
df_dataset.fillna('', inplace=True)
# specify the type of each column in the dataset
df_dataset = df_dataset.astype({
    'label': 'bool',
    'oracleId': 'int64',
    'oracleType': 'string',
    'projectName': 'string',
    'packageName': 'string',
    'className': 'string',
    'javadocTag': 'string',
    'methodJavadoc': 'string',
    'methodSourceCode': 'string',
    'classJavadoc': 'string',
    'classSourceCode': 'string',
    'oracleSoFar': 'string',
    'token': 'string',
    'tokenClass': 'string',
    'tokenInfo': 'string'
})

150_oracle_datapoints_gs-core-1.3_7-augmented.json
839_oracle_datapoints_guava-19.0_1-augmented.json
687_oracle_datapoints_commons-collections4-4.1_2.json
498_oracle_datapoints_commons-math3-3.6.1_93.json
1085_oracle_datapoints_commons-collections4-4.1_16-augmented.json
578_oracle_datapoints_commons-math3-3.6.1_10-augmented.json
898_oracle_datapoints_commons-collections4-4.1_9.json
651_oracle_datapoints_commons-math3-3.6.1_129.json
955_oracle_datapoints_commons-collections4-4.1_10-augmented.json
264_oracle_datapoints_commons-math3-3.6.1_2.json
293_oracle_datapoints_commons-collections4-4.1_37.json
683_oracle_datapoints_commons-collections4-4.1_12-augmented.json
389_oracle_datapoints_commons-math3-3.6.1_53.json
974_oracle_datapoints_commons-math3-3.6.1_134.json
263_oracle_datapoints_commons-math3-3.6.1_2.json
315_oracle_datapoints_commons-math3-3.6.1_48.json
831_oracle_datapoints_commons-collections4-4.1_7-augmented.json
471_oracle_datapoints_guava-19.0_2-augmented.json
424_oracle_datap

In [5]:
df_dataset.shape

(1256442, 16)

In [6]:
df_dataset_copy = df_dataset.copy()

In [7]:
len(df_dataset_copy)

1256442

In [8]:
# Step 1: Remove rows with "oracleSoFar" as an empty string and "token" equals ";"
df_empty_semicolon = df_dataset_copy[(df_dataset_copy['oracleSoFar'] == '') & (df_dataset_copy['token'] == ';')]

In [18]:
df_dataset_copy.dtypes

label                  bool
oracleId              int64
oracleType           string
projectName          string
packageName          string
className            string
javadocTag           string
methodJavadoc        string
methodSourceCode     string
classJavadoc         string
classSourceCode      string
oracleSoFar          string
tokenClassesSoFar    object
token                string
tokenClass           string
tokenInfo            string
dtype: object

In [19]:
# Step 1: Remove rows with "oracleSoFar" as an empty string and "token" equals ";"
df_empty_semicolon_true = df_dataset[(df_dataset['oracleSoFar'] == '') & (df_dataset['token'] == ';') & (df_dataset['label'] == True)]

In [35]:
df_empty_semicolon_true_javadoc_empty = df_dataset[(df_dataset['oracleSoFar'] == '') & (df_dataset['token'] == ';') & (df_dataset['label'] == True) & (df_dataset['javadocTag'] == '')]

In [36]:
len(df_empty_semicolon_true_javadoc_empty)

15751

In [10]:
df_not_empty_semicolon_true = df_dataset[(df_dataset['oracleSoFar'].str.strip() != '') | (df_dataset['token'] != ';') | (df_dataset['label'] != True)]

In [21]:
print(f"Length df_empty_semicolon: {len(df_empty_semicolon)}")
print(f"Length df_empty_semicolon_true: {len(df_empty_semicolon_true)}")
print(f"Length df_not_empty_semicolon_true: {len(df_not_empty_semicolon_true)}")
assert (len(df_empty_semicolon_true) + len(df_not_empty_semicolon_true)) == len(df_dataset_copy)

Length df_empty_semicolon: 161051
Length df_empty_semicolon_true: 142344
Length df_not_empty_semicolon_true: 1114098


In [30]:
# Step 2: Group the rows by "oracleId" and randomly select one row from each group
selected_rows = df_empty_semicolon_true.groupby('oracleId').apply(lambda x: x.sample(1, random_state=1))

In [32]:
print(f"Length selected rows: {len(selected_rows)}")

Length selected rows: 23373


In [33]:
# Step 4: Reintegrate the selected rows into the original DataFrame
df_final = pd.concat([df_not_empty_semicolon_true, selected_rows])

# Optional: Reset the index of the DataFrame
df_final = df_final.reset_index(drop=True)

In [34]:
len(selected_rows[selected_rows["javadocTag"]==""])

1811

In [27]:
print(f"Length final dataset: {len(df_final)}")

Length final dataset: 1137471


In [31]:
import random

random.seed(1)

df_filtered = df_dataset[(df_dataset['tokenClass'] == 'Semicolon') & (df_dataset['oracleSoFar'] == '') & (df_dataset['label'] == True)]

df_filtered_oracle_ids = df_filtered["oracleId"].unique().tolist()

# Randomly selecting 3000 rows
df_random = random.sample(df_filtered_oracle_ids, 20000) # Adjust the random_state as desired

# Removing the other 20000 rows
df_remaining = df_dataset[~((df_dataset['oracleId'].isin(df_random)))]

In [32]:
df_method_source_code = df_dataset.copy()

df_method_source_code['methodSourceCode'] = df_method_source_code['methodSourceCode'].str.split('{').str[0]

In [33]:
df_method_source_code.head(1000)

Unnamed: 0,label,oracleId,oracleType,projectName,packageName,className,javadocTag,methodJavadoc,methodSourceCode,classJavadoc,classSourceCode,oracleSoFar,tokenClassesSoFar,token,tokenClass,tokenInfo
0,False,2900,PRE,commons-math3-3.6.1,org.apache.commons.math3.ode.nonstiff,GraggBulirschStoerIntegrator,@param maximalOrder maximal order in the extra...,/** Set the order control parameters.  * <p...,public void setOrderControl(final int maximalO...,/**  * This class implements a Gragg-Bulirsch...,/*  * Licensed to the Apache Software Foundati...,,[],,MethodArgument,
1,False,2900,PRE,commons-math3-3.6.1,org.apache.commons.math3.ode.nonstiff,GraggBulirschStoerIntegrator,@param maximalOrder maximal order in the extra...,/** Set the order control parameters.  * <p...,public void setOrderControl(final int maximalO...,/**  * This class implements a Gragg-Bulirsch...,/*  * Licensed to the Apache Software Foundati...,,[],,Class,
2,False,2900,PRE,commons-math3-3.6.1,org.apache.commons.math3.ode.nonstiff,GraggBulirschStoerIntegrator,@param maximalOrder maximal order in the extra...,/** Set the order control parameters.  * <p...,public void setOrderControl(final int maximalO...,/**  * This class implements a Gragg-Bulirsch...,/*  * Licensed to the Apache Software Foundati...,,[],,This,
3,True,2900,PRE,commons-math3-3.6.1,org.apache.commons.math3.ode.nonstiff,GraggBulirschStoerIntegrator,@param maximalOrder maximal order in the extra...,/** Set the order control parameters.  * <p...,public void setOrderControl(final int maximalO...,/**  * This class implements a Gragg-Bulirsch...,/*  * Licensed to the Apache Software Foundati...,,[],,Semicolon,
4,False,2900,PRE,commons-math3-3.6.1,org.apache.commons.math3.ode.nonstiff,GraggBulirschStoerIntegrator,@param maximalOrder maximal order in the extra...,/** Set the order control parameters.  * <p...,public void setOrderControl(final int maximalO...,/**  * This class implements a Gragg-Bulirsch...,/*  * Licensed to the Apache Software Foundati...,,[],,OpeningParenthesis,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7,True,8401,PRE,commons-math3-3.6.1,org.apache.commons.math3.stat,StatUtils,@param length the number of elements to include,/**  * Returns the maximum of the entries...,public static double max(final double[] values...,/**  * StatUtils provides static methods for c...,/*  * Licensed to the Apache Software Foundati...,,[],,Semicolon,
8,False,8401,PRE,commons-math3-3.6.1,org.apache.commons.math3.stat,StatUtils,@param length the number of elements to include,/**  * Returns the maximum of the entries...,public static double max(final double[] values...,/**  * StatUtils provides static methods for c...,/*  * Licensed to the Apache Software Foundati...,,[],,OpeningParenthesis,
9,False,8401,PRE,commons-math3-3.6.1,org.apache.commons.math3.stat,StatUtils,@param length the number of elements to include,/**  * Returns the maximum of the entries...,public static double max(final double[] values...,/**  * StatUtils provides static methods for c...,/*  * Licensed to the Apache Software Foundati...,,[],,ArraysClass,
10,False,8402,NORMAL_POST,commons-math3-3.6.1,org.apache.commons.math3.stat,StatUtils,the maximum of the values or Double.NaN if le...,/**  * Returns the maximum of the entries...,public static double max(final double[] values...,/**  * StatUtils provides static methods for c...,/*  * Licensed to the Apache Software Foundati...,,[],,MethodArgument,


In [34]:
pd.set_option('display.max_colwidth', None)

In [35]:
result = df_dataset[df_dataset['oracleId']==19341]


In [36]:
result

Unnamed: 0,label,oracleId,oracleType,projectName,packageName,className,javadocTag,methodJavadoc,methodSourceCode,classJavadoc,classSourceCode,oracleSoFar,tokenClassesSoFar,token,tokenClass,tokenInfo
106,False,19341,NORMAL_POST,gs-core-1.3,org.graphstream.ui.graphicGraph.stylesheet,Rule,The group set or null.,"/**  * The group this rule participate in, maybe null if the rule does not  * participate in any group.  * * @return The group set or null.  */",public HashSet<String> getGroups(){  return groups; },"/**  * Style application rule.  * * <p>  * A rule is made of a selector and values. The selector identifies the  * element(s) this rule applies to, and the values are styles to apply to the  * matched elements.  * </p>  */","/*  * Copyright 2006 - 2015  * Stefan Balev <stefan.balev@graphstream-project.org>  * Julien Baudry <julien.baudry@graphstream-project.org>  * Antoine Dutot <antoine.dutot@graphstream-project.org>  * Yoann Pigné <yoann.pigne@graphstream-project.org>  * Guilhelm Savin <guilhelm.savin@graphstream-project.org>  * * This file is part of GraphStream <http://graphstream-project.org>.  * * GraphStream is a library whose purpose is to handle static or dynamic  * graph, create them from scratch, file or any source and display them.  * * This program is free software distributed under the terms of two licenses, the  * CeCILL-C license that fits European law, and the GNU Lesser General Public  * License. You can use, modify and/ or redistribute the software under the terms  * of the CeCILL-C license as circulated by CEA, CNRS and INRIA at the following  * URL <http://www.cecill.info> or under the terms of the GNU LGPL as published by  * the Free Software Foundation, either version 3 of the License, or (at your  * option) any later version.  * * This program is distributed in the hope that it will be useful, but WITHOUT ANY  * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A  * PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.  * * You should have received a copy of the GNU Lesser General Public License  * along with this program. If not, see <http://www.gnu.org/licenses/>.  * * The fact that you are presently reading this means that you have had  * knowledge of the CeCILL-C and LGPL licenses and that you accept their terms.  */ package org.graphstream.ui.graphicGraph.stylesheet; import java.util.HashSet; /**  * Style application rule.  *  * <p>  * A rule is made of a selector and values. The selector identifies the  * element(s) this rule applies to, and the values are styles to apply to the  * matched elements.  * </p>  */ public class Rule {  // Attributes  /**  * The match.  */  public Selector selector;  /**  * The style.  */  public Style style;  /**  * Optionally, the rule can store all the style groups it participates in.  */  public HashSet<String> groups;  // Constructors  protected Rule() {  }  /**  * New rule with a matcher.  *  * @param selector  * The rule selector.  */  public Rule(Selector selector) {  this.selector = selector;  }  public Rule(Selector selector, Rule parent) {  this.selector = selector;  this.style = new Style(parent);  }  /**  * This rule style.  *  * @return The rule style.  */  public Style getStyle() {  return style;  }  /**  * The group this rule participate in, maybe null if the rule does not  * participate in any group.  *  * @return The group set or null.  */  public HashSet<String> getGroups() {  return groups;  }  /**  * True if this rule selector match the given identifier.  *  * @param identifier  * The identifier to test for the match.  * @return True if matching.  */  public boolean matchId(String identifier) {  String ident = selector.getId();  if (ident != null)  return ident.equals(identifier);  return false;  }  /**  * Change the style.  *  * @param style  * A style specification.  */  public void setStyle(Style style) {  this.style = style;  }  /**  * Specify that this rule participates in the given style group.  *  * @param groupId  * The group unique identifier.  */  public void addGroup(String groupId) {  if (groups == null)  groups = new HashSet<String>();  groups.add(groupId);  }  /**  * Remove this rule from the style group.  *  * @param groupId  * The group unique identifier.  */  public void removeGroup(String groupId) {  if (groups != null)  groups.remove(groupId);  }  @Override  public String toString() {  return toString(-1);  }  public String toString(int level) {  StringBuilder builder = new StringBuilder();  String prefix = """";  if (level > 0) {  for (int i = 0; i < level; i++) prefix += "" "";  }  builder.append(prefix);  builder.append(selector.toString());  builder.append(style.toString(level + 1));  return builder.toString();  } }",,[],,Class,
107,False,19341,NORMAL_POST,gs-core-1.3,org.graphstream.ui.graphicGraph.stylesheet,Rule,The group set or null.,"/**  * The group this rule participate in, maybe null if the rule does not  * participate in any group.  * * @return The group set or null.  */",public HashSet<String> getGroups(){  return groups; },"/**  * Style application rule.  * * <p>  * A rule is made of a selector and values. The selector identifies the  * element(s) this rule applies to, and the values are styles to apply to the  * matched elements.  * </p>  */","/*  * Copyright 2006 - 2015  * Stefan Balev <stefan.balev@graphstream-project.org>  * Julien Baudry <julien.baudry@graphstream-project.org>  * Antoine Dutot <antoine.dutot@graphstream-project.org>  * Yoann Pigné <yoann.pigne@graphstream-project.org>  * Guilhelm Savin <guilhelm.savin@graphstream-project.org>  * * This file is part of GraphStream <http://graphstream-project.org>.  * * GraphStream is a library whose purpose is to handle static or dynamic  * graph, create them from scratch, file or any source and display them.  * * This program is free software distributed under the terms of two licenses, the  * CeCILL-C license that fits European law, and the GNU Lesser General Public  * License. You can use, modify and/ or redistribute the software under the terms  * of the CeCILL-C license as circulated by CEA, CNRS and INRIA at the following  * URL <http://www.cecill.info> or under the terms of the GNU LGPL as published by  * the Free Software Foundation, either version 3 of the License, or (at your  * option) any later version.  * * This program is distributed in the hope that it will be useful, but WITHOUT ANY  * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A  * PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.  * * You should have received a copy of the GNU Lesser General Public License  * along with this program. If not, see <http://www.gnu.org/licenses/>.  * * The fact that you are presently reading this means that you have had  * knowledge of the CeCILL-C and LGPL licenses and that you accept their terms.  */ package org.graphstream.ui.graphicGraph.stylesheet; import java.util.HashSet; /**  * Style application rule.  *  * <p>  * A rule is made of a selector and values. The selector identifies the  * element(s) this rule applies to, and the values are styles to apply to the  * matched elements.  * </p>  */ public class Rule {  // Attributes  /**  * The match.  */  public Selector selector;  /**  * The style.  */  public Style style;  /**  * Optionally, the rule can store all the style groups it participates in.  */  public HashSet<String> groups;  // Constructors  protected Rule() {  }  /**  * New rule with a matcher.  *  * @param selector  * The rule selector.  */  public Rule(Selector selector) {  this.selector = selector;  }  public Rule(Selector selector, Rule parent) {  this.selector = selector;  this.style = new Style(parent);  }  /**  * This rule style.  *  * @return The rule style.  */  public Style getStyle() {  return style;  }  /**  * The group this rule participate in, maybe null if the rule does not  * participate in any group.  *  * @return The group set or null.  */  public HashSet<String> getGroups() {  return groups;  }  /**  * True if this rule selector match the given identifier.  *  * @param identifier  * The identifier to test for the match.  * @return True if matching.  */  public boolean matchId(String identifier) {  String ident = selector.getId();  if (ident != null)  return ident.equals(identifier);  return false;  }  /**  * Change the style.  *  * @param style  * A style specification.  */  public void setStyle(Style style) {  this.style = style;  }  /**  * Specify that this rule participates in the given style group.  *  * @param groupId  * The group unique identifier.  */  public void addGroup(String groupId) {  if (groups == null)  groups = new HashSet<String>();  groups.add(groupId);  }  /**  * Remove this rule from the style group.  *  * @param groupId  * The group unique identifier.  */  public void removeGroup(String groupId) {  if (groups != null)  groups.remove(groupId);  }  @Override  public String toString() {  return toString(-1);  }  public String toString(int level) {  StringBuilder builder = new StringBuilder();  String prefix = """";  if (level > 0) {  for (int i = 0; i < level; i++) prefix += "" "";  }  builder.append(prefix);  builder.append(selector.toString());  builder.append(style.toString(level + 1));  return builder.toString();  } }",,[],,This,
108,True,19341,NORMAL_POST,gs-core-1.3,org.graphstream.ui.graphicGraph.stylesheet,Rule,The group set or null.,"/**  * The group this rule participate in, maybe null if the rule does not  * participate in any group.  * * @return The group set or null.  */",public HashSet<String> getGroups(){  return groups; },"/**  * Style application rule.  * * <p>  * A rule is made of a selector and values. The selector identifies the  * element(s) this rule applies to, and the values are styles to apply to the  * matched elements.  * </p>  */","/*  * Copyright 2006 - 2015  * Stefan Balev <stefan.balev@graphstream-project.org>  * Julien Baudry <julien.baudry@graphstream-project.org>  * Antoine Dutot <antoine.dutot@graphstream-project.org>  * Yoann Pigné <yoann.pigne@graphstream-project.org>  * Guilhelm Savin <guilhelm.savin@graphstream-project.org>  * * This file is part of GraphStream <http://graphstream-project.org>.  * * GraphStream is a library whose purpose is to handle static or dynamic  * graph, create them from scratch, file or any source and display them.  * * This program is free software distributed under the terms of two licenses, the  * CeCILL-C license that fits European law, and the GNU Lesser General Public  * License. You can use, modify and/ or redistribute the software under the terms  * of the CeCILL-C license as circulated by CEA, CNRS and INRIA at the following  * URL <http://www.cecill.info> or under the terms of the GNU LGPL as published by  * the Free Software Foundation, either version 3 of the License, or (at your  * option) any later version.  * * This program is distributed in the hope that it will be useful, but WITHOUT ANY  * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A  * PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.  * * You should have received a copy of the GNU Lesser General Public License  * along with this program. If not, see <http://www.gnu.org/licenses/>.  * * The fact that you are presently reading this means that you have had  * knowledge of the CeCILL-C and LGPL licenses and that you accept their terms.  */ package org.graphstream.ui.graphicGraph.stylesheet; import java.util.HashSet; /**  * Style application rule.  *  * <p>  * A rule is made of a selector and values. The selector identifies the  * element(s) this rule applies to, and the values are styles to apply to the  * matched elements.  * </p>  */ public class Rule {  // Attributes  /**  * The match.  */  public Selector selector;  /**  * The style.  */  public Style style;  /**  * Optionally, the rule can store all the style groups it participates in.  */  public HashSet<String> groups;  // Constructors  protected Rule() {  }  /**  * New rule with a matcher.  *  * @param selector  * The rule selector.  */  public Rule(Selector selector) {  this.selector = selector;  }  public Rule(Selector selector, Rule parent) {  this.selector = selector;  this.style = new Style(parent);  }  /**  * This rule style.  *  * @return The rule style.  */  public Style getStyle() {  return style;  }  /**  * The group this rule participate in, maybe null if the rule does not  * participate in any group.  *  * @return The group set or null.  */  public HashSet<String> getGroups() {  return groups;  }  /**  * True if this rule selector match the given identifier.  *  * @param identifier  * The identifier to test for the match.  * @return True if matching.  */  public boolean matchId(String identifier) {  String ident = selector.getId();  if (ident != null)  return ident.equals(identifier);  return false;  }  /**  * Change the style.  *  * @param style  * A style specification.  */  public void setStyle(Style style) {  this.style = style;  }  /**  * Specify that this rule participates in the given style group.  *  * @param groupId  * The group unique identifier.  */  public void addGroup(String groupId) {  if (groups == null)  groups = new HashSet<String>();  groups.add(groupId);  }  /**  * Remove this rule from the style group.  *  * @param groupId  * The group unique identifier.  */  public void removeGroup(String groupId) {  if (groups != null)  groups.remove(groupId);  }  @Override  public String toString() {  return toString(-1);  }  public String toString(int level) {  StringBuilder builder = new StringBuilder();  String prefix = """";  if (level > 0) {  for (int i = 0; i < level; i++) prefix += "" "";  }  builder.append(prefix);  builder.append(selector.toString());  builder.append(style.toString(level + 1));  return builder.toString();  } }",,[],,TRUE,
109,False,19341,NORMAL_POST,gs-core-1.3,org.graphstream.ui.graphicGraph.stylesheet,Rule,The group set or null.,"/**  * The group this rule participate in, maybe null if the rule does not  * participate in any group.  * * @return The group set or null.  */",public HashSet<String> getGroups(){  return groups; },"/**  * Style application rule.  * * <p>  * A rule is made of a selector and values. The selector identifies the  * element(s) this rule applies to, and the values are styles to apply to the  * matched elements.  * </p>  */","/*  * Copyright 2006 - 2015  * Stefan Balev <stefan.balev@graphstream-project.org>  * Julien Baudry <julien.baudry@graphstream-project.org>  * Antoine Dutot <antoine.dutot@graphstream-project.org>  * Yoann Pigné <yoann.pigne@graphstream-project.org>  * Guilhelm Savin <guilhelm.savin@graphstream-project.org>  * * This file is part of GraphStream <http://graphstream-project.org>.  * * GraphStream is a library whose purpose is to handle static or dynamic  * graph, create them from scratch, file or any source and display them.  * * This program is free software distributed under the terms of two licenses, the  * CeCILL-C license that fits European law, and the GNU Lesser General Public  * License. You can use, modify and/ or redistribute the software under the terms  * of the CeCILL-C license as circulated by CEA, CNRS and INRIA at the following  * URL <http://www.cecill.info> or under the terms of the GNU LGPL as published by  * the Free Software Foundation, either version 3 of the License, or (at your  * option) any later version.  * * This program is distributed in the hope that it will be useful, but WITHOUT ANY  * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A  * PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.  * * You should have received a copy of the GNU Lesser General Public License  * along with this program. If not, see <http://www.gnu.org/licenses/>.  * * The fact that you are presently reading this means that you have had  * knowledge of the CeCILL-C and LGPL licenses and that you accept their terms.  */ package org.graphstream.ui.graphicGraph.stylesheet; import java.util.HashSet; /**  * Style application rule.  *  * <p>  * A rule is made of a selector and values. The selector identifies the  * element(s) this rule applies to, and the values are styles to apply to the  * matched elements.  * </p>  */ public class Rule {  // Attributes  /**  * The match.  */  public Selector selector;  /**  * The style.  */  public Style style;  /**  * Optionally, the rule can store all the style groups it participates in.  */  public HashSet<String> groups;  // Constructors  protected Rule() {  }  /**  * New rule with a matcher.  *  * @param selector  * The rule selector.  */  public Rule(Selector selector) {  this.selector = selector;  }  public Rule(Selector selector, Rule parent) {  this.selector = selector;  this.style = new Style(parent);  }  /**  * This rule style.  *  * @return The rule style.  */  public Style getStyle() {  return style;  }  /**  * The group this rule participate in, maybe null if the rule does not  * participate in any group.  *  * @return The group set or null.  */  public HashSet<String> getGroups() {  return groups;  }  /**  * True if this rule selector match the given identifier.  *  * @param identifier  * The identifier to test for the match.  * @return True if matching.  */  public boolean matchId(String identifier) {  String ident = selector.getId();  if (ident != null)  return ident.equals(identifier);  return false;  }  /**  * Change the style.  *  * @param style  * A style specification.  */  public void setStyle(Style style) {  this.style = style;  }  /**  * Specify that this rule participates in the given style group.  *  * @param groupId  * The group unique identifier.  */  public void addGroup(String groupId) {  if (groups == null)  groups = new HashSet<String>();  groups.add(groupId);  }  /**  * Remove this rule from the style group.  *  * @param groupId  * The group unique identifier.  */  public void removeGroup(String groupId) {  if (groups != null)  groups.remove(groupId);  }  @Override  public String toString() {  return toString(-1);  }  public String toString(int level) {  StringBuilder builder = new StringBuilder();  String prefix = """";  if (level > 0) {  for (int i = 0; i < level; i++) prefix += "" "";  }  builder.append(prefix);  builder.append(selector.toString());  builder.append(style.toString(level + 1));  return builder.toString();  } }",,[],,Semicolon,
110,False,19341,NORMAL_POST,gs-core-1.3,org.graphstream.ui.graphicGraph.stylesheet,Rule,The group set or null.,"/**  * The group this rule participate in, maybe null if the rule does not  * participate in any group.  * * @return The group set or null.  */",public HashSet<String> getGroups(){  return groups; },"/**  * Style application rule.  * * <p>  * A rule is made of a selector and values. The selector identifies the  * element(s) this rule applies to, and the values are styles to apply to the  * matched elements.  * </p>  */","/*  * Copyright 2006 - 2015  * Stefan Balev <stefan.balev@graphstream-project.org>  * Julien Baudry <julien.baudry@graphstream-project.org>  * Antoine Dutot <antoine.dutot@graphstream-project.org>  * Yoann Pigné <yoann.pigne@graphstream-project.org>  * Guilhelm Savin <guilhelm.savin@graphstream-project.org>  * * This file is part of GraphStream <http://graphstream-project.org>.  * * GraphStream is a library whose purpose is to handle static or dynamic  * graph, create them from scratch, file or any source and display them.  * * This program is free software distributed under the terms of two licenses, the  * CeCILL-C license that fits European law, and the GNU Lesser General Public  * License. You can use, modify and/ or redistribute the software under the terms  * of the CeCILL-C license as circulated by CEA, CNRS and INRIA at the following  * URL <http://www.cecill.info> or under the terms of the GNU LGPL as published by  * the Free Software Foundation, either version 3 of the License, or (at your  * option) any later version.  * * This program is distributed in the hope that it will be useful, but WITHOUT ANY  * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A  * PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.  * * You should have received a copy of the GNU Lesser General Public License  * along with this program. If not, see <http://www.gnu.org/licenses/>.  * * The fact that you are presently reading this means that you have had  * knowledge of the CeCILL-C and LGPL licenses and that you accept their terms.  */ package org.graphstream.ui.graphicGraph.stylesheet; import java.util.HashSet; /**  * Style application rule.  *  * <p>  * A rule is made of a selector and values. The selector identifies the  * element(s) this rule applies to, and the values are styles to apply to the  * matched elements.  * </p>  */ public class Rule {  // Attributes  /**  * The match.  */  public Selector selector;  /**  * The style.  */  public Style style;  /**  * Optionally, the rule can store all the style groups it participates in.  */  public HashSet<String> groups;  // Constructors  protected Rule() {  }  /**  * New rule with a matcher.  *  * @param selector  * The rule selector.  */  public Rule(Selector selector) {  this.selector = selector;  }  public Rule(Selector selector, Rule parent) {  this.selector = selector;  this.style = new Style(parent);  }  /**  * This rule style.  *  * @return The rule style.  */  public Style getStyle() {  return style;  }  /**  * The group this rule participate in, maybe null if the rule does not  * participate in any group.  *  * @return The group set or null.  */  public HashSet<String> getGroups() {  return groups;  }  /**  * True if this rule selector match the given identifier.  *  * @param identifier  * The identifier to test for the match.  * @return True if matching.  */  public boolean matchId(String identifier) {  String ident = selector.getId();  if (ident != null)  return ident.equals(identifier);  return false;  }  /**  * Change the style.  *  * @param style  * A style specification.  */  public void setStyle(Style style) {  this.style = style;  }  /**  * Specify that this rule participates in the given style group.  *  * @param groupId  * The group unique identifier.  */  public void addGroup(String groupId) {  if (groups == null)  groups = new HashSet<String>();  groups.add(groupId);  }  /**  * Remove this rule from the style group.  *  * @param groupId  * The group unique identifier.  */  public void removeGroup(String groupId) {  if (groups != null)  groups.remove(groupId);  }  @Override  public String toString() {  return toString(-1);  }  public String toString(int level) {  StringBuilder builder = new StringBuilder();  String prefix = """";  if (level > 0) {  for (int i = 0; i < level; i++) prefix += "" "";  }  builder.append(prefix);  builder.append(selector.toString());  builder.append(style.toString(level + 1));  return builder.toString();  } }",,[],,OpeningParenthesis,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
183,False,19341,NORMAL_POST,gs-core-1.3,org.graphstream.ui.graphicGraph.stylesheet,Rule,The group set or null.,"/**  * The group this rule participate in, maybe null if the rule does not  * participate in any group.  * * @return The group set or null.  */",public HashSet<String> getGroups(){  return groups; },"/**  * Style application rule.  * * <p>  * A rule is made of a selector and values. The selector identifies the  * element(s) this rule applies to, and the values are styles to apply to the  * matched elements.  * </p>  */","/*  * Copyright 2006 - 2015  * Stefan Balev <stefan.balev@graphstream-project.org>  * Julien Baudry <julien.baudry@graphstream-project.org>  * Antoine Dutot <antoine.dutot@graphstream-project.org>  * Yoann Pigné <yoann.pigne@graphstream-project.org>  * Guilhelm Savin <guilhelm.savin@graphstream-project.org>  * * This file is part of GraphStream <http://graphstream-project.org>.  * * GraphStream is a library whose purpose is to handle static or dynamic  * graph, create them from scratch, file or any source and display them.  * * This program is free software distributed under the terms of two licenses, the  * CeCILL-C license that fits European law, and the GNU Lesser General Public  * License. You can use, modify and/ or redistribute the software under the terms  * of the CeCILL-C license as circulated by CEA, CNRS and INRIA at the following  * URL <http://www.cecill.info> or under the terms of the GNU LGPL as published by  * the Free Software Foundation, either version 3 of the License, or (at your  * option) any later version.  * * This program is distributed in the hope that it will be useful, but WITHOUT ANY  * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A  * PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.  * * You should have received a copy of the GNU Lesser General Public License  * along with this program. If not, see <http://www.gnu.org/licenses/>.  * * The fact that you are presently reading this means that you have had  * knowledge of the CeCILL-C and LGPL licenses and that you accept their terms.  */ package org.graphstream.ui.graphicGraph.stylesheet; import java.util.HashSet; /**  * Style application rule.  *  * <p>  * A rule is made of a selector and values. The selector identifies the  * element(s) this rule applies to, and the values are styles to apply to the  * matched elements.  * </p>  */ public class Rule {  // Attributes  /**  * The match.  */  public Selector selector;  /**  * The style.  */  public Style style;  /**  * Optionally, the rule can store all the style groups it participates in.  */  public HashSet<String> groups;  // Constructors  protected Rule() {  }  /**  * New rule with a matcher.  *  * @param selector  * The rule selector.  */  public Rule(Selector selector) {  this.selector = selector;  }  public Rule(Selector selector, Rule parent) {  this.selector = selector;  this.style = new Style(parent);  }  /**  * This rule style.  *  * @return The rule style.  */  public Style getStyle() {  return style;  }  /**  * The group this rule participate in, maybe null if the rule does not  * participate in any group.  *  * @return The group set or null.  */  public HashSet<String> getGroups() {  return groups;  }  /**  * True if this rule selector match the given identifier.  *  * @param identifier  * The identifier to test for the match.  * @return True if matching.  */  public boolean matchId(String identifier) {  String ident = selector.getId();  if (ident != null)  return ident.equals(identifier);  return false;  }  /**  * Change the style.  *  * @param style  * A style specification.  */  public void setStyle(Style style) {  this.style = style;  }  /**  * Specify that this rule participates in the given style group.  *  * @param groupId  * The group unique identifier.  */  public void addGroup(String groupId) {  if (groups == null)  groups = new HashSet<String>();  groups.add(groupId);  }  /**  * Remove this rule from the style group.  *  * @param groupId  * The group unique identifier.  */  public void removeGroup(String groupId) {  if (groups != null)  groups.remove(groupId);  }  @Override  public String toString() {  return toString(-1);  }  public String toString(int level) {  StringBuilder builder = new StringBuilder();  String prefix = """";  if (level > 0) {  for (int i = 0; i < level; i++) prefix += "" "";  }  builder.append(prefix);  builder.append(selector.toString());  builder.append(style.toString(level + 1));  return builder.toString();  } }",true?((methodResultID!=null&&methodResultID.equals(this.groups))||methodResultID==null):,"[TRUE, QuestionMark, OpeningParenthesis, OpeningParenthesis, MethodResultID, IneqOperator, NULL, LogicalOperator, MethodResultID, Period, MethodName, OpeningParenthesis, This, Period, ClassField, ClosingParenthesis, ClosingParenthesis, LogicalOperator, MethodResultID, EqOperator, NULL, ClosingParenthesis, Colon]",,This,
184,False,19341,NORMAL_POST,gs-core-1.3,org.graphstream.ui.graphicGraph.stylesheet,Rule,The group set or null.,"/**  * The group this rule participate in, maybe null if the rule does not  * participate in any group.  * * @return The group set or null.  */",public HashSet<String> getGroups(){  return groups; },"/**  * Style application rule.  * * <p>  * A rule is made of a selector and values. The selector identifies the  * element(s) this rule applies to, and the values are styles to apply to the  * matched elements.  * </p>  */","/*  * Copyright 2006 - 2015  * Stefan Balev <stefan.balev@graphstream-project.org>  * Julien Baudry <julien.baudry@graphstream-project.org>  * Antoine Dutot <antoine.dutot@graphstream-project.org>  * Yoann Pigné <yoann.pigne@graphstream-project.org>  * Guilhelm Savin <guilhelm.savin@graphstream-project.org>  * * This file is part of GraphStream <http://graphstream-project.org>.  * * GraphStream is a library whose purpose is to handle static or dynamic  * graph, create them from scratch, file or any source and display them.  * * This program is free software distributed under the terms of two licenses, the  * CeCILL-C license that fits European law, and the GNU Lesser General Public  * License. You can use, modify and/ or redistribute the software under the terms  * of the CeCILL-C license as circulated by CEA, CNRS and INRIA at the following  * URL <http://www.cecill.info> or under the terms of the GNU LGPL as published by  * the Free Software Foundation, either version 3 of the License, or (at your  * option) any later version.  * * This program is distributed in the hope that it will be useful, but WITHOUT ANY  * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A  * PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.  * * You should have received a copy of the GNU Lesser General Public License  * along with this program. If not, see <http://www.gnu.org/licenses/>.  * * The fact that you are presently reading this means that you have had  * knowledge of the CeCILL-C and LGPL licenses and that you accept their terms.  */ package org.graphstream.ui.graphicGraph.stylesheet; import java.util.HashSet; /**  * Style application rule.  *  * <p>  * A rule is made of a selector and values. The selector identifies the  * element(s) this rule applies to, and the values are styles to apply to the  * matched elements.  * </p>  */ public class Rule {  // Attributes  /**  * The match.  */  public Selector selector;  /**  * The style.  */  public Style style;  /**  * Optionally, the rule can store all the style groups it participates in.  */  public HashSet<String> groups;  // Constructors  protected Rule() {  }  /**  * New rule with a matcher.  *  * @param selector  * The rule selector.  */  public Rule(Selector selector) {  this.selector = selector;  }  public Rule(Selector selector, Rule parent) {  this.selector = selector;  this.style = new Style(parent);  }  /**  * This rule style.  *  * @return The rule style.  */  public Style getStyle() {  return style;  }  /**  * The group this rule participate in, maybe null if the rule does not  * participate in any group.  *  * @return The group set or null.  */  public HashSet<String> getGroups() {  return groups;  }  /**  * True if this rule selector match the given identifier.  *  * @param identifier  * The identifier to test for the match.  * @return True if matching.  */  public boolean matchId(String identifier) {  String ident = selector.getId();  if (ident != null)  return ident.equals(identifier);  return false;  }  /**  * Change the style.  *  * @param style  * A style specification.  */  public void setStyle(Style style) {  this.style = style;  }  /**  * Specify that this rule participates in the given style group.  *  * @param groupId  * The group unique identifier.  */  public void addGroup(String groupId) {  if (groups == null)  groups = new HashSet<String>();  groups.add(groupId);  }  /**  * Remove this rule from the style group.  *  * @param groupId  * The group unique identifier.  */  public void removeGroup(String groupId) {  if (groups != null)  groups.remove(groupId);  }  @Override  public String toString() {  return toString(-1);  }  public String toString(int level) {  StringBuilder builder = new StringBuilder();  String prefix = """";  if (level > 0) {  for (int i = 0; i < level; i++) prefix += "" "";  }  builder.append(prefix);  builder.append(selector.toString());  builder.append(style.toString(level + 1));  return builder.toString();  } }",true?((methodResultID!=null&&methodResultID.equals(this.groups))||methodResultID==null):,"[TRUE, QuestionMark, OpeningParenthesis, OpeningParenthesis, MethodResultID, IneqOperator, NULL, LogicalOperator, MethodResultID, Period, MethodName, OpeningParenthesis, This, Period, ClassField, ClosingParenthesis, ClosingParenthesis, LogicalOperator, MethodResultID, EqOperator, NULL, ClosingParenthesis, Colon]",,MethodResultID,
185,True,19341,NORMAL_POST,gs-core-1.3,org.graphstream.ui.graphicGraph.stylesheet,Rule,The group set or null.,"/**  * The group this rule participate in, maybe null if the rule does not  * participate in any group.  * * @return The group set or null.  */",public HashSet<String> getGroups(){  return groups; },"/**  * Style application rule.  * * <p>  * A rule is made of a selector and values. The selector identifies the  * element(s) this rule applies to, and the values are styles to apply to the  * matched elements.  * </p>  */","/*  * Copyright 2006 - 2015  * Stefan Balev <stefan.balev@graphstream-project.org>  * Julien Baudry <julien.baudry@graphstream-project.org>  * Antoine Dutot <antoine.dutot@graphstream-project.org>  * Yoann Pigné <yoann.pigne@graphstream-project.org>  * Guilhelm Savin <guilhelm.savin@graphstream-project.org>  * * This file is part of GraphStream <http://graphstream-project.org>.  * * GraphStream is a library whose purpose is to handle static or dynamic  * graph, create them from scratch, file or any source and display them.  * * This program is free software distributed under the terms of two licenses, the  * CeCILL-C license that fits European law, and the GNU Lesser General Public  * License. You can use, modify and/ or redistribute the software under the terms  * of the CeCILL-C license as circulated by CEA, CNRS and INRIA at the following  * URL <http://www.cecill.info> or under the terms of the GNU LGPL as published by  * the Free Software Foundation, either version 3 of the License, or (at your  * option) any later version.  * * This program is distributed in the hope that it will be useful, but WITHOUT ANY  * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A  * PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.  * * You should have received a copy of the GNU Lesser General Public License  * along with this program. If not, see <http://www.gnu.org/licenses/>.  * * The fact that you are presently reading this means that you have had  * knowledge of the CeCILL-C and LGPL licenses and that you accept their terms.  */ package org.graphstream.ui.graphicGraph.stylesheet; import java.util.HashSet; /**  * Style application rule.  *  * <p>  * A rule is made of a selector and values. The selector identifies the  * element(s) this rule applies to, and the values are styles to apply to the  * matched elements.  * </p>  */ public class Rule {  // Attributes  /**  * The match.  */  public Selector selector;  /**  * The style.  */  public Style style;  /**  * Optionally, the rule can store all the style groups it participates in.  */  public HashSet<String> groups;  // Constructors  protected Rule() {  }  /**  * New rule with a matcher.  *  * @param selector  * The rule selector.  */  public Rule(Selector selector) {  this.selector = selector;  }  public Rule(Selector selector, Rule parent) {  this.selector = selector;  this.style = new Style(parent);  }  /**  * This rule style.  *  * @return The rule style.  */  public Style getStyle() {  return style;  }  /**  * The group this rule participate in, maybe null if the rule does not  * participate in any group.  *  * @return The group set or null.  */  public HashSet<String> getGroups() {  return groups;  }  /**  * True if this rule selector match the given identifier.  *  * @param identifier  * The identifier to test for the match.  * @return True if matching.  */  public boolean matchId(String identifier) {  String ident = selector.getId();  if (ident != null)  return ident.equals(identifier);  return false;  }  /**  * Change the style.  *  * @param style  * A style specification.  */  public void setStyle(Style style) {  this.style = style;  }  /**  * Specify that this rule participates in the given style group.  *  * @param groupId  * The group unique identifier.  */  public void addGroup(String groupId) {  if (groups == null)  groups = new HashSet<String>();  groups.add(groupId);  }  /**  * Remove this rule from the style group.  *  * @param groupId  * The group unique identifier.  */  public void removeGroup(String groupId) {  if (groups != null)  groups.remove(groupId);  }  @Override  public String toString() {  return toString(-1);  }  public String toString(int level) {  StringBuilder builder = new StringBuilder();  String prefix = """";  if (level > 0) {  for (int i = 0; i < level; i++) prefix += "" "";  }  builder.append(prefix);  builder.append(selector.toString());  builder.append(style.toString(level + 1));  return builder.toString();  } }",true?((methodResultID!=null&&methodResultID.equals(this.groups))||methodResultID==null):,"[TRUE, QuestionMark, OpeningParenthesis, OpeningParenthesis, MethodResultID, IneqOperator, NULL, LogicalOperator, MethodResultID, Period, MethodName, OpeningParenthesis, This, Period, ClassField, ClosingParenthesis, ClosingParenthesis, LogicalOperator, MethodResultID, EqOperator, NULL, ClosingParenthesis, Colon]",,TRUE,
186,False,19341,NORMAL_POST,gs-core-1.3,org.graphstream.ui.graphicGraph.stylesheet,Rule,The group set or null.,"/**  * The group this rule participate in, maybe null if the rule does not  * participate in any group.  * * @return The group set or null.  */",public HashSet<String> getGroups(){  return groups; },"/**  * Style application rule.  * * <p>  * A rule is made of a selector and values. The selector identifies the  * element(s) this rule applies to, and the values are styles to apply to the  * matched elements.  * </p>  */","/*  * Copyright 2006 - 2015  * Stefan Balev <stefan.balev@graphstream-project.org>  * Julien Baudry <julien.baudry@graphstream-project.org>  * Antoine Dutot <antoine.dutot@graphstream-project.org>  * Yoann Pigné <yoann.pigne@graphstream-project.org>  * Guilhelm Savin <guilhelm.savin@graphstream-project.org>  * * This file is part of GraphStream <http://graphstream-project.org>.  * * GraphStream is a library whose purpose is to handle static or dynamic  * graph, create them from scratch, file or any source and display them.  * * This program is free software distributed under the terms of two licenses, the  * CeCILL-C license that fits European law, and the GNU Lesser General Public  * License. You can use, modify and/ or redistribute the software under the terms  * of the CeCILL-C license as circulated by CEA, CNRS and INRIA at the following  * URL <http://www.cecill.info> or under the terms of the GNU LGPL as published by  * the Free Software Foundation, either version 3 of the License, or (at your  * option) any later version.  * * This program is distributed in the hope that it will be useful, but WITHOUT ANY  * WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A  * PARTICULAR PURPOSE. See the GNU Lesser General Public License for more details.  * * You should have received a copy of the GNU Lesser General Public License  * along with this program. If not, see <http://www.gnu.org/licenses/>.  * * The fact that you are presently reading this means that you have had  * knowledge of the CeCILL-C and LGPL licenses and that you accept their terms.  */ package org.graphstream.ui.graphicGraph.stylesheet; import java.util.HashSet; /**  * Style application rule.  *  * <p>  * A rule is made of a selector and values. The selector identifies the  * element(s) this rule applies to, and the values are styles to apply to the  * matched elements.  * </p>  */ public class Rule {  // Attributes  /**  * The match.  */  public Selector selector;  /**  * The style.  */  public Style style;  /**  * Optionally, the rule can store all the style groups it participates in.  */  public HashSet<String> groups;  // Constructors  protected Rule() {  }  /**  * New rule with a matcher.  *  * @param selector  * The rule selector.  */  public Rule(Selector selector) {  this.selector = selector;  }  public Rule(Selector selector, Rule parent) {  this.selector = selector;  this.style = new Style(parent);  }  /**  * This rule style.  *  * @return The rule style.  */  public Style getStyle() {  return style;  }  /**  * The group this rule participate in, maybe null if the rule does not  * participate in any group.  *  * @return The group set or null.  */  public HashSet<String> getGroups() {  return groups;  }  /**  * True if this rule selector match the given identifier.  *  * @param identifier  * The identifier to test for the match.  * @return True if matching.  */  public boolean matchId(String identifier) {  String ident = selector.getId();  if (ident != null)  return ident.equals(identifier);  return false;  }  /**  * Change the style.  *  * @param style  * A style specification.  */  public void setStyle(Style style) {  this.style = style;  }  /**  * Specify that this rule participates in the given style group.  *  * @param groupId  * The group unique identifier.  */  public void addGroup(String groupId) {  if (groups == null)  groups = new HashSet<String>();  groups.add(groupId);  }  /**  * Remove this rule from the style group.  *  * @param groupId  * The group unique identifier.  */  public void removeGroup(String groupId) {  if (groups != null)  groups.remove(groupId);  }  @Override  public String toString() {  return toString(-1);  }  public String toString(int level) {  StringBuilder builder = new StringBuilder();  String prefix = """";  if (level > 0) {  for (int i = 0; i < level; i++) prefix += "" "";  }  builder.append(prefix);  builder.append(selector.toString());  builder.append(style.toString(level + 1));  return builder.toString();  } }",true?((methodResultID!=null&&methodResultID.equals(this.groups))||methodResultID==null):,"[TRUE, QuestionMark, OpeningParenthesis, OpeningParenthesis, MethodResultID, IneqOperator, NULL, LogicalOperator, MethodResultID, Period, MethodName, OpeningParenthesis, This, Period, ClassField, ClosingParenthesis, ClosingParenthesis, LogicalOperator, MethodResultID, EqOperator, NULL, ClosingParenthesis, Colon]",,OpeningParenthesis,


In [19]:
import json

groups = result.groupby(['oracleId', 'oracleSoFar'])

counter = 0

# Iterate over the groups and export each group to JSON
for group_name, group_data in groups:
    # Create a dictionary representation of the group
    group_dict = group_data.to_dict(orient='records')

    # Convert the dictionary to JSON
    json_data = json.dumps(group_dict)

    # Define the file name for the JSON file based on the group name
    file_name = f"next_token_{counter}.json"

    # Export the JSON data to a file
    with open(os.path.join(os.getcwd(),file_name), 'w') as json_file:
        json_file.write(json_data)

    counter += 1

In [16]:
# Convert DataFrame rows to JSON
json_data = result.to_json(orient='records')

# Save JSON data to a file
file_path = 'next_token.json'
with open(os.path.join(os.getcwd(),file_path), 'w') as json_file:
    json_file.write(json_data)

In [99]:
len(df_filtered_oracle_ids)

23421

In [100]:
len(df_random)

20000

In [101]:
df_remaining.shape

(88274, 16)

In [64]:
df_filtered = df_dataset[(df_dataset['tokenClass'] == 'Semicolon') & (df_dataset['oracleSoFar'] == '')]

# Randomly selecting 3000 rows
df_random = df_filtered.sample(n=3000, random_state=42)  # Adjust the random_state as desired

# Removing the other 20000 rows
df_remaining = df_dataset[~((df_dataset['tokenClass'] == 'Semicolon') & (df_dataset['oracleSoFar'] == ''))]

In [73]:
df_remaining_oracle_id = df_remaining["oracleId"].unique()

In [75]:
df_remaining_oracle_id.shape

(26510,)

In [76]:
df_filtered_oracle_id = df_filtered["oracleId"].unique()

In [77]:
df_filtered_oracle_id.shape

(26510,)

In [66]:
df_filtered_oracle_id.head()

3     2900
8     2901
13    2902
18    2903
23    2904
Name: oracleId, dtype: int64

In [67]:
black_list = df_filtered_oracle_id.tolist()


In [68]:
df_filtered_2 = df_dataset[~df_dataset['oracleId'].isin(black_list)]

In [69]:
df_filtered_2.shape

(0, 16)

In [54]:
a = df_filtered_oracle_id.tolist()

In [56]:
a[:100]

[2900,
 2901,
 2902,
 2903,
 2904,
 2905,
 2906,
 2907,
 2908,
 2909,
 2910,
 2911,
 2912,
 2913,
 2914,
 2915,
 2916,
 2917,
 2918,
 2919,
 2920,
 2921,
 2922,
 2923,
 2924,
 2925,
 2926,
 2927,
 2928,
 2929,
 2930,
 2931,
 2932,
 2933,
 2934,
 2935,
 2936,
 2937,
 2938,
 2939,
 2940,
 2941,
 2942,
 2943,
 2944,
 2945,
 2946,
 2947,
 2948,
 2949,
 2950,
 2951,
 2952,
 2953,
 2954,
 2955,
 2956,
 2957,
 2958,
 2959,
 2960,
 2961,
 2962,
 2963,
 2964,
 2965,
 2966,
 2967,
 2968,
 2969,
 2970,
 2971,
 2972,
 2973,
 2974,
 2975,
 2976,
 2977,
 2978,
 2979,
 2980,
 2981,
 2982,
 2983,
 2984,
 2985,
 2986,
 2987,
 2988,
 2989,
 2990,
 2991,
 2992,
 2993,
 2994,
 2995,
 2996,
 2997,
 2998,
 2999]

In [51]:
df_remaining.shape

(162677, 16)

In [14]:
df_filtered.shape

(26510, 16)

In [44]:
df_dataset = pd.concat([df_random, df_remaining])

In [46]:
df_dataset.shape

(165677, 16)

In [6]:
df_dataset.head()

Unnamed: 0,label,oracleId,oracleType,projectName,packageName,className,javadocTag,methodJavadoc,methodSourceCode,classJavadoc,classSourceCode,oracleSoFar,token,tokenClass,tokenInfo
0,False,10700,NORMAL_POST,commons-math3-3.6.1,org.apache.commons.math3.optimization.linear,SimplexTableau,number of constraint with the specified relat...,/**  * Get a count of constraints corresp...,private int getConstraintTypeCounts(final Rela...,/**  * A tableau for use in the Simplex method...,/*  * Licensed to the Apache Software Foundati...,,,MethodArgument,
1,False,10700,NORMAL_POST,commons-math3-3.6.1,org.apache.commons.math3.optimization.linear,SimplexTableau,number of constraint with the specified relat...,/**  * Get a count of constraints corresp...,private int getConstraintTypeCounts(final Rela...,/**  * A tableau for use in the Simplex method...,/*  * Licensed to the Apache Software Foundati...,,,Class,
2,False,10700,NORMAL_POST,commons-math3-3.6.1,org.apache.commons.math3.optimization.linear,SimplexTableau,number of constraint with the specified relat...,/**  * Get a count of constraints corresp...,private int getConstraintTypeCounts(final Rela...,/**  * A tableau for use in the Simplex method...,/*  * Licensed to the Apache Software Foundati...,,,This,
3,False,10700,NORMAL_POST,commons-math3-3.6.1,org.apache.commons.math3.optimization.linear,SimplexTableau,number of constraint with the specified relat...,/**  * Get a count of constraints corresp...,private int getConstraintTypeCounts(final Rela...,/**  * A tableau for use in the Simplex method...,/*  * Licensed to the Apache Software Foundati...,,,TRUE,
4,True,10700,NORMAL_POST,commons-math3-3.6.1,org.apache.commons.math3.optimization.linear,SimplexTableau,number of constraint with the specified relat...,/**  * Get a count of constraints corresp...,private int getConstraintTypeCounts(final Rela...,/**  * A tableau for use in the Simplex method...,/*  * Licensed to the Apache Software Foundati...,,,Semicolon,


In [39]:
import random

seed_value = 2  # Use a different seed value for each iteration
random.seed(seed_value)
initial_columns = ['oracleSoFar', 'tokenClassesSoFar', 'javadocTag', 'oracleType',
        'packageName', 'className', 'methodJavadoc', 'methodSourceCode']
shuffled_columns = random.sample(initial_columns, len(initial_columns))
shuffled_columns.insert(0, 'tokenClass')
shuffled_columns.extend(['label','projectName','oracleId','classJavadoc','classSourceCode'])
df_shuffled = df_dataset[shuffled_columns]

In [40]:
df_shuffled.head()

Unnamed: 0,tokenClass,oracleSoFar,methodSourceCode,methodJavadoc,javadocTag,tokenClassesSoFar,packageName,oracleType,className,label,projectName,oracleId,classJavadoc,classSourceCode
0,MethodArgument,,public void setOrderControl(final int maximalO...,/** Set the order control parameters.  * <p...,@param maximalOrder maximal order in the extra...,[],org.apache.commons.math3.ode.nonstiff,PRE,GraggBulirschStoerIntegrator,False,commons-math3-3.6.1,2900,/**  * This class implements a Gragg-Bulirsch...,/*  * Licensed to the Apache Software Foundati...
1,Class,,public void setOrderControl(final int maximalO...,/** Set the order control parameters.  * <p...,@param maximalOrder maximal order in the extra...,[],org.apache.commons.math3.ode.nonstiff,PRE,GraggBulirschStoerIntegrator,False,commons-math3-3.6.1,2900,/**  * This class implements a Gragg-Bulirsch...,/*  * Licensed to the Apache Software Foundati...
2,This,,public void setOrderControl(final int maximalO...,/** Set the order control parameters.  * <p...,@param maximalOrder maximal order in the extra...,[],org.apache.commons.math3.ode.nonstiff,PRE,GraggBulirschStoerIntegrator,False,commons-math3-3.6.1,2900,/**  * This class implements a Gragg-Bulirsch...,/*  * Licensed to the Apache Software Foundati...
3,Semicolon,,public void setOrderControl(final int maximalO...,/** Set the order control parameters.  * <p...,@param maximalOrder maximal order in the extra...,[],org.apache.commons.math3.ode.nonstiff,PRE,GraggBulirschStoerIntegrator,True,commons-math3-3.6.1,2900,/**  * This class implements a Gragg-Bulirsch...,/*  * Licensed to the Apache Software Foundati...
4,OpeningParenthesis,,public void setOrderControl(final int maximalO...,/** Set the order control parameters.  * <p...,@param maximalOrder maximal order in the extra...,[],org.apache.commons.math3.ode.nonstiff,PRE,GraggBulirschStoerIntegrator,False,commons-math3-3.6.1,2900,/**  * This class implements a Gragg-Bulirsch...,/*  * Licensed to the Apache Software Foundati...


In [33]:
for i in range(10):
    seed_value = i + 1  # Use a different seed value for each iteration
    random.seed(seed_value)
    initial_columns = ['oracleSoFar', 'tokenClassesSoFar', 'javadocTag', 'oracleType',
            'packageName', 'className', 'methodJavadoc', 'methodSourceCode']
    shuffled_columns = random.sample(initial_columns, len(initial_columns))
    shuffled_columns.insert(0, 'tokenClass')
    shuffled_columns.extend(['label','projectName','oracleId','classJavadoc','classSourceCode'])
    print(shuffled_columns)

['tokenClass', 'javadocTag', 'packageName', 'oracleSoFar', 'methodSourceCode', 'className', 'tokenClassesSoFar', 'methodJavadoc', 'oracleType', 'label', 'projectName', 'oracleId', 'classJavadoc', 'classSourceCode']
['tokenClass', 'oracleSoFar', 'methodSourceCode', 'methodJavadoc', 'javadocTag', 'tokenClassesSoFar', 'packageName', 'oracleType', 'className', 'label', 'projectName', 'oracleId', 'classJavadoc', 'classSourceCode']
['tokenClass', 'oracleType', 'packageName', 'methodJavadoc', 'tokenClassesSoFar', 'javadocTag', 'methodSourceCode', 'className', 'oracleSoFar', 'label', 'projectName', 'oracleId', 'classJavadoc', 'classSourceCode']
['tokenClass', 'oracleType', 'javadocTag', 'oracleSoFar', 'methodSourceCode', 'packageName', 'className', 'methodJavadoc', 'tokenClassesSoFar', 'label', 'projectName', 'oracleId', 'classJavadoc', 'classSourceCode']
['tokenClass', 'packageName', 'className', 'javadocTag', 'methodSourceCode', 'oracleSoFar', 'tokenClassesSoFar', 'oracleType', 'methodJavado

In [30]:
df_shuffled.head()

Unnamed: 0,tokenClass,tokenClassesSoFar,methodSourceCode,className,oracleSoFar,packageName,methodJavadoc,javadocTag,oracleType
0,MethodArgument,[],public void setOrderControl(final int maximalO...,GraggBulirschStoerIntegrator,,org.apache.commons.math3.ode.nonstiff,/** Set the order control parameters.  * <p...,@param maximalOrder maximal order in the extra...,PRE
1,Class,[],public void setOrderControl(final int maximalO...,GraggBulirschStoerIntegrator,,org.apache.commons.math3.ode.nonstiff,/** Set the order control parameters.  * <p...,@param maximalOrder maximal order in the extra...,PRE
2,This,[],public void setOrderControl(final int maximalO...,GraggBulirschStoerIntegrator,,org.apache.commons.math3.ode.nonstiff,/** Set the order control parameters.  * <p...,@param maximalOrder maximal order in the extra...,PRE
3,Semicolon,[],public void setOrderControl(final int maximalO...,GraggBulirschStoerIntegrator,,org.apache.commons.math3.ode.nonstiff,/** Set the order control parameters.  * <p...,@param maximalOrder maximal order in the extra...,PRE
4,OpeningParenthesis,[],public void setOrderControl(final int maximalO...,GraggBulirschStoerIntegrator,,org.apache.commons.math3.ode.nonstiff,/** Set the order control parameters.  * <p...,@param maximalOrder maximal order in the extra...,PRE


In [18]:
df_shuffled.head()

Unnamed: 0,tokenClass,oracleSoFar,tokenClassesSoFar,javadocTag,oracleType,packageName,className,methodJavadoc,methodSourceCode
0,TRUE,,[],the hash code value for this map,PRE,org.apache.commons.collections4.comparators,FixedOrderComparator,\t/**\n * Find the greatest node from a gi...,protected Iterator<V> createValuesIterator(fin...
1,Semicolon,,[],@param node the node from which we will start ...,NORMAL_POST,org.apache.commons.collections4.bidimap,TreeBidiMap,\t/**\n * Returns a string version of this...,"private Node<K, V> nextSmaller(final Node<K, V..."
2,Class,,[],@param data the key or value to be looked up,NORMAL_POST,org.apache.commons.collections4.bidimap,TreeBidiMap,\t/**\n * Returns a set view of the keys c...,public boolean isEmpty(){\n return nodeCoun...
3,MethodArgument,,[],@throws ClassCastException if the value is of ...,NORMAL_POST,org.apache.commons.collections4.bidimap,TreeBidiMap,\t/**\n * Returns <code>true</code> iff <i...,private static <T extends Comparable<T>> int c...
4,TRUE,,[],@param key the key to search for previous from,NORMAL_POST,org.apache.commons.collections4.bidimap,TreeBidiMap,\t/**\n * Find the greatest node from a gi...,"private Node<K, V> greatestNode(final Node<K, ..."


In [7]:
unique_values = df_dataset['tokenClass'].unique()

In [8]:
unique_values

<StringArray>
[        'MethodArgument',                  'Class',                   'This',
                   'TRUE',              'Semicolon',     'OpeningParenthesis',
            'ArraysClass',           'IneqOperator',             'EqOperator',
                 'Period',     'InstanceOfOperator',                   'NULL',
        'LogicalOperator',      'NonEqIneqOperator',                  'S_INT',
  'BitwiseNegateOperator',           'QuestionMark',         'MethodResultID',
             'MethodName',     'ClosingParenthesis',                  'Colon',
                  'FALSE',           'StreamMethod',                'BOOLEAN',
                  'Comma',             'ClassField', 'BitwiseLogicalOperator',
   'ArithmeticalOperator',   'BitwiseShiftOperator',                 'DOUBLE',
            'MatchMethod',         'MatchMethodVar',             'RightArrow',
          'ClassModifier',               'S_STRING']
Length: 35, dtype: string

In [9]:
df_dataset.columns

Index(['label', 'oracleId', 'oracleType', 'projectName', 'packageName',
       'className', 'javadocTag', 'methodJavadoc', 'methodSourceCode',
       'classJavadoc', 'classSourceCode', 'oracleSoFar', 'token', 'tokenClass',
       'tokenInfo'],
      dtype='object')

In [10]:
# delete the oracle ids and the tgt labels from the input dataset
df_src = df_dataset.drop(['label','oracleId','projectName','classSourceCode','classJavadoc','token','tokenInfo'], axis=1)

In [11]:
df_src.shape

(189187, 8)

In [12]:
%%capture
tokenizer = RobertaTokenizer.from_pretrained('roberta-base', do_lower_case=False) #AutoTokenizer.from_pretrained("microsoft/codebert-base", model_max_length=512)

In [23]:
df_src_concat = df_src.apply(lambda row: tokenizer.tokenize(tokenizer.cls_token.join(row.values)), axis=1)
# The pandas dataframe is transformed in a list of strings: each string is a input
# to the model
src = df_src_concat.to_numpy().tolist()

#max_len = reduce(lambda max_len, s: len(s) if len(s) > max_len else max_len, src,0) + 2

In [28]:
%%capture
tokenizer_2 = AutoTokenizer.from_pretrained("microsoft/codebert-base", model_max_length=512)


In [30]:
df_src_concat_2 = df_src.apply(lambda row: tokenizer_2.tokenize(tokenizer.cls_token.join(row.values)), axis=1)
# The pandas dataframe is transformed in a list of strings: each string is a input
# to the model
src_2 = df_src_concat.to_numpy().tolist()

#max_len = reduce(lambda max_len, s: len(s) if len(s) > max_len else max_len, src,0) + 2

Token indices sequence length is longer than the specified maximum sequence length for this model (653 > 512). Running this sequence through the model will result in indexing errors


In [14]:
unique_values = df_dataset['tokenClass'].unique()

In [15]:
unique_values

<StringArray>
[        'MethodArgument',                  'Class',                   'This',
                   'TRUE',              'Semicolon',     'OpeningParenthesis',
            'ArraysClass',           'IneqOperator',             'EqOperator',
                 'Period',     'InstanceOfOperator',                   'NULL',
        'LogicalOperator',      'NonEqIneqOperator',                  'S_INT',
  'BitwiseNegateOperator',           'QuestionMark',         'MethodResultID',
             'MethodName',     'ClosingParenthesis',                  'Colon',
                  'FALSE',           'StreamMethod',                'BOOLEAN',
                  'Comma',             'ClassField', 'BitwiseLogicalOperator',
   'ArithmeticalOperator',   'BitwiseShiftOperator',                 'DOUBLE',
            'MatchMethod',         'MatchMethodVar',             'RightArrow',
          'ClassModifier',               'S_STRING']
Length: 35, dtype: string

In [17]:
df_src.head()

Unnamed: 0,oracleType,packageName,className,javadocTag,methodJavadoc,methodSourceCode,oracleSoFar,tokenClass
0,NORMAL_POST,org.apache.commons.math3.optimization.linear,SimplexTableau,number of constraint with the specified relat...,/**  * Get a count of constraints corresp...,private int getConstraintTypeCounts(final Rela...,,MethodArgument
1,NORMAL_POST,org.apache.commons.math3.optimization.linear,SimplexTableau,number of constraint with the specified relat...,/**  * Get a count of constraints corresp...,private int getConstraintTypeCounts(final Rela...,,Class
2,NORMAL_POST,org.apache.commons.math3.optimization.linear,SimplexTableau,number of constraint with the specified relat...,/**  * Get a count of constraints corresp...,private int getConstraintTypeCounts(final Rela...,,This
3,NORMAL_POST,org.apache.commons.math3.optimization.linear,SimplexTableau,number of constraint with the specified relat...,/**  * Get a count of constraints corresp...,private int getConstraintTypeCounts(final Rela...,,TRUE
4,NORMAL_POST,org.apache.commons.math3.optimization.linear,SimplexTableau,number of constraint with the specified relat...,/**  * Get a count of constraints corresp...,private int getConstraintTypeCounts(final Rela...,,Semicolon


In [31]:
max(src)

['PRE',
 '<s>',
 'pl',
 'ume',
 '<s>',
 'Weak',
 'Id',
 'entity',
 'P',
 'air',
 '<s>',
 '@',
 'param',
 'Ġb',
 'Ġsecond',
 'Ġargument',
 '<s>',
 'ĉ',
 '/**',
 'ĠFactory',
 'Ġmethod',
 'Ġwith',
 'Ġshort',
 'Ġname',
 'Ġand',
 'Ġno',
 'Ġneed',
 'Ġto',
 'Ġname',
 'Ġtype',
 'Ġparameters',
 '.',
 'Ċ',
 'Ġ',
 'Ġ',
 'Ġ*',
 'Ġ@',
 'param',
 'Ġ<',
 'A',
 '>',
 'Ġtype',
 'Ġof',
 'Ġfirst',
 'Ġargument',
 'Ċ',
 'Ġ',
 'Ġ',
 'Ġ*',
 'Ġ@',
 'param',
 'Ġ<',
 'B',
 '>',
 'Ġtype',
 'Ġof',
 'Ġsecond',
 'Ġargument',
 'Ċ',
 'Ġ',
 'Ġ',
 'Ġ*',
 'Ġ@',
 'param',
 'Ġa',
 'Ġfirst',
 'Ġargument',
 'Ċ',
 'Ġ',
 'Ġ',
 'Ġ*',
 'Ġ@',
 'param',
 'Ġb',
 'Ġsecond',
 'Ġargument',
 'Ċ',
 'Ġ',
 'Ġ',
 'Ġ*',
 'Ġ@',
 'return',
 'Ġa',
 'ĠWeak',
 'Id',
 'entity',
 'P',
 'air',
 'Ġof',
 'Ġ(',
 'a',
 ',',
 'Ġb',
 ')',
 'Ċ',
 'Ġ',
 'Ġ',
 'Ġ*/',
 '<s>',
 'public',
 'Ġstatic',
 'Ġ<',
 'A',
 'Ġextends',
 'ĠObject',
 ',',
 'ĠB',
 'Ġextends',
 'ĠObject',
 '>',
 'ĠWeak',
 'Id',
 'entity',
 'P',
 'air',
 '<',
 'A',
 ',',
 'ĠB',
 '>',
 'Ġof'

In [32]:
# Assuming you have a list of strings named 'string_list'
max_length_element = max(src_2, key=lambda x: len(x))

# If you want to get the index of the element instead, you can use argmax()
max_length_index = np.argmax([len(x) for x in src_2])

In [33]:
max_length_element

['NOR',
 'MAL',
 '_',
 'POST',
 '<s>',
 'org',
 '.',
 'apache',
 '.',
 'comm',
 'ons',
 '.',
 'math',
 '3',
 '.',
 'optim',
 '.',
 'non',
 'linear',
 '.',
 'sc',
 'al',
 'ar',
 '.',
 'n',
 'oder',
 'iv',
 '<s>',
 'BO',
 'BY',
 'Q',
 'A',
 'Opt',
 'im',
 'izer',
 '<s>',
 'Ġthe',
 'Ġvalue',
 'Ġof',
 'Ġthe',
 'Ġobjective',
 'Ġat',
 'Ġthe',
 'Ġoptimum',
 '.',
 '<s>',
 'ĉ',
 '/**',
 'Ċ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ*',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'ĠThe',
 'Ġarguments',
 'ĠN',
 ',',
 'ĠN',
 'PT',
 ',',
 'ĠX',
 ',',
 'ĠXL',
 ',',
 'ĠX',
 'U',
 ',',
 'ĠRH',
 'OB',
 'EG',
 ',',
 'ĠR',
 'HO',
 'END',
 ',',
 'ĠI',
 'PR',
 'INT',
 'Ġand',
 'ĠMAX',
 'FUN',
 'Ċ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ*',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġare',
 'Ġidentical',
 'Ġto',
 'Ġthe',
 'Ġcorresponding',
 'Ġarguments',
 'Ġin',
 'ĠSUB',
 'R',
 'OUT',
 'INE',
 'ĠB',
 'OB',
 'Y',
 'Q',
 'A',
 '.',
 'Ċ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ*',
 'Ġ',
 'Ġ',
 'Ġ',
 'Ġ',
 'ĠX',
 'B',
 'ASE',
 'Ġholds',
 'Ġa',
 'Ġshift',
 'Ġof',
 'Ġorigi

In [34]:
len(max_length_element)

28619

In [22]:
b = []

b.append(max_length_element)

a = tokenizer(
    b,
    truncation=False
)

ValueError: too many values to unpack (expected 2)

In [None]:
len(a["input_ids"][0])

268

In [None]:
# Assuming you have a string named 'my_string'
numerical_representation = [ord(char) for char in my_string]  # Convert string to a list of ASCII values

# Create a PyTorch tensor from the numerical representation
tensor = torch.tensor(numerical_representation)

TypeError: new(): invalid data type 'str'

In [None]:
type(tokenizer.tokenize(max_length_element)[0])

str

In [None]:
max_len

268

In [35]:
string_lengths = np.array([len(sublist) + 2 for sublist in src ])

max_length = np.max(string_lengths)
mean_length = np.mean(string_lengths)
median_length = np.median(string_lengths)

In [36]:
max_length

28621

In [37]:
mean_length

391.1331011115986

In [38]:
median_length

292.0

In [39]:
filtered_strings = [string for string in src if len(string) < 512]

In [40]:
len(src)

189187

In [41]:
len(filtered_strings)

155578

In [42]:
len(df_src["tokenClass"].value_counts())

35

In [19]:
df_dataset["tokenClass"].value_counts()#/len(df_dataset["tokenClass"]) * 100

Class                     32428
Semicolon                 31038
MethodArgument            29226
OpeningParenthesis        29003
This                      14980
TRUE                      12001
ArraysClass                6109
LogicalOperator            5729
EqOperator                 5149
IneqOperator               4098
Period                     3778
InstanceOfOperator         2651
NULL                       2358
S_INT                      2212
ClosingParenthesis         1368
NonEqIneqOperator          1184
MethodResultID              961
DOUBLE                      887
BitwiseNegateOperator       841
FALSE                       828
MethodName                  523
Colon                       386
QuestionMark                296
ArithmeticalOperator        253
ClassField                  239
BitwiseLogicalOperator      186
BitwiseShiftOperator        186
BOOLEAN                     138
StreamMethod                 52
MatchMethodVar               42
MatchMethod                  14
RightArr

In [17]:
np.bincount(df_src["tokenClass"])

ValueError: invalid literal for int() with base 10: 'MethodArgument'

In [14]:
# Get unique values from the column
unique_values = df_dataset['tokenClass'].unique()

# Create a dictionary to map string values to their corresponding vector
mapping = {}
for i, value in enumerate(unique_values):
    vector = np.zeros(len(unique_values))
    vector[i] = 1.0
    mapping[value] = list(vector)

# Add a new column to the original DataFrame with the mapped vectors
df_dataset['tokenClassVectorized'] = df_dataset['tokenClass'].map(mapping)

In [15]:
df_dataset.head()

Unnamed: 0,label,oracleId,oracleType,projectName,packageName,className,javadocTag,methodJavadoc,methodSourceCode,classJavadoc,classSourceCode,oracleSoFar,token,tokenClass,tokenInfo,tokenClassVectorized
0,False,15264,PRE,plume-lib-1.1.0,plume,Lookup,@param reader where to read the entry from,/**  * Returns the next entry. If no more ...,public static Entry old_get_entry(EntryReader ...,"/**  * Lookup searches a set of files, much li...",package plume; import static plume.EntryReade...,,,MethodArgument,,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
1,False,15264,PRE,plume-lib-1.1.0,plume,Lookup,@param reader where to read the entry from,/**  * Returns the next entry. If no more ...,public static Entry old_get_entry(EntryReader ...,"/**  * Lookup searches a set of files, much li...",package plume; import static plume.EntryReade...,,,Class,,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
2,True,15264,PRE,plume-lib-1.1.0,plume,Lookup,@param reader where to read the entry from,/**  * Returns the next entry. If no more ...,public static Entry old_get_entry(EntryReader ...,"/**  * Lookup searches a set of files, much li...",package plume; import static plume.EntryReade...,,,Semicolon,,"[0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
3,False,15264,PRE,plume-lib-1.1.0,plume,Lookup,@param reader where to read the entry from,/**  * Returns the next entry. If no more ...,public static Entry old_get_entry(EntryReader ...,"/**  * Lookup searches a set of files, much li...",package plume; import static plume.EntryReade...,,,OpeningParenthesis,,"[0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."
4,False,15265,NORMAL_POST,plume-lib-1.1.0,plume,Lookup,"the next entry, or null",/**  * Returns the next entry. If no more ...,public static Entry old_get_entry(EntryReader ...,"/**  * Lookup searches a set of files, much li...",package plume; import static plume.EntryReade...,,,MethodArgument,,"[1.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ..."


In [18]:
# Convert string labels to integer labels
unique_labels, integer_labels = np.unique(df_dataset["tokenClass"], return_inverse=True)

# Count the occurrences of each class
class_counts = np.bincount(integer_labels)

# Calculate the inverse class frequencies
total_samples = np.sum(class_counts)
class_frequencies = class_counts / total_samples
class_weights = 1.0 / class_frequencies

# Normalize the class weights
class_weights /= np.sum(class_weights)

class_weights.sum()

1.0000000000000002

In [25]:
np.unique(df_dataset["tokenClass"], return_counts=True)

(array(['ArithmeticalOperator', 'ArraysClass', 'BOOLEAN',
        'BitwiseLogicalOperator', 'BitwiseNegateOperator',
        'BitwiseShiftOperator', 'Class', 'ClassField', 'ClassModifier',
        'ClosingParenthesis', 'Colon', 'Comma', 'DOUBLE', 'EqOperator',
        'FALSE', 'IneqOperator', 'InstanceOfOperator', 'LogicalOperator',
        'MatchMethod', 'MatchMethodVar', 'MethodArgument', 'MethodName',
        'MethodResultID', 'NULL', 'NonEqIneqOperator',
        'OpeningParenthesis', 'Period', 'QuestionMark', 'RightArrow',
        'S_INT', 'S_STRING', 'Semicolon', 'StreamMethod', 'TRUE', 'This'],
       dtype=object),
 array([  253,  6109,   138,   186,   841,   186, 32428,   239,     7,
         1368,   386,     9,   887,  5149,   828,  4098,  2651,  5729,
           14,    42, 29226,   523,   961,  2358,  1184, 29003,  3778,
          296,    14,  2212,    13, 31038,    52, 12001, 14980]))

In [22]:
len(unique_labels)

35

In [11]:
a = df_dataset[df_dataset["tokenClass"]=="Class"]

In [14]:
a

Unnamed: 0,label,oracleId,oracleType,projectName,packageName,className,javadocTag,methodJavadoc,methodSourceCode,classJavadoc,classSourceCode,oracleSoFar,token,tokenClass,tokenInfo
143,False,813,EXCEPT_POST,commons-math3-3.6.1,org.apache.commons.math3.linear,MatrixUtils,@throws org.apache.commons.math3.linear.NonSym...,/**  * Checks whether a matrix is symmetr...,public static void checkSymmetric(RealMatrix m...,/**  * A collection of static methods that ope...,/*  * Licensed to the Apache Software Foundati...,(,RealFieldElement,Class,"['org.apache.commons.math3', 'RealFieldElement']"
144,False,813,EXCEPT_POST,commons-math3-3.6.1,org.apache.commons.math3.linear,MatrixUtils,@throws org.apache.commons.math3.linear.NonSym...,/**  * Checks whether a matrix is symmetr...,public static void checkSymmetric(RealMatrix m...,/**  * A collection of static methods that ope...,/*  * Licensed to the Apache Software Foundati...,(,Primes,Class,"['org.apache.commons.math3.primes', 'Primes']"
145,False,813,EXCEPT_POST,commons-math3-3.6.1,org.apache.commons.math3.linear,MatrixUtils,@throws org.apache.commons.math3.linear.NonSym...,/**  * Checks whether a matrix is symmetr...,public static void checkSymmetric(RealMatrix m...,/**  * A collection of static methods that ope...,/*  * Licensed to the Apache Software Foundati...,(,SmallPrimes,Class,"['org.apache.commons.math3.primes', 'SmallPrim..."
146,False,813,EXCEPT_POST,commons-math3-3.6.1,org.apache.commons.math3.linear,MatrixUtils,@throws org.apache.commons.math3.linear.NonSym...,/**  * Checks whether a matrix is symmetr...,public static void checkSymmetric(RealMatrix m...,/**  * A collection of static methods that ope...,/*  * Licensed to the Apache Software Foundati...,(,PollardRho,Class,"['org.apache.commons.math3.primes', 'PollardRho']"
147,False,813,EXCEPT_POST,commons-math3-3.6.1,org.apache.commons.math3.linear,MatrixUtils,@throws org.apache.commons.math3.linear.NonSym...,/**  * Checks whether a matrix is symmetr...,public static void checkSymmetric(RealMatrix m...,/**  * A collection of static methods that ope...,/*  * Licensed to the Apache Software Foundati...,(,CurveFitter,Class,"['org.apache.commons.math3.fitting', 'CurveFit..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1462,False,22109,NORMAL_POST,commons-collections4-4.1,org.apache.commons.collections4,BagUtils,an empty sorted Bag.,/**  * Get an empty <code>SortedBag</code...,public static <E> SortedBag<E> emptySortedBag(...,/**  * Provides utility methods and decorators...,/*  * Licensed to the Apache Software Foundati...,true?methodResultID.equals(,AbstractUntypedIteratorDecorator,Class,"['org.apache.commons.collections4.iterators', ..."
1463,False,22109,NORMAL_POST,commons-collections4-4.1,org.apache.commons.collections4,BagUtils,an empty sorted Bag.,/**  * Get an empty <code>SortedBag</code...,public static <E> SortedBag<E> emptySortedBag(...,/**  * Provides utility methods and decorators...,/*  * Licensed to the Apache Software Foundati...,true?methodResultID.equals(,AbstractOrderedMapIteratorDecorator,Class,"['org.apache.commons.collections4.iterators', ..."
1464,False,22109,NORMAL_POST,commons-collections4-4.1,org.apache.commons.collections4,BagUtils,an empty sorted Bag.,/**  * Get an empty <code>SortedBag</code...,public static <E> SortedBag<E> emptySortedBag(...,/**  * Provides utility methods and decorators...,/*  * Licensed to the Apache Software Foundati...,true?methodResultID.equals(,FilterListIterator,Class,"['org.apache.commons.collections4.iterators', ..."
1465,False,22109,NORMAL_POST,commons-collections4-4.1,org.apache.commons.collections4,BagUtils,an empty sorted Bag.,/**  * Get an empty <code>SortedBag</code...,public static <E> SortedBag<E> emptySortedBag(...,/**  * Provides utility methods and decorators...,/*  * Licensed to the Apache Software Foundati...,true?methodResultID.equals(,SplitMapUtils,Class,"['org.apache.commons.collections4', 'SplitMapU..."


In [14]:
tokenClassDict = {k: i for i, k in enumerate(df_dataset["tokenClass"].unique())}

In [28]:
tokenClassDict

{'MethodArgument': 0,
 'Class': 1,
 'This': 2,
 'TRUE': 3,
 'Semicolon': 4,
 'OpeningParenthesis': 5,
 'ArraysClass': 6,
 'IneqOperator': 7,
 'EqOperator': 8,
 'Period': 9,
 'InstanceOfOperator': 10,
 'NULL': 11,
 'LogicalOperator': 12,
 'NonEqIneqOperator': 13,
 'S_INT': 14,
 'BitwiseNegateOperator': 15,
 'QuestionMark': 16,
 'MethodResultID': 17,
 'MethodName': 18,
 'ClosingParenthesis': 19,
 'Colon': 20,
 'FALSE': 21,
 'StreamMethod': 22,
 'BOOLEAN': 23,
 'Comma': 24,
 'ClassField': 25,
 'BitwiseLogicalOperator': 26,
 'ArithmeticalOperator': 27,
 'BitwiseShiftOperator': 28,
 'DOUBLE': 29,
 'MatchMethod': 30,
 'MatchMethodVar': 31,
 'RightArrow': 32,
 'ClassModifier': 33,
 'S_STRING': 34}

In [46]:
df_dataset['tokenClass'] = df_dataset['tokenClass'].apply(lambda x: tokenClassDict[x])

In [47]:
df_dataset.head()

Unnamed: 0,label,oracleId,oracleType,projectName,packageName,className,javadocTag,methodJavadoc,methodSourceCode,classJavadoc,classSourceCode,oracleSoFar,token,tokenClass,tokenInfo
0,False,10700,NORMAL_POST,commons-math3-3.6.1,org.apache.commons.math3.optimization.linear,SimplexTableau,number of constraint with the specified relat...,/**  * Get a count of constraints corresp...,private int getConstraintTypeCounts(final Rela...,/**  * A tableau for use in the Simplex method...,/*  * Licensed to the Apache Software Foundati...,,,0,
1,False,10700,NORMAL_POST,commons-math3-3.6.1,org.apache.commons.math3.optimization.linear,SimplexTableau,number of constraint with the specified relat...,/**  * Get a count of constraints corresp...,private int getConstraintTypeCounts(final Rela...,/**  * A tableau for use in the Simplex method...,/*  * Licensed to the Apache Software Foundati...,,,1,
2,False,10700,NORMAL_POST,commons-math3-3.6.1,org.apache.commons.math3.optimization.linear,SimplexTableau,number of constraint with the specified relat...,/**  * Get a count of constraints corresp...,private int getConstraintTypeCounts(final Rela...,/**  * A tableau for use in the Simplex method...,/*  * Licensed to the Apache Software Foundati...,,,2,
3,False,10700,NORMAL_POST,commons-math3-3.6.1,org.apache.commons.math3.optimization.linear,SimplexTableau,number of constraint with the specified relat...,/**  * Get a count of constraints corresp...,private int getConstraintTypeCounts(final Rela...,/**  * A tableau for use in the Simplex method...,/*  * Licensed to the Apache Software Foundati...,,,3,
4,True,10700,NORMAL_POST,commons-math3-3.6.1,org.apache.commons.math3.optimization.linear,SimplexTableau,number of constraint with the specified relat...,/**  * Get a count of constraints corresp...,private int getConstraintTypeCounts(final Rela...,/**  * A tableau for use in the Simplex method...,/*  * Licensed to the Apache Software Foundati...,,,4,


In [20]:
# get the list of column names
cols = list(df_dataset)

a = df_dataset.groupby(['oracleId', 'oracleSoFar'])



In [25]:
b = a["tokenClass"].apply(list)

In [31]:
b.head()

oracleId  oracleSoFar
0                        [MethodArgument, Class, TRUE, Semicolon, Openi...
          (                    [MethodArgument, Class, OpeningParenthesis]
          ((                   [MethodArgument, Class, OpeningParenthesis]
          ((start        [IneqOperator, EqOperator, Period, InstanceOfO...
          ((start==                          [MethodArgument, Class, NULL]
Name: tokenClass, dtype: object

In [50]:
cols

['label',
 'oracleId',
 'oracleType',
 'projectName',
 'packageName',
 'className',
 'javadocTag',
 'methodJavadoc',
 'methodSourceCode',
 'classJavadoc',
 'classSourceCode',
 'oracleSoFar',
 'token',
 'tokenClass',
 'tokenInfo']

In [16]:
a = df_dataset.groupby(['oracleId', 'oracleSoFar'])

In [17]:
a.head()

Unnamed: 0,label,oracleId,oracleType,projectName,packageName,className,javadocTag,methodJavadoc,methodSourceCode,classJavadoc,classSourceCode,oracleSoFar,token,tokenClass,tokenInfo
0,False,10700,NORMAL_POST,commons-math3-3.6.1,org.apache.commons.math3.optimization.linear,SimplexTableau,number of constraint with the specified relat...,/**  * Get a count of constraints corresp...,private int getConstraintTypeCounts(final Rela...,/**  * A tableau for use in the Simplex method...,/*  * Licensed to the Apache Software Foundati...,,,MethodArgument,
1,False,10700,NORMAL_POST,commons-math3-3.6.1,org.apache.commons.math3.optimization.linear,SimplexTableau,number of constraint with the specified relat...,/**  * Get a count of constraints corresp...,private int getConstraintTypeCounts(final Rela...,/**  * A tableau for use in the Simplex method...,/*  * Licensed to the Apache Software Foundati...,,,Class,
2,False,10700,NORMAL_POST,commons-math3-3.6.1,org.apache.commons.math3.optimization.linear,SimplexTableau,number of constraint with the specified relat...,/**  * Get a count of constraints corresp...,private int getConstraintTypeCounts(final Rela...,/**  * A tableau for use in the Simplex method...,/*  * Licensed to the Apache Software Foundati...,,,This,
3,False,10700,NORMAL_POST,commons-math3-3.6.1,org.apache.commons.math3.optimization.linear,SimplexTableau,number of constraint with the specified relat...,/**  * Get a count of constraints corresp...,private int getConstraintTypeCounts(final Rela...,/**  * A tableau for use in the Simplex method...,/*  * Licensed to the Apache Software Foundati...,,,TRUE,
4,True,10700,NORMAL_POST,commons-math3-3.6.1,org.apache.commons.math3.optimization.linear,SimplexTableau,number of constraint with the specified relat...,/**  * Get a count of constraints corresp...,private int getConstraintTypeCounts(final Rela...,/**  * A tableau for use in the Simplex method...,/*  * Licensed to the Apache Software Foundati...,,,Semicolon,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
481,False,2799,PRE,commons-math3-3.6.1,org.apache.commons.math3.analysis.polynomials,PolynomialsUtils,@param shift Shift value.,/**  * Compute the coefficients of the po...,public static double[] shift(final double[] co...,/**  * A collection of static methods that ope...,/*  * Licensed to the Apache Software Foundati...,,,MethodArgument,
482,False,2799,PRE,commons-math3-3.6.1,org.apache.commons.math3.analysis.polynomials,PolynomialsUtils,@param shift Shift value.,/**  * Compute the coefficients of the po...,public static double[] shift(final double[] co...,/**  * A collection of static methods that ope...,/*  * Licensed to the Apache Software Foundati...,,,Class,
483,True,2799,PRE,commons-math3-3.6.1,org.apache.commons.math3.analysis.polynomials,PolynomialsUtils,@param shift Shift value.,/**  * Compute the coefficients of the po...,public static double[] shift(final double[] co...,/**  * A collection of static methods that ope...,/*  * Licensed to the Apache Software Foundati...,,,Semicolon,
484,False,2799,PRE,commons-math3-3.6.1,org.apache.commons.math3.analysis.polynomials,PolynomialsUtils,@param shift Shift value.,/**  * Compute the coefficients of the po...,public static double[] shift(final double[] co...,/**  * A collection of static methods that ope...,/*  * Licensed to the Apache Software Foundati...,,,OpeningParenthesis,


In [18]:
df = pd.DataFrame({
    'A': ['foo', 'foo', 'bar', 'bar', 'foo', 'foo'],
    'B': ['one', 'one', 'one', 'two', 'two', 'one'],
    'C': [1, 2, 3, 4, 5, 6]
})

# group the DataFrame by columns A and B, and aggregate the values in column C into a list
df['D'] = df.groupby(['A', 'B'])['C'].apply(list)

TypeError: incompatible index of inserted column with frame index

In [20]:
import pandas as pd

# create a sample DataFrame
df = pd.DataFrame({
    'A': ['foo', 'foo', 'bar', 'bar', 'foo', 'foo'],
    'B': ['one', 'one', 'one', 'two', 'two', 'one'],
    'C': [1, 2, 3, 4, 5, 6]
})

# group the DataFrame by columns A and B, and get the unique values in column C
df['D'] = df.groupby(['A', 'B'])['C'].unique().apply(list)

# print the result
print(df)

TypeError: incompatible index of inserted column with frame index

In [41]:
import pandas as pd
# Create a sample DataFrame
data = {'A': ['X', 'X', 'Y', 'Y', 'Z'],
        'B': [1, 1, 1, 2, 1],
        'C': [10, 20, 30, 40, 50]}
df = pd.DataFrame(data)

# Create column D with lists of different values from column C
unique_C = df.groupby(['A', 'B'])['C'].unique()
pd.merge(df, unique_C, on=["A","B"])

In [43]:
pd.merge(df, unique_C, on=["A","B"])

Unnamed: 0,A,B,C_x,C_y
0,X,1,10,"[10, 20]"
1,X,1,20,"[10, 20]"
2,Y,1,30,[30]
3,Y,2,40,[40]
4,Z,1,50,[50]


In [9]:
df.head()

Unnamed: 0,A,B,C,D
0,X,1,10,"((X, 1), [10])"
1,X,2,20,"((X, 2), [20])"
2,Y,1,30,"((Y, 1), [30])"
3,Y,2,40,"((Y, 2), [40])"
4,Z,1,50,"((Z, 1), [50])"


In [14]:
df = df.astype({
    'D': 'string'
})

df

In [62]:
import pandas as pd
# Create a sample DataFrame
data = {'A': ['X', 'X', 'Y', 'Y', 'Z'],
        'B': [1, 2, 1, 2, 1],
        'C': [10, 20, 30, 40, 50]}
df = pd.DataFrame(data)

# Create column D with lists of different values from column C
df_dataset['A'] = df_dataset.groupby(['oracleId', 'oracleSoFar'])["tokenClass"]
#df["D"] = df["D"].apply(lambda x: "[" + ", ".join(str(x) for x in list(x)) + "]")
df["D"] = df["D"].apply(lambda x: list(x))

ValueError: Length of values (44589) does not match length of index (189187)

In [118]:
import pandas as pd

# Create column D with lists of different values from column C
unique = df_dataset.groupby(['oracleId', 'oracleSoFar'])['tokenClass'].unique().to_frame()
unique = unique.rename(columns={'tokenClass': 'eligibleTokenClasses'})
pd.merge(df_dataset, unique, on=['oracleId', 'oracleSoFar'])

Unnamed: 0,label,oracleId,oracleType,projectName,packageName,className,javadocTag,methodJavadoc,methodSourceCode,classJavadoc,classSourceCode,oracleSoFar,token,tokenClass,tokenInfo,eligibleTokenClasses
0,True,1100,PRE,commons-math3-3.6.1,org.apache.commons.math3.ode.nonstiff,AdaptiveStepsizeIntegrator,@param initialStepSize initial step size to us...,/** Set the initial step size.  * <p>This m...,public void setInitialStepSize(final double in...,/**  * This abstract class holds the common p...,/*  * Licensed to the Apache Software Foundati...,,initialStepSize,MethodArgument,"['', 'double']",[MethodArgument]
1,False,1100,PRE,commons-math3-3.6.1,org.apache.commons.math3.ode.nonstiff,AdaptiveStepsizeIntegrator,@param initialStepSize initial step size to us...,/** Set the initial step size.  * <p>This m...,public void setInitialStepSize(final double in...,/**  * This abstract class holds the common p...,/*  * Licensed to the Apache Software Foundati...,initialStepSize,>=,NonEqIneqOperator,[],[NonEqIneqOperator]
2,False,1100,PRE,commons-math3-3.6.1,org.apache.commons.math3.ode.nonstiff,AdaptiveStepsizeIntegrator,@param initialStepSize initial step size to us...,/** Set the initial step size.  * <p>This m...,public void setInitialStepSize(final double in...,/**  * This abstract class holds the common p...,/*  * Licensed to the Apache Software Foundati...,initialStepSize,<=,NonEqIneqOperator,[],[NonEqIneqOperator]
3,True,1100,PRE,commons-math3-3.6.1,org.apache.commons.math3.ode.nonstiff,AdaptiveStepsizeIntegrator,@param initialStepSize initial step size to us...,/** Set the initial step size.  * <p>This m...,public void setInitialStepSize(final double in...,/**  * This abstract class holds the common p...,/*  * Licensed to the Apache Software Foundati...,initialStepSize,>,NonEqIneqOperator,[],[NonEqIneqOperator]
4,False,1100,PRE,commons-math3-3.6.1,org.apache.commons.math3.ode.nonstiff,AdaptiveStepsizeIntegrator,@param initialStepSize initial step size to us...,/** Set the initial step size.  * <p>This m...,public void setInitialStepSize(final double in...,/**  * This abstract class holds the common p...,/*  * Licensed to the Apache Software Foundati...,initialStepSize,<,NonEqIneqOperator,[],[NonEqIneqOperator]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88257,True,18330,PRE,guava-19.0,com.google.common.collect,Table,@param columnKey key of column to search for,/**  * Returns {@code true} if the table co...,"boolean contains(@Nullable Object rowKey, @Nul...",/**  * A collection that associates an ordered...,/*  * Copyright (C) 2008 The Guava Authors  * ...,,;,Semicolon,[],[Semicolon]
88258,True,18331,PRE,guava-19.0,com.google.common.collect,Table,@param rowKey key of row to search for,/**  * Returns {@code true} if the table co...,boolean containsRow(@Nullable Object rowKey);,/**  * A collection that associates an ordered...,/*  * Copyright (C) 2008 The Guava Authors  * ...,,;,Semicolon,[],[Semicolon]
88259,True,18332,PRE,guava-19.0,com.google.common.collect,Table,@param columnKey key of column to search for,/**  * Returns {@code true} if the table co...,boolean containsColumn(@Nullable Object column...,/**  * A collection that associates an ordered...,/*  * Copyright (C) 2008 The Guava Authors  * ...,,;,Semicolon,[],[Semicolon]
88260,True,18333,PRE,guava-19.0,com.google.common.collect,Table,@param value value to search for,/**  * Returns {@code true} if the table co...,boolean containsValue(@Nullable Object value);,/**  * A collection that associates an ordered...,/*  * Copyright (C) 2008 The Guava Authors  * ...,,;,Semicolon,[],[Semicolon]


In [48]:
unique.head()

oracleId  oracleSoFar
0                        [MethodArgument, Class, TRUE, Semicolon, Openi...
          (                    [MethodArgument, Class, OpeningParenthesis]
          ((                   [MethodArgument, Class, OpeningParenthesis]
          ((start        [IneqOperator, EqOperator, Period, InstanceOfO...
          ((start==                          [MethodArgument, Class, NULL]
Name: tokenClass, dtype: object

In [55]:
unique

Unnamed: 0_level_0,Unnamed: 1_level_0,eligibleTokenClasses
oracleId,oracleSoFar,Unnamed: 2_level_1
0,,"[MethodArgument, Class, TRUE, Semicolon, Openi..."
0,(,"[MethodArgument, Class, OpeningParenthesis]"
0,((,"[MethodArgument, Class, OpeningParenthesis]"
0,((start,"[IneqOperator, EqOperator, Period, InstanceOfO..."
0,((start==,"[MethodArgument, Class, NULL]"
...,...,...
26710,,"[MethodArgument, Class, This, Semicolon, Openi..."
26711,,"[MethodArgument, Class, This, TRUE, Semicolon,..."
26712,,"[MethodArgument, Class, This, Semicolon, Openi..."
26713,,"[MethodArgument, Class, This, TRUE, Semicolon,..."


In [119]:
import pandas as pd

# Create column D with lists of different values from column C
unique = df_dataset.groupby(['oracleId', 'oracleSoFar'])['tokenClass'].unique().to_frame()
unique = unique.rename(columns={'tokenClass': 'eligibleTokenClasses'})
df_dataset = pd.merge(df_dataset, unique, on=['oracleId', 'oracleSoFar'])



In [122]:
df_dataset.head(-100)

Unnamed: 0,label,oracleId,oracleType,projectName,packageName,className,javadocTag,methodJavadoc,methodSourceCode,classJavadoc,classSourceCode,oracleSoFar,token,tokenClass,tokenInfo,eligibleTokenClasses
0,True,1100,PRE,commons-math3-3.6.1,org.apache.commons.math3.ode.nonstiff,AdaptiveStepsizeIntegrator,@param initialStepSize initial step size to us...,/** Set the initial step size.  * <p>This m...,public void setInitialStepSize(final double in...,/**  * This abstract class holds the common p...,/*  * Licensed to the Apache Software Foundati...,,initialStepSize,MethodArgument,"['', 'double']",[MethodArgument]
1,False,1100,PRE,commons-math3-3.6.1,org.apache.commons.math3.ode.nonstiff,AdaptiveStepsizeIntegrator,@param initialStepSize initial step size to us...,/** Set the initial step size.  * <p>This m...,public void setInitialStepSize(final double in...,/**  * This abstract class holds the common p...,/*  * Licensed to the Apache Software Foundati...,initialStepSize,>=,NonEqIneqOperator,[],[NonEqIneqOperator]
2,False,1100,PRE,commons-math3-3.6.1,org.apache.commons.math3.ode.nonstiff,AdaptiveStepsizeIntegrator,@param initialStepSize initial step size to us...,/** Set the initial step size.  * <p>This m...,public void setInitialStepSize(final double in...,/**  * This abstract class holds the common p...,/*  * Licensed to the Apache Software Foundati...,initialStepSize,<=,NonEqIneqOperator,[],[NonEqIneqOperator]
3,True,1100,PRE,commons-math3-3.6.1,org.apache.commons.math3.ode.nonstiff,AdaptiveStepsizeIntegrator,@param initialStepSize initial step size to us...,/** Set the initial step size.  * <p>This m...,public void setInitialStepSize(final double in...,/**  * This abstract class holds the common p...,/*  * Licensed to the Apache Software Foundati...,initialStepSize,>,NonEqIneqOperator,[],[NonEqIneqOperator]
4,False,1100,PRE,commons-math3-3.6.1,org.apache.commons.math3.ode.nonstiff,AdaptiveStepsizeIntegrator,@param initialStepSize initial step size to us...,/** Set the initial step size.  * <p>This m...,public void setInitialStepSize(final double in...,/**  * This abstract class holds the common p...,/*  * Licensed to the Apache Software Foundati...,initialStepSize,<,NonEqIneqOperator,[],[NonEqIneqOperator]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
88157,True,8795,NORMAL_POST,commons-math3-3.6.1,org.apache.commons.math3.random,RandomDataGenerator,"random value sampled from the Gamma(shape, sc...",/**  * <p>Generates a random value from t...,"public double nextGamma(double shape, double s...",/**  * Implements the {@link RandomData} inter...,/*  * Licensed to the Apache Software Foundati...,,;,Semicolon,[],[Semicolon]
88158,True,8796,PRE,commons-math3-3.6.1,org.apache.commons.math3.random,RandomDataGenerator,@param populationSize the population size of t...,/**  * Generates a random value from the ...,public int nextHypergeometric(int populationSi...,/**  * Implements the {@link RandomData} inter...,/*  * Licensed to the Apache Software Foundati...,,;,Semicolon,[],[Semicolon]
88159,True,8797,PRE,commons-math3-3.6.1,org.apache.commons.math3.random,RandomDataGenerator,@param numberOfSuccesses number of successes i...,/**  * Generates a random value from the ...,public int nextHypergeometric(int populationSi...,/**  * Implements the {@link RandomData} inter...,/*  * Licensed to the Apache Software Foundati...,,;,Semicolon,[],[Semicolon]
88160,True,8798,PRE,commons-math3-3.6.1,org.apache.commons.math3.random,RandomDataGenerator,@param sampleSize the sample size of the Hyper...,/**  * Generates a random value from the ...,public int nextHypergeometric(int populationSi...,/**  * Implements the {@link RandomData} inter...,/*  * Licensed to the Apache Software Foundati...,,;,Semicolon,[],[Semicolon]


In [123]:
cols = list(df_dataset.columns)

# move column D after column B
cols.insert(cols.index('oracleSoFar') + 1, cols.pop(cols.index('eligibleTokenClasses')))

df_dataset = df_dataset[cols]

In [124]:
cols

['label',
 'oracleId',
 'oracleType',
 'projectName',
 'packageName',
 'className',
 'javadocTag',
 'methodJavadoc',
 'methodSourceCode',
 'classJavadoc',
 'classSourceCode',
 'oracleSoFar',
 'eligibleTokenClasses',
 'token',
 'tokenClass',
 'tokenInfo']

In [125]:
df_dataset.head()

Unnamed: 0,label,oracleId,oracleType,projectName,packageName,className,javadocTag,methodJavadoc,methodSourceCode,classJavadoc,classSourceCode,oracleSoFar,eligibleTokenClasses,token,tokenClass,tokenInfo
0,True,1100,PRE,commons-math3-3.6.1,org.apache.commons.math3.ode.nonstiff,AdaptiveStepsizeIntegrator,@param initialStepSize initial step size to us...,/** Set the initial step size.  * <p>This m...,public void setInitialStepSize(final double in...,/**  * This abstract class holds the common p...,/*  * Licensed to the Apache Software Foundati...,,[MethodArgument],initialStepSize,MethodArgument,"['', 'double']"
1,False,1100,PRE,commons-math3-3.6.1,org.apache.commons.math3.ode.nonstiff,AdaptiveStepsizeIntegrator,@param initialStepSize initial step size to us...,/** Set the initial step size.  * <p>This m...,public void setInitialStepSize(final double in...,/**  * This abstract class holds the common p...,/*  * Licensed to the Apache Software Foundati...,initialStepSize,[NonEqIneqOperator],>=,NonEqIneqOperator,[]
2,False,1100,PRE,commons-math3-3.6.1,org.apache.commons.math3.ode.nonstiff,AdaptiveStepsizeIntegrator,@param initialStepSize initial step size to us...,/** Set the initial step size.  * <p>This m...,public void setInitialStepSize(final double in...,/**  * This abstract class holds the common p...,/*  * Licensed to the Apache Software Foundati...,initialStepSize,[NonEqIneqOperator],<=,NonEqIneqOperator,[]
3,True,1100,PRE,commons-math3-3.6.1,org.apache.commons.math3.ode.nonstiff,AdaptiveStepsizeIntegrator,@param initialStepSize initial step size to us...,/** Set the initial step size.  * <p>This m...,public void setInitialStepSize(final double in...,/**  * This abstract class holds the common p...,/*  * Licensed to the Apache Software Foundati...,initialStepSize,[NonEqIneqOperator],>,NonEqIneqOperator,[]
4,False,1100,PRE,commons-math3-3.6.1,org.apache.commons.math3.ode.nonstiff,AdaptiveStepsizeIntegrator,@param initialStepSize initial step size to us...,/** Set the initial step size.  * <p>This m...,public void setInitialStepSize(final double in...,/**  * This abstract class holds the common p...,/*  * Licensed to the Apache Software Foundati...,initialStepSize,[NonEqIneqOperator],<,NonEqIneqOperator,[]


In [126]:
"[" + ", ".join(df_dataset["eligibleTokenClasses"][0]) + "]"

'[MethodArgument]'

In [15]:
tokenClassDict = {k: i for i, k in enumerate(df_dataset["tokenClass"].unique())}


In [20]:
value_counts = df_dataset['tokenClass'].value_counts()

# Determine the threshold value based on 5% of the total number of rows
threshold = len(df_dataset) * 0.005

# Filter the DataFrame to keep only the rows meeting the threshold
filtered_df = df_dataset[df_dataset['tokenClass'].map(value_counts) >= threshold]

In [21]:
filtered_df["tokenClass"].unique()

<StringArray>
[    'MethodArgument',              'Class',               'This',
               'TRUE',          'Semicolon', 'OpeningParenthesis',
        'ArraysClass',       'IneqOperator',         'EqOperator',
             'Period', 'InstanceOfOperator',               'NULL',
    'LogicalOperator',  'NonEqIneqOperator',              'S_INT',
     'MethodResultID', 'ClosingParenthesis']
Length: 17, dtype: string

In [23]:
from transformers import RobertaTokenizer, T5ForConditionalGeneration

tokenizer = RobertaTokenizer.from_pretrained('Salesforce/codet5-base')
model = T5ForConditionalGeneration.from_pretrained('Salesforce/codet5-base')

text = "def greet(user): print(f'hello <extra_id_0>!')"
input_ids = tokenizer(text, return_tensors="pt").input_ids

# simply generate a single sequence
generated_ids = model.generate(input_ids, max_length=8)
print(tokenizer.decode(generated_ids[0], skip_special_tokens=True))
# this prints "{user.username}"


Downloading pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

ConnectionError: HTTPSConnectionPool(host='cdn-lfs.huggingface.co', port=443): Read timed out.

In [2]:
# list of partial dataframes
dfs = []
# datasets path
oracles_dataset = os.path.join("..","..","dataset","token-classes-dataset")
# collects partial dataframes from oracles
for file_name in os.listdir(oracles_dataset):
    if "plume-lib" in file_name:
        df = pd.read_json(os.path.join(oracles_dataset, file_name))
        dfs.append(df)
df_dataset = pd.concat(dfs)

In [3]:
df_dataset.shape

(6982, 17)

In [4]:
# Assuming you have a DataFrame called 'df' with columns 'oracleId' and 'oracleSoFar'

# Group the rows by 'oracleId' and 'oracleSoFar'
grouped = df_dataset.groupby(['oracleId', 'oracleSoFar'])

# Create an empty dictionary to store the separate datasets
datasets = []

# Iterate through the groups and assign them to separate datasets
for group_name, group_data in grouped:
    datasets.append(group_data)

# Accessing a specific dataset
dataset_1_2 = datasets[0]

In [6]:
datasets[1].head()

Unnamed: 0,id,label,oracleId,oracleType,projectName,packageName,className,javadocTag,methodJavadoc,methodSourceCode,classJavadoc,classSourceCode,oracleSoFar,tokenClassesSoFar,token,tokenClass,tokenInfo
4,12627,True,14664,PRE,plume-lib-1.1.0,plume,TimeLimitProcess,@param p p is !=null. !=null Process to limit ...,\t/**\n * Creates a TimeLimitProcess with th...,"public TimeLimitProcess(Process p, long timeLi...",/**\n * TimeLimitProcess is a subclass of Proc...,package plume;\n\nimport java.io.ByteArrayInpu...,p,[MethodArgument],,IneqOperator,
5,12628,False,14664,PRE,plume-lib-1.1.0,plume,TimeLimitProcess,@param p p is !=null. !=null Process to limit ...,\t/**\n * Creates a TimeLimitProcess with th...,"public TimeLimitProcess(Process p, long timeLi...",/**\n * TimeLimitProcess is a subclass of Proc...,package plume;\n\nimport java.io.ByteArrayInpu...,p,[MethodArgument],,EqOperator,
6,12629,False,14664,PRE,plume-lib-1.1.0,plume,TimeLimitProcess,@param p p is !=null. !=null Process to limit ...,\t/**\n * Creates a TimeLimitProcess with th...,"public TimeLimitProcess(Process p, long timeLi...",/**\n * TimeLimitProcess is a subclass of Proc...,package plume;\n\nimport java.io.ByteArrayInpu...,p,[MethodArgument],,Period,
7,12630,False,14664,PRE,plume-lib-1.1.0,plume,TimeLimitProcess,@param p p is !=null. !=null Process to limit ...,\t/**\n * Creates a TimeLimitProcess with th...,"public TimeLimitProcess(Process p, long timeLi...",/**\n * TimeLimitProcess is a subclass of Proc...,package plume;\n\nimport java.io.ByteArrayInpu...,p,[MethodArgument],,InstanceOfOperator,
