In [1]:
import json

from noise_detection import *
from rule_cleaner import RuleCleaner

# Feature

## 1. Get first sentence of comment

In [2]:
# Sample Javadoc block whose summary sentence wraps across two physical lines.
# One string fragment per Javadoc line keeps the layout readable.
raw_comment = (
    "\t/**\n"
    "\t * Returns the high-value (as a double primitive) \n"
    "\t * for an item within a series.\n"
    "\t * \n"
    "\t * @param series\n"
    "\t * @param item \n"
    "\t * @return The high-value.\n"
    "\t */\n"
    " "
)
print(raw_comment)

	/**
	 * Returns the high-value (as a double primitive) 
	 * for an item within a series.
	 * 
	 * @param series
	 * @param item 
	 * @return The high-value.
	 */
 


In [3]:
# Extract the first sentence of the Javadoc held in raw_comment; the cell's
# displayed value is the result of this call.
getFirstSentence(raw_comment)
# For comparison, the benchmark comment in Funcom is only
# 'Returns the high-value (as a double primitive)'.

'Returns the high-value (as a double primitive) for an item within a series.'

## 2. Noise Detection

In [4]:
## Content Tampering
# Auto-generated (EMF-style) Javadoc whose first sentence still carries
# {@link ...} and <em> markup. One fragment group per Javadoc line.
raw_comment = (
    "\t/**\n"
    "\t * Creates a new adapter for an object of class '{@link "
    "org.jsenna.eclipse.schema.dictionary.AbstractDataGroup "
    "<em>Abstract Data Group</em>}'.\n"
    "\t * <!-- begin-user-doc -->\n"
    "\t * This default implementation returns null so that we can easily ignore cases;\n"
    "\t * it's useful to ignore a case when inheritance will catch all the cases anyway.\n"
    "\t * <!-- end-user-doc -->\n"
    "\t * @return the new adapter.\n"
    "\t * @see org.jsenna.eclipse.schema.dictionary.AbstractDataGroup\n"
    "\t * @generated\n"
    "\t */\n"
    " "
)
comment = getFirstSentence(raw_comment)

# Show the raw block, its first sentence, and the detector's verdict on that sentence.
print(raw_comment)
print(comment)
print(if_ContentTamper(comment))

	/**
	 * Creates a new adapter for an object of class '{@link org.jsenna.eclipse.schema.dictionary.AbstractDataGroup <em>Abstract Data Group</em>}'.
	 * <!-- begin-user-doc -->
	 * This default implementation returns null so that we can easily ignore cases;
	 * it's useful to ignore a case when inheritance will catch all the cases anyway.
	 * <!-- end-user-doc -->
	 * @return the new adapter.
	 * @see org.jsenna.eclipse.schema.dictionary.AbstractDataGroup
	 * @generated
	 */
 
Creates a new adapter for an object of class '{@link org.jsenna.eclipse.schema.dictionary.AbstractDataGroup <em>Abstract Data Group</em>}'.
True


In [5]:
## Non-Literal
# Javadoc whose summary sentence is written in Japanese (kept as \u escapes).
raw_comment = (
    "/**\n"
    "     * relayTbList\u3068\u306e\u5916\u90e8\u7d50\u5408\u3092\u30c6\u30b9\u30c8\u3057\u307e\u3059\u3002\n"
    "     * \n"
    "     * @throws Exception\n"
    "     */\n"
    " "
)
comment = getFirstSentence(raw_comment)

# Show the raw block, its first sentence, and the detector's verdict on that sentence.
print(raw_comment)
print(comment)
print(if_NonLiteral(comment))

/**
     * relayTbListとの外部結合をテストします。
     * 
     * @throws Exception
     */
 
relayTbListとの外部結合をテストします。
True


In [6]:
## Under-Development
# Javadoc that only contains a placeholder note instead of a description.
raw_comment = (
    "\t/**\n"
    "\t * NOT IMPLEMENTED YET\n"
    "\t */\n"
)
comment = getFirstSentence(raw_comment)

# Show the raw block, its first sentence, and the detector's verdict on that sentence.
print(raw_comment)
print(comment)
print(if_UnderDevelop(comment))

	/**
	 * NOT IMPLEMENTED YET
	 */

NOT IMPLEMENTED YET
True


In [7]:
## Empty Function
# Java method whose body is just `{ }`.
raw_code = (
    "\tpublic void\n"
    "\tpreinitPage() { }\n"
)
print(raw_code)
print(if_EmptyFunc(raw_code))

	public void
	preinitPage() { }

True


In [8]:
## CommentedOut
# Java method in which every line is prefixed with `//`.
raw_code = (
    "    //    public String transformTypeID(URI typeuri) {\n"
    "    //\treturn typeuri.toString();\n"
    "    //    }\n"
)
print(raw_code)
print(if_CommentedOut(raw_code))

    //    public String transformTypeID(URI typeuri) {
    //	return typeuri.toString();
    //    }

True


In [9]:
## BlockComment
# Live Java method that contains an inline `//` comment inside its body.
raw_code = (
    "\tpublic int compareTo(Inparalog inpara) {\n"
    "\t\t// sort by 2 digits after .\n"
    "\t\treturn (int) (inpara.confidence * 100 - confidence * 100);\n"
    "\t}\n"
)
print(raw_code)
# Run both detectors on the same snippet; the first printed verdict is
# if_CommentedOut, the second is if_BlockComment.
print(if_CommentedOut(raw_code))
print(if_BlockComment(raw_code))

	public int compareTo(Inparalog inpara) {
		// sort by 2 digits after .
		return (int) (inpara.confidence * 100 - confidence * 100);
	}

False
True


## 3. Clean Dataset

In [10]:
# Prepare code-comment data that may contain noise.
# ./test.data is read as JSON Lines: one JSON object per line, each providing
# the keys 'raw_code' and 'raw_comment'.
# The encoding is given explicitly so decoding does not depend on the platform's
# default locale (assumes the file is UTF-8, the standard JSON encoding --
# TODO confirm against how test.data was produced).
with open('./test.data', 'r', encoding='utf-8') as f:
    data_lines = f.readlines()

raw_code_list, raw_comment_list = [], []
for line in data_lines:
    line = line.strip()
    if not line:
        # Tolerate blank lines (e.g. a trailing newline at end of file) instead
        # of failing inside json.loads.
        continue
    json_line = json.loads(line)
    raw_code_list.append(json_line['raw_code'])
    raw_comment_list.append(json_line['raw_comment'])

# Both lists must stay index-aligned: entry i of each describes the same sample.
assert len(raw_code_list) == len(raw_comment_list)
len(raw_code_list)

2000

In [11]:
# Get the cleaned code-comment data.
# RuleCleaner is built from the two parallel lists loaded from test.data.
cleaner = RuleCleaner(raw_code_list, raw_comment_list)
# get_clean_data() returns two values, unpacked here as the cleaned code and
# the cleaned comments (presumably index-aligned lists -- verify in rule_cleaner).
cleaned_code, cleaned_comment = cleaner.get_clean_data()
# Displayed value: how many cleaned code entries were returned.
len(cleaned_code)

2000it [00:00, 11149.80it/s]


1124

In [12]:
# Get the noisy code-comment data that have been removed or updated.
# The result is indexed by noise-category name in the following cells
# (e.g. 'Interrogation', 'CommentOut'); each entry is iterated as
# (raw_code, raw_comment) pairs.
noisy_data = cleaner.get_noisy_data()

In [13]:
# Print the original comment of every pair flagged as 'Interrogation'.
# Each comment is followed by a space and two newlines, which leaves a
# visual gap between consecutive samples.
for raw_code, raw_comment in noisy_data['Interrogation']:
    print(raw_comment, end=' \n\n')

  /**
   * Is this pool currently having something in cache?
   */
 

  /**
   * Is any life-cycle phase defined at all?
   */
 

  /**
   * Is the val1 attribute set?
   */
 

  /**
   * Is the val2 attribute set?
   */
 

  /**
   * Is this registration eagerly instantiated?
   */
 

  /**
   * Is the a property explicitly specified?
   * @param key the property key.
   */
 

  /**
   * When no property names is specified, no property values are set,
   * do we by default create a bean component?
   */
 

  /**
   * Is wildcard used?
   */
 

  /**
   * Is this a collection nut?
   * <p>
   * A collection nut is one that has a <code>set</code> with
   * an array parameter.
   * </p>
   */
 

  /**
   * Is this tag defined globally?
   */
 

    /**
     * How useful is ontology annotation for integration across [phylogenetic] studies?
     * @throws IOException 
     */
 



In [14]:
# Print the original code of every pair flagged as 'CommentOut'.
# Each snippet is followed by a space and two newlines, which leaves a
# visual gap between consecutive samples.
for raw_code, raw_comment in noisy_data['CommentOut']:
    print(raw_code, end=' \n\n')

  /*protected void checkSingleChild(Object[] vals){
    if(vals.length>1)
      throw raise("only one sub-element is allowed");
  }*/
 

//            	public double get(int index) {
//            		return buffer[(startIndex + index) % buffer.length];
//            	}
 

