In [1]:
import pandas as pd
import re

In [2]:
# Location of the exported pure-method report; kept in one named place so it is easy to repoint.
PURE_METHODS_CSV = "/home/user/dev/pure-analysis/pdfbox-pure-methods.csv"

# Raw report exactly as read from disk; later cells derive new frames from it.
df = pd.read_csv(PURE_METHODS_CSV)
df.shape

(1369, 5)

In [3]:
def extract_from_tags(tag, tags):
    """Return whether ``tag`` is set to ``true`` inside the ``tags`` text.

    ``tags`` is searched for the first ``<tag>=<word>`` entry, where the word
    is made of letters, digits or underscores.

    Parameters
    ----------
    tag : str
        Exact tag name to look up (for example ``"local_variables"``).
    tags : str
        Text holding the ``name=value`` entries.

    Returns
    -------
    bool
        ``True`` only when the entry is present and its value is exactly
        ``"true"`` (case-sensitive). ``False`` for any other value, and also
        when the tag does not occur at all.
    """
    # re.escape keeps the tag literal, and the look-behind stops a short tag
    # from matching the tail of a longer one (e.g. "static" in "non_static").
    pattern = r"(?<!\w)" + re.escape(tag) + r"=(\w+)"
    match = re.search(pattern, tags)
    if match is None:
        # Tag absent (e.g. the tags cell was empty): report it as not set
        # instead of failing with an IndexError.
        return False
    return match.group(1) == "true"

In [4]:
# `cols` is left exactly as it was because later cells reference it.
# Note that it does not list "visibility", which is appended as the last column below.
cols = ["parent-FQN", "method-signature", "return-type", 
        "local-variables", "conditionals", "multiple-statements", "loops", "parameters", "returns",
        "switches", "ifs", "static", "returns-primitives"]

# The first three columns are copied straight from the raw frame; the rest are
# boolean flags parsed out of the `tags` text.
metadata_cols = cols[:3]
feature_cols = cols[3:]

# str() per element mirrors `str(row['tags'])`, so a missing value becomes the text "nan".
tags_text = df["tags"].map(str)

# Build the frame column by column instead of enlarging it one cell at a time
# inside iterrows(); the index of `df` is preserved.
final_df = df[metadata_cols].copy()
for feature in feature_cols:
    # Output columns use hyphens while the tag names inside `tags` use underscores.
    tag_key = feature.replace("-", "_")
    final_df[feature] = tags_text.map(lambda tags, key=tag_key: extract_from_tags(key, tags))
final_df["visibility"] = df["visibility"]

final_df.head()

Unnamed: 0,parent-FQN,method-signature,return-type,local-variables,conditionals,multiple-statements,loops,parameters,returns,switches,ifs,static,returns-primitives,visibility
0,org.apache.fontbox.cff.CIDKeyedType2CharString,getCID(),int,False,False,False,False,False,True,False,False,False,True,public
1,org.apache.xmpbox.type.ArrayProperty,getArrayType(),Cardinality,False,False,False,False,False,True,False,False,False,False,public
2,org.apache.fontbox.ttf.HeaderTable,getFontRevision(),float,False,False,False,False,False,True,False,False,False,True,public
3,org.apache.fontbox.ttf.VerticalMetricsTable,getAdvanceHeight(int),int,False,False,False,False,True,True,False,True,False,True,public
4,org.apache.pdfbox.text.TextPosition,getHeightDir(),float,False,False,False,False,False,True,False,False,False,True,public


### Methods that return objects

In [5]:
# Number of methods whose return type is not a primitive.
final_df["returns-primitives"].eq(False).sum()

754

### Methods that are public

In [6]:
# Number of methods declared public.
final_df["visibility"].eq("public").sum()

1169

### Methods that accept parameters

In [7]:
# Number of methods flagged as taking parameters.
final_df["parameters"].eq(True).sum()

151

### Methods with if statements

In [8]:
# Number of methods flagged as containing if statements.
final_df["ifs"].eq(True).sum()

37

### Methods with conditional (`?:`) expressions

In [9]:
# Number of methods flagged as containing conditionals.
final_df["conditionals"].eq(True).sum()

21

### Methods with local variables

In [10]:
# Number of methods flagged as declaring local variables.
final_df["local-variables"].eq(True).sum()

40

### Methods with multiple statements

In [11]:
# Number of methods flagged as having multiple statements.
final_df["multiple-statements"].eq(True).sum()

52

### Interesting methods from this set

In [12]:
# A method is a candidate when it is non-static and has at least one of these flags set.
candidate_traits = ["multiple-statements", "ifs", "conditionals", "parameters",
                    "switches", "loops", "local-variables"]

# Row-wise "any flag is True" replaces the hand-written chain of ORs.
has_any_trait = final_df[candidate_traits].eq(True).any(axis=1)
is_not_static = final_df["static"].eq(False)

# Boolean-mask selection keeps the original index labels of final_df.
instrumentation_candidates_df = final_df[has_any_trait & is_not_static]
print("Methods that meet these criteria: ", instrumentation_candidates_df.shape)
print("Of which public: ", (instrumentation_candidates_df["visibility"] == "public").sum())
instrumentation_candidates_df.head(20)

Methods that meet these criteria:  (137, 14)
Of which public:  52


Unnamed: 0,parent-FQN,method-signature,return-type,local-variables,conditionals,multiple-statements,loops,parameters,returns,switches,ifs,static,returns-primitives,visibility
3,org.apache.fontbox.ttf.VerticalMetricsTable,getAdvanceHeight(int),int,False,False,False,False,True,True,False,True,False,True,public
10,org.apache.fontbox.util.BoundingBox,"contains(float,float)",boolean,False,False,False,False,True,True,False,False,False,True,public
28,org.apache.pdfbox.pdmodel.encryption.AccessPer...,isPermissionBitOn(int),boolean,False,False,False,False,True,True,False,False,False,True,private
32,org.apache.pdfbox.util.SmallMap,size(),int,False,True,False,False,False,True,False,False,False,True,public
42,org.apache.pdfbox.tools.PDFText2HTML$FontState,openTag(java.lang.String),String,False,False,False,False,True,True,False,False,False,False,private
64,org.apache.pdfbox.debugger.flagbitspane.FieldFlag,"isFlagBitSet(int,int)",Boolean,True,False,True,False,True,True,False,False,False,False,private
96,org.apache.fontbox.ttf.PostScriptTable,getName(int),String,False,False,True,False,True,True,False,True,False,False,public
181,org.apache.fontbox.cmap.CMapParser,isWhitespaceOrEOF(int),boolean,False,False,False,False,True,True,False,False,False,True,private
182,org.apache.pdfbox.debugger.flagbitspane.Panose...,getFamilyKindValue(int),String,False,False,False,False,True,True,False,False,False,False,private
186,org.apache.fontbox.cmap.CMapParser,isDelimiter(int),boolean,False,False,False,False,True,True,True,False,False,True,private


In [13]:
# Public subset of the candidates: the same criteria as instrumentation_candidates_df
# plus visibility == "public". Narrowing that frame avoids repeating the whole filter
# and keeps the two definitions in sync.
is_public = instrumentation_candidates_df["visibility"].eq("public")
instrumentation_candidates_public_df = instrumentation_candidates_df[is_public]
print(instrumentation_candidates_public_df.shape)
instrumentation_candidates_public_df.head(60)

(52, 14)


Unnamed: 0,parent-FQN,method-signature,return-type,local-variables,conditionals,multiple-statements,loops,parameters,returns,switches,ifs,static,returns-primitives,visibility
3,org.apache.fontbox.ttf.VerticalMetricsTable,getAdvanceHeight(int),int,False,False,False,False,True,True,False,True,False,True,public
10,org.apache.fontbox.util.BoundingBox,"contains(float,float)",boolean,False,False,False,False,True,True,False,False,False,True,public
32,org.apache.pdfbox.util.SmallMap,size(),int,False,True,False,False,False,True,False,False,False,True,public
96,org.apache.fontbox.ttf.PostScriptTable,getName(int),String,False,False,True,False,True,True,False,True,False,False,public
189,org.apache.fontbox.ttf.GlyfSimpleDescript,getEndPtOfContours(int),int,False,False,False,False,True,True,False,False,False,True,public
197,org.apache.fontbox.ttf.GlyfSimpleDescript,getFlags(int),byte,False,False,False,False,True,True,False,False,False,True,public
230,org.apache.fontbox.ttf.GlyfSimpleDescript,getYCoordinate(int),short,False,False,False,False,True,True,False,False,False,True,public
279,org.apache.pdfbox.pdmodel.font.PDCIDFontType0,getCFFFont(),CFFFont,False,False,False,False,False,True,False,True,False,False,public
456,org.apache.fontbox.cmap.CIDRange,unmap(int),int,False,False,True,False,True,True,False,True,False,True,public
539,org.apache.pdfbox.util.Matrix,"getValue(int,int)",float,False,False,False,False,True,True,False,False,False,True,public
