# Code Similarity

## w/Python AST
* [pycode-similar · PyPI](https://pypi.org/project/pycode-similar/)

In [4]:
import pycode_similar

In [5]:
# Two small source snippets that differ only in the function name (y vs x)
# and in the returned literal (False vs True).
s1 = """
def y():
    pass
if x:
    return False
"""

s2 = """
def x():
    pass
if x:
    return True
"""

# NOTE(review): presumably the first list element is treated as the reference
# and the remaining ones as candidates — confirm against the pycode-similar docs.
result = pycode_similar.detect([s1, s2])

In [6]:
result[0][1][0].plagiarism_percent

1.0

In [7]:
import ast

s1 = """
def x():
    pass
if x:
    return float(1)
"""

t = ast.parse(s1)

In [10]:
ast.dump(t)

"Module(body=[FunctionDef(name='x', args=arguments(posonlyargs=[], args=[], kwonlyargs=[], kw_defaults=[], defaults=[]), body=[Pass()], decorator_list=[]), If(test=Name(id='x', ctx=Load()), body=[Return(value=Call(func=Name(id='float', ctx=Load()), args=[Constant(value=1)], keywords=[]))], orelse=[])], type_ignores=[])"

In [11]:
ast.unparse(t)

'def x():\n    pass\nif x:\n    return float(1)'

In [26]:
import ast

s1 = """
l=[]
for i in range(10):
    l.append(i)
l
"""

t0 = ast.parse(s1)

In [27]:
print(ast.dump(t0, indent=4))

Module(
    body=[
        Assign(
            targets=[
                Name(id='l', ctx=Store())],
            value=List(elts=[], ctx=Load())),
        For(
            target=Name(id='i', ctx=Store()),
            iter=Call(
                func=Name(id='range', ctx=Load()),
                args=[
                    Constant(value=10)],
                keywords=[]),
            body=[
                Expr(
                    value=Call(
                        func=Attribute(
                            value=Name(id='l', ctx=Load()),
                            attr='append',
                            ctx=Load()),
                        args=[
                            Name(id='i', ctx=Load())],
                        keywords=[]))],
            orelse=[]),
        Expr(
            value=Name(id='l', ctx=Load()))],
    type_ignores=[])


In [28]:
print(ast.unparse(t0))

l = []
for i in range(10):
    l.append(i)
l


In [29]:
import ast

s1 = """
[i for i in range(10)]
"""

t1 = ast.parse(s1)

In [30]:
print(ast.dump(t1, indent=4))

Module(
    body=[
        Expr(
            value=ListComp(
                elt=Name(id='i', ctx=Load()),
                generators=[
                    comprehension(
                        target=Name(id='i', ctx=Store()),
                        iter=Call(
                            func=Name(id='range', ctx=Load()),
                            args=[
                                Constant(value=10)],
                            keywords=[]),
                        ifs=[],
                        is_async=0)]))],
    type_ignores=[])


In [31]:
print(ast.unparse(t1))

[i for i in range(10)]


In [32]:
import difflib

In [46]:
ast.dump(t0,indent=0).split('\n')

['Module(',
 'body=[',
 'Assign(',
 'targets=[',
 "Name(id='l', ctx=Store())],",
 'value=List(elts=[], ctx=Load())),',
 'For(',
 "target=Name(id='i', ctx=Store()),",
 'iter=Call(',
 "func=Name(id='range', ctx=Load()),",
 'args=[',
 'Constant(value=10)],',
 'keywords=[]),',
 'body=[',
 'Expr(',
 'value=Call(',
 'func=Attribute(',
 "value=Name(id='l', ctx=Load()),",
 "attr='append',",
 'ctx=Load()),',
 'args=[',
 "Name(id='i', ctx=Load())],",
 'keywords=[]))],',
 'orelse=[]),',
 'Expr(',
 "value=Name(id='l', ctx=Load()))],",
 'type_ignores=[])']

In [47]:
# Diff the two AST dumps line by line; indent=0 puts one node/field per line
# without leading whitespace, so nesting depth alone never shows up as a change.
t0_dump_lines = ast.dump(t0, indent=0).split('\n')
t1_dump_lines = ast.dump(t1, indent=0).split('\n')
ast_diff = difflib.unified_diff(t0_dump_lines, t1_dump_lines, lineterm='')
print('\n'.join(ast_diff))

--- 
+++ 
@@ -1,27 +1,16 @@
 Module(
 body=[
-Assign(
-targets=[
-Name(id='l', ctx=Store())],
-value=List(elts=[], ctx=Load())),
-For(
+Expr(
+value=ListComp(
+elt=Name(id='i', ctx=Load()),
+generators=[
+comprehension(
 target=Name(id='i', ctx=Store()),
 iter=Call(
 func=Name(id='range', ctx=Load()),
 args=[
 Constant(value=10)],
 keywords=[]),
-body=[
-Expr(
-value=Call(
-func=Attribute(
-value=Name(id='l', ctx=Load()),
-attr='append',
-ctx=Load()),
-args=[
-Name(id='i', ctx=Load())],
-keywords=[]))],
-orelse=[]),
-Expr(
-value=Name(id='l', ctx=Load()))],
+ifs=[],
+is_async=0)]))],
 type_ignores=[])


## w/o Python AST

In [582]:
import difflib

In [583]:
# NOTE(review): this cell depends on kernel state. In document order s1 was last
# rebound in In [29] (the list-comprehension snippet) and s2 in In [5], so a
# Restart & Run All may not reproduce the recorded ratio — confirm which two
# strings were meant to be compared and define them in this section.
difflib.SequenceMatcher(None,s1,s2).ratio()

0.9876543209876543

In [584]:
a="""1 ** 2"""
b="""1 * * 2"""

In [585]:
# Character-level similarity of the two operator spellings. The class is reached
# through the module because only `import difflib` is in scope; the bare name
# raises NameError on a fresh kernel.
difflib.SequenceMatcher(None,a,b).ratio()

0.9230769230769231

### w/Tokenizer

* [The Token Types — Brown Water Python documentation](https://www.asmeurer.com/brown-water-python/tokens.html)
* [How to tokenize python code using the Tokenize module? - Stack Overflow](https://stackoverflow.com/questions/62166362/how-to-tokenize-python-code-using-the-tokenize-module)
* [Ultimate Guide To Text Similarity With Python - NewsCatcher](https://newscatcherapi.com/blog/ultimate-guide-to-text-similarity-with-python)
* [Python学習チャンネル by PyQ](https://blog.pyq.jp/)

In [586]:
from tokenize import tokenize, untokenize, generate_tokens, NUMBER, STRING, NAME, OP

In [587]:
import tokenize
import io

inp = """if x.y(x) :
       return (3.14 * * 3) + "x" """

def _tokenize(inp):
    t = []
    for token in tokenize.generate_tokens(io.StringIO(inp).readline):
        #print(tokenize.tok_name[token.type], tokenize.tok_name[token.exact_type], repr(token.string))
        if tokenize.tok_name[token.type] == 'DEDENT':
            break
        elif tokenize.tok_name[token.type] == 'ENDMARKER':
            pass
        elif (tokenize.tok_name[token.type] == 'INDENT' or
              tokenize.tok_name[token.type] == 'NEWLINE' or
              tokenize.tok_name[token.type] == 'NUMBER' or
              tokenize.tok_name[token.type] == 'STRING'):
            t.append(tokenize.tok_name[token.type])
        else:
            #print(tokenize.tok_name[token.exact_type])
            t.append(token.string)
    return(t)

print(_tokenize(inp))

['if', 'x', '.', 'y', '(', 'x', ')', ':', 'NEWLINE', 'INDENT', 'return', '(', 'NUMBER', '*', '*', 'NUMBER', ')', '+', 'STRING', 'NEWLINE']


### w/Unified Diff

In [588]:
t0=_tokenize('3 * 4')
t1=_tokenize('3 **             4')

In [612]:
t0=_tokenize('[len(s) for s in huck_finn_chapters]')
t1=_tokenize('''for s in huck_finn_chapters:
    print(len(s))''')

In [613]:
[t0,t1]

[['[',
  'len',
  '(',
  's',
  ')',
  'for',
  's',
  'in',
  'huck_finn_chapters',
  ']',
  'NEWLINE'],
 ['for',
  's',
  'in',
  'huck_finn_chapters',
  ':',
  'NEWLINE',
  'INDENT',
  'print',
  '(',
  'len',
  '(',
  's',
  ')',
  ')',
  'NEWLINE']]

In [614]:
print('\n'.join(difflib.unified_diff(t0,t1,lineterm='')))

--- 
+++ 
@@ -1,11 +1,15 @@
-[
+for
+s
+in
+huck_finn_chapters
+:
+NEWLINE
+INDENT
+print
+(
 len
 (
 s
 )
-for
-s
-in
-huck_finn_chapters
-]
+)
 NEWLINE


In [615]:
import re
# Applied only to lines after the two-line '---' / '+++' file header, so a
# single leading sign is enough. Excluding a doubled sign would miss real
# changes whose token is itself '+' or '-' (diff lines '++' and '--').
re_add = re.compile(r"^\+")
re_del = re.compile(r"^-")

num_add, num_del = 0, 0
for line_no, line in enumerate(difflib.unified_diff(t0,t1,lineterm='')):
    if line_no < 2:
        # The first two lines are the file header, not changes.
        continue
    if re_add.match(line):
        num_add+=1
    elif re_del.match(line):
        num_del+=1
    # Hunk headers ('@@ ...') and context lines (leading space) are not counted.
print(len(t0), num_add, num_del, len(t0)+num_add-num_del, len(t1)-num_add)
# len(t0), +, -, len(t1), number of unmodified lines

11 10 6 15 5


### w/TF-IDF (ngram)

In [604]:
from sklearn.feature_extraction.text import TfidfVectorizer
def identity_tokenizer(text):
    """Return ``text`` unchanged.

    Passed as the vectorizer's ``tokenizer`` so that documents which are
    already lists of tokens are used as-is.
    """
    return text

In [628]:
# Documents are pre-tokenized lists, so the custom tokenizer passes them through.
# token_pattern=None stops scikit-learn warning that its default pattern is
# unused when a tokenizer is supplied. ngram_range=(4,4) keeps only 4-grams.
tfidf = TfidfVectorizer(tokenizer=identity_tokenizer, token_pattern=None, lowercase=False, ngram_range=(4,4))

In [629]:
x=tfidf.fit_transform([t0,t1])

In [630]:
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2.
# get_feature_names_out() returns an ndarray, so .tolist() keeps the plain-list display.
tfidf.get_feature_names_out().tolist()

['( len ( s',
 '( s ) )',
 '( s ) for',
 ') for s in',
 ': NEWLINE INDENT print',
 'INDENT print ( len',
 'NEWLINE INDENT print (',
 '[ len ( s',
 'for s in huck_finn_chapters',
 'huck_finn_chapters : NEWLINE INDENT',
 'in huck_finn_chapters : NEWLINE',
 'in huck_finn_chapters ] NEWLINE',
 'len ( s )',
 'print ( len (',
 's ) ) NEWLINE',
 's ) for s',
 's in huck_finn_chapters :',
 's in huck_finn_chapters ]']

In [631]:
tfidf.idf_

array([1.40546511, 1.40546511, 1.40546511, 1.40546511, 1.40546511,
       1.40546511, 1.40546511, 1.40546511, 1.        , 1.40546511,
       1.40546511, 1.40546511, 1.        , 1.40546511, 1.40546511,
       1.40546511, 1.40546511, 1.40546511])

In [632]:
# Print the n-grams whose idf weight is at most 1.0.
# get_feature_names_out() replaces get_feature_names(), which was removed in
# scikit-learn 1.2.
for t, idf in zip(tfidf.get_feature_names_out(), tfidf.idf_):
    if idf <= 1.:
        print(t)

for s in huck_finn_chapters
len ( s )


In [627]:
t0=_tokenize('[len(s) for s in huck_finn_chapters]')
t1=_tokenize('''for s in huck_finn_chapters:
    print(len(s))''')

## Distance
* [Jaccard index - Wikipedia](https://en.wikipedia.org/wiki/Jaccard_index)

In [565]:
from datasketch import MinHash

In [572]:
t0=_tokenize('3 * 4')
t1=_tokenize('3 * * 4')
[t0,t1]

[['NUMBER', '*', 'NUMBER', 'NEWLINE'],
 ['NUMBER', '*', '*', 'NUMBER', 'NEWLINE']]

In [573]:
m1,m2=MinHash(num_perm=512), MinHash(num_perm=512)

In [574]:
# Feed each token sequence into its own MinHash sketch, one UTF-8 encoded
# token at a time.
for sketch, token_seq in ((m1, t0), (m2, t1)):
    for tok in token_seq:
        sketch.update(tok.encode('utf8'))

In [575]:
m1.jaccard(m2), m2.jaccard(m1)

(1.0, 1.0)

In [576]:
s1 = set(t0)
s2 = set(t1)

In [577]:
# Exact Jaccard index of the two token sets: |intersection| / |union|.
len(s1 & s2) / len(s1 | s2)

1.0