In [1]:
from google.cloud import bigquery
import pandas as pd
import ast
import glove_helper
import tensorflow as tf
import numpy as np

from itertools import groupby
from os.path import basename, splitext
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

  from ._conv import register_converters as _register_converters


Before running the script, you will need to CMD and authenticate with 

'gcloud auth application-default login'


In [2]:
client = bigquery.Client(project='manifest-frame-203601')



In [3]:
QUERY = (
    """
    select * from w266_final.final_20k
    LIMIT 1000""")
query_job = client.query(QUERY)  # API request
rows = query_job.result()  # Waits for query to finish

df = []
for row in rows:
    df.append([row.repo_path,row.c_content])

In [4]:
df = pd.DataFrame(df)
df.columns = ['repo_path','content']
df

Unnamed: 0,repo_path,content
0,watchdogpolska/feder feder/records/types.py,"from abc import abstractmethod, ABCMeta\n\n\nc..."
1,softappeal/yass py2/test/contract_test.py,import unittest\nfrom typing import Any\n\nimp...
2,gcarq/freqtrade freqtrade/tests/test_fiat_conv...,"# pragma pylint: disable=missing-docstring, to..."
3,devilry/devilry-django devilry/devilry_compres...,# -*- coding: utf-8 -*-\n# Generated by Django...
4,erigones/esdc-ce api/dc/storage/serializers.py,from api import serializers as s\nfrom vms.mod...
5,moddevices/mod-ui mod/settings.py,# -*- coding: utf-8 -*-\n\n# Copyright 2012-20...
6,ms-iot/python cpython/Tools/unicode/gencodec.py,""""""" Unicode Mapping Parser and Codec Generator..."
7,flammified/terrabot terrabot/packets/packet39.py,import struct\n\n\nclass Packet39Parser(object...
8,lukasmonk/lucaschess Code/QT/PantallaConfig.py,from PyQt4 import QtCore\n\nfrom Code import D...
9,MetaMetricsInc/django-static-version example/e...,"""""""\nDjango settings for example project.\n\nG..."


In [5]:
def cleanup(docstring_list):
    
    """takes a list of doc strings and converts to a single flat list of tokens"""
    
    tokens = [tf.keras.preprocessing.text.text_to_word_sequence(i) for i in docstring_list]
    flat_tokens = [item for sublist in tokens for item in sublist]
    flat_string = " ".join(flat_tokens)
    
    return flat_string

def get_docstrings(source):
    
    """function to walk through parse tree and return list of docstrings"""
    
    NODE_TYPES = {
    ast.ClassDef: 'Class',
    ast.FunctionDef: 'Function/Method',
    ast.Module: 'Module'
    }
    
    docstrings = []
    
    try:
        tree = ast.parse(source)
    except:
        return " "
       
    for node in ast.walk(tree):
        if isinstance(node, tuple(NODE_TYPES)):
            docstring = ast.get_docstring(node)
            docstrings.append(docstring)
    
    docstrings =  [x for x in docstrings if x is not None]
    clean_string = cleanup(docstrings)
            
    return clean_string

In [6]:
df['docstrings'] = [get_docstrings(x) for x in list(df['content'])]

In [7]:
hands = glove_helper.Hands(ndim=100)

Loading vectors from data/glove/glove.6B.zip
Parsing file: data/glove/glove.6B.zip:glove.6B.100d.txt
Found 400,000 words.
Parsing vectors... Done! (W.shape = (400003, 100))


In [12]:
#Set up corpus for count vectorizer
corpus = list(df['docstrings'])

#count values for tfidf calculations
count_vect = CountVectorizer()
count_vect = count_vect.fit(corpus)
freq_term_matrix = count_vect.transform(corpus)

#generate idf scores
tfidf = TfidfTransformer(norm="l2")
tfidf.fit(freq_term_matrix)

#to grab columns for words
vocab = count_vect.vocabulary_

def sent_to_embed(sentence):
    
    """converts a sentence to the average of it's tfidf weighted word embedding vectors"""
    
    if len(sentence)==0:
        return np.zeros(100)
    
    global vocab, count_vect, tfidf
    
    doc_freq_term = count_vect.transform([sentence])
    idfs = tfidf.transform(doc_freq_term)
    sent_list = sentence.split(" ")
    embeddings = []
    
    for i in range(len(sent_list)):
        
        if sent_list[i] in vocab:
            
            col = vocab[sent_list[i]]
            embed = hands.get_vector(sent_list[i], strict=False)
            tfidf = idfs[0, col]
            embeddings.append(np.multiply(embed, tfidf))
    
    embed_array = np.asarray(embeddings)
        
    return np.mean(embed_array, axis=0)


In [13]:
doc = corpus[4]
print(sent_to_embed(doc))
print(corpus[4])

for x in list(df['docstrings']):
    print(sent_to_embed(x).shape)

[ 0.0234355   0.22777049 -0.0171718  -0.0553103   0.0016924  -0.07032032
  0.0743405   0.05483138  0.03267015 -0.08054673  0.08635088  0.15677272
  0.29848596  0.00899725  0.08804607 -0.06751103  0.12732211  0.17513788
  0.23478861 -0.0958536  -0.08928827 -0.18141979  0.11200067  0.21903269
 -0.02076004 -0.12438046  0.0974158   0.00058226 -0.04466866  0.08916865
  0.08171917  0.14848487 -0.23854826 -0.07286189  0.30173206 -0.00879507
  0.09035531 -0.02529267  0.12300045 -0.24701552 -0.00617683 -0.07225616
 -0.10411046  0.19979866 -0.00975291  0.09237226 -0.059384   -0.10374748
 -0.0994691   0.1631269  -0.00237415  0.04395526 -0.06288457  0.07528418
  0.24503791  0.0820905   0.04279137 -0.04692107  0.06633011 -0.00681209
 -0.1287261   0.01459729 -0.16214044  0.22237132  0.02553525  0.09783664
 -0.09344202 -0.32609722  0.09482899  0.01330843  0.00610403  0.08294348
 -0.04318903  0.01845386 -0.13952835 -0.01660035  0.18413396  0.16456003
  0.05464598 -0.17527129  0.11942548 -0.14637512 -0

AttributeError: 'numpy.float64' object has no attribute 'transform'