In [26]:
import json

from tqdm import tqdm

import re

import random

from sklearn.decomposition import LatentDirichletAllocation

from sklearn.feature_extraction.text import TfidfVectorizer

from time import time

import pickle

graphs = []
with open('./graphs/cell_with_func_python23_1_27.txt','r') as f:
    for l in tqdm(f):
        graphs.append(json.loads(l))

def clean_code_snippet(code):
    
    return re.sub('[^a-zA-Z\n]+', ' ', code)


def split_func_name(func):
    """
    split function names
    eg. sklearn.metrics.pairwise.cosine_similarity -> [sklearn, metrics, pairwise, cosine, similarity]
    """
    new_str = ''
    for i, l in enumerate(func):
        if i > 0 and l.isupper() and func[i - 1].islower():
            new_str += '.'
        elif i > 0 and i < len(func) - 1 and l.isupper() and func[i - 1].isupper() and func[i + 1].islower():
            new_str += '.'
        elif i > 0 and l.isdigit() and func[i - 1].isalpha():
            new_str += '.'
        elif i < len(func) - 1 and l.isalpha() and func[i - 1].isdigit():
            new_str += '.'
        else:
            pass
        new_str += l
    return re.split('\.|_|\s', new_str.lower())


corpus = [clean_code_snippet(g["context"]) for g in graphs]

clean_data = []
for c in tqdm(corpus):
    token = split_func_name(c)
    token = [t for t in token if t]
    clean_data.append(' '.join(token))
    

documents = clean_data

vectorizer = TfidfVectorizer(max_features=10000)

X = vectorizer.fit_transform(documents)

2516572it [02:55, 14298.82it/s]
100%|██████████| 2516572/2516572 [03:01<00:00, 13869.84it/s]


In [27]:
lda = LatentDirichletAllocation(n_components=100, random_state=0, max_iter=1)

start_time = time()

lda.fit(X)

print(time()-start_time)

pickle.dump(lda, open('./lda.sav', 'wb'))

848.4674320220947


In [40]:
loaded_model = pickle.load(open('./lda.sav', 'rb'))

In [7]:
loaded_model

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=1,
                          mean_change_tol=0.001, n_components=50, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [41]:
results = loaded_model.transform(X)

In [11]:
import numpy

In [140]:
type(results)

numpy.ndarray

In [42]:
results.dump('./lda_results_1_30_2.npy')

In [43]:
numpy.load('./lda_results_1_30_2.npy', allow_pickle=True).shape


(2516572, 100)

In [126]:
documents

['import numpy as np import pandas as pd from statsmodels sandbox regression import gmm dta pd read csv consumption csv dta iloc',
 'def moment consumption params exog beta gamma params r forw c forw c exog t err beta r forw np power c forw c gamma return err',
 'endog np zeros exog shape mod gmm nonlinear ivgmm endog exog instrument moment consumption k moms w inv np dot instrument t instrument len endog res mod fit maxiter inv weights w inv',
 'print res summary yname euler eq xname discount crra',
 'res hac s mod fit maxiter inv weights w inv weights method hac wargs maxlag print print res hac s summary yname euler eq xname discount crra',
 'def moment consumption params exog beta gamma params r forw c forw c exog t predicted beta r forw np power c forw c gamma return predicted',
 'endog np ones exog shape mod gmm nonlinear ivgmm endog exog instrument moment consumption k moms w inv np dot instrument t instrument len endog res hac s mod fit maxiter inv weights w inv weights method h

## Load lda predict

In [12]:
lda_results = numpy.load('./lda_results_1_30_2.npy', allow_pickle=True)

In [13]:
import torch

import torch.nn as nn

In [15]:
targets = [cell_type(g["funcs"], g["nodes"], g["header"]) for g in graphs]

In [16]:
targets

[0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 4,
 2,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 4,
 0,
 1,
 2,
 0,
 0,
 0,
 5,
 0,
 0,
 2,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 3,
 0,
 0,
 2,
 0,
 0,
 0,
 4,
 0,
 0,
 0,
 0,
 0,
 1,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 3,
 0,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 3,
 0,
 0,
 0,
 0,
 2,
 2,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 5,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 3,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 3,
 0,
 0,
 0,
 0,
 5,
 0,
 0,
 0,
 0,
 0,
 3,
 0,
 2,
 0,
 0,
 5,
 0,
 0,
 0,
 5,
 4,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 1,
 0,
 3,
 0,
 2,
 0,
 5,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 5,
 0,
 0,
 4,
 3,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 4,
 0,
 0,
 0,
 0,
 2,
 0,
 5,
 0,
 0,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 0,
 0,
 0,
 0,
 2,
 0,
 5,
 0,
 0,
 5,
 5,


In [156]:
cell_type(graphs[2]["funcs"], graphs[2]["nodes"])

0

In [14]:

STAGE_PAD = 0
WRANGLE = 1
EXPLORE = 2
MODEL = 3
EVALUATE = 4
IMPORT = 5

SPV_MODE = [STAGE_PAD, WRANGLE, EXPLORE, MODEL, EVALUATE, IMPORT]

wrangle_funcs = ['pandas.read_csv', 'pandas.read_csv.dropna', 'pandas.read_csv.fillna',
                 'pandas.DataFrame.fillna', 'sklearn.datasets.load_iris', 'scipy.misc.imread',
                 'scipy.io.loadmat','sklearn.preprocessing.LabelEncoder', 'scipy.interpolate.interp1d']

explore_funcs = ['seaborn.distplot', 'matplotlib.pyplot.show', 'matplotlib.pyplot.plot', 'matplotlib.pyplot.figure',
                 'seaborn.pairplot', 'seaborn.heatmap', 'seaborn.lmplot','pandas.read_csv.describe',
                 'pandas.DataFrame.describe']
# 'matplotlib.pyplot.xlabel', 'matplotlib.pyplot.ylabel'
model_funcs = ['sklearn.cluster.KMeans',
               'sklearn.decomposition.PCA',
               'sklearn.naive_bayes.GaussianNB',
               'sklearn.ensemble.RandomForestClassifier',
               'sklearn.linear_model.LinearRegression',
               'sklearn.linear_model.LogisticRegression',
               'sklearn.tree.DecisionTreeRegressor',
               'sklearn.ensemble.BaggingRegressor',
               'sklearn.neighbors.KNeighborsClassifier',
               'sklearn.naive_bayes.MultinomialNB',
               'sklearn.svm.SVC',
               'sklearn.tree.DecisionTreeClassifier',
               'tensorflow.Session',
               'sklearn.linear_model.Ridge',
               'sklearn.linear_model.Lasso']

evaluate_funcs = ['sklearn.metrics.confusion_matrix', 'sklearn.cross_validation.cross_val_score',
                  'sklearn.metrics.mean_squared_error', 'sklearn.model_selection.cross_val_score', 'scipy.stats.ttest_ind', 'sklearn.metrics.accuracy_score']




def cell_type(funcs, nodes=None, header=None):
    # pdb.set_trace()
    # print(header)
    grams = [t.lower() for t in header.split() if t]
    bi_grams = ['{} {}'.format(t, grams[i + 1])
                for i, t in enumerate(grams[:-1])]

    if sum([1 for n in nodes if (n["type"] == 'Import' or n["type"] == 'ImportFrom')]) / len(nodes) > 0.3:
        return IMPORT

    if any([g in bi_grams for g in ['logistic regression', 'machine learning', 'random forest']]) and len(grams) <= 3:
        return MODEL
    if 'cross validation' in bi_grams and len(grams) <= 3:
        return EVALUATE

    if any([f in funcs for f in model_funcs]):
        return MODEL
    if any([f in funcs for f in evaluate_funcs]):
        return EVALUATE
    if any([f in funcs for f in explore_funcs]):
        return EXPLORE
    if len(nodes) == 3 and nodes[1]["type"] == "Expr":
        return EXPLORE

    if any([f in funcs for f in wrangle_funcs]):
        return WRANGLE
        # print(h)
    return STAGE_PAD



In [17]:
class Classifier(nn.Module):
    def __init__(self):
        super().__init__()
        self.linear = nn.Linear(50, 6)
    
    def forward(self, inputs):
        return self.linear(inputs)

In [18]:
classifier = Classifier()

In [173]:
lda_results = torch.Tensor(lda_results.astype(float))

In [169]:
lda_results

tensor([[0.1351, 0.0043, 0.0043,  ..., 0.0043, 0.0043, 0.0043],
        [0.0050, 0.0050, 0.0050,  ..., 0.0050, 0.0050, 0.0050],
        [0.0046, 0.0046, 0.0046,  ..., 0.0046, 0.0046, 0.0046],
        ...,
        [0.0069, 0.0069, 0.0069,  ..., 0.0069, 0.0069, 0.0069],
        [0.0074, 0.0074, 0.0074,  ..., 0.0074, 0.0074, 0.0074],
        [0.0083, 0.0083, 0.0083,  ..., 0.0083, 0.0083, 0.0083]],
       dtype=torch.float64)

In [174]:
classifier(lda_results)

tensor([[ 0.1117,  0.0832, -0.0803, -0.0395, -0.1218, -0.0333],
        [ 0.1098,  0.0388, -0.0758,  0.0370, -0.0503, -0.1018],
        [ 0.1259,  0.0391, -0.0464,  0.0418, -0.0769, -0.1443],
        ...,
        [ 0.1383,  0.1006, -0.0658, -0.0789, -0.1210, -0.1581],
        [ 0.1233,  0.0295, -0.1152,  0.0500, -0.0075, -0.0391],
        [ 0.1318,  0.0112, -0.1470,  0.0350, -0.1115, -0.1156]],
       grad_fn=<AddmmBackward>)

In [175]:
loaded_model

LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
                          evaluate_every=-1, learning_decay=0.7,
                          learning_method='batch', learning_offset=10.0,
                          max_doc_update_iter=100, max_iter=10,
                          mean_change_tol=0.001, n_components=50, n_jobs=None,
                          perp_tol=0.1, random_state=0, topic_word_prior=None,
                          total_samples=1000000.0, verbose=0)

In [31]:
test_graphs = []
with open('./graphs/test_cells_1_27.txt','r') as f:
    for l in f:
        test_graphs.append(json.loads(l))

In [32]:
len(test_graphs)

1745

In [33]:

test_corpus = [clean_code_snippet(g["context"]) for g in test_graphs]


In [179]:
test_corpus

['\nimport pandas as pd\n\nfrom pandas import Series DataFrame\n\nimport matplotlib pyplot as plt\n\nimport seaborn as sns\n\nfrom sklearn import datasets\n\nfrom sklearn import metrics\n\nfrom sklearn naive bayes import GaussianNB\n',
 '\niris datasets load iris \n\nX iris data\n\nY iris target\n\nprint iris DESCR \n',
 '\nmodel GaussianNB \n',
 '\nfrom sklearn cross validation import train test split\n\n X train X test Y train Y test train test split X Y \n',
 '\nmodel fit X train Y train \n',
 '\npredicted model predict X test \n\nexpected Y test\n',
 '\nprint metrics accuracy score expected predicted \n',
 '\nplayers pd read csv atp data atp players csv header None names playerId first name last name hand birthdate nationality \n\nplayers head \n',
 '\ncolumns tourney id tourney name surface draw size tourney level tourney date match num winner name loser name score best of round minutes winner hand winner ht winner age w ace w df w svpt w stIn w stWon w ndWon w SvGms w bpSaved w b

In [34]:

test_clean_data = []
for c in tqdm(test_corpus):
    token = split_func_name(c)
    token = [t for t in token if t]
    test_clean_data.append(' '.join(token))
    

100%|██████████| 1745/1745 [00:00<00:00, 6149.35it/s]


In [44]:

test_documents = test_clean_data
# vectorizer.fit(documents)
test_X = vectorizer.transform(test_documents)

In [45]:
test_X

<1745x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 24737 stored elements in Compressed Sparse Row format>

In [46]:
test_lda_results = loaded_model.transform(test_X)

In [47]:
test_lda_results.dump('./test_lda_results_1_30_2.npy')

In [None]:
test_graphs[]