# Experiments
TODO: 24-27 June 2019:
* Create pipeline
    * with initial unigrams baseline
    * accuracy measure (e.g. precision-recall with AUROC)
## Setup environment:
* import libraries
* load csv data

In [1]:
import random
import pandas as pd
from nltk.corpus import treebank
from sklearn.model_selection import train_test_split

description_df = pd.read_csv('./data/description.csv')
installation_df = pd.read_csv('./data/installation.csv')
invocation_df = pd.read_csv('./data/invocation.csv')
citation_df = pd.read_csv('./data/citation.csv')

## Data Preview
Make sure that csv data has been successfully imported.

In [2]:
description_df.head()

Unnamed: 0,URL,excerpt
0,https://github.com/GoogleChrome/puppeteer,Puppeteer is a Node library which provides a h...
1,https://github.com/JimmySuen/integral-human-pose,The major contributors of this repository incl...
2,https://github.com/JimmySuen/integral-human-pose,Integral Regression is initially described in ...
3,https://github.com/JimmySuen/integral-human-pose,We build a 3D pose estimation system based mai...
4,https://github.com/JimmySuen/integral-human-pose,The Integral Regression is also known as soft-...


In [3]:
installation_df.head()

Unnamed: 0,URL,excerpt
0,https://github.com/GoogleChrome/puppeteer,Installation
1,https://github.com/GoogleChrome/puppeteer,"To use Puppeteer in your project, run:"
2,https://github.com/GoogleChrome/puppeteer,npm i puppeteer
3,https://github.com/GoogleChrome/puppeteer,"# or ""yarn add puppeteer"""
4,https://github.com/GoogleChrome/puppeteer,puppeteer-core


In [4]:
invocation_df.head()

Unnamed: 0,URL,excerpt
0,https://github.com/JimmySuen/integral-human-pose,Usage
1,https://github.com/JimmySuen/integral-human-pose,We have placed some example config files in ex...
2,https://github.com/JimmySuen/integral-human-pose,Train
3,https://github.com/JimmySuen/integral-human-pose,"For Integral Human Pose Regression, cd to pyto..."
4,https://github.com/JimmySuen/integral-human-pose,Integral Regression


In [5]:
citation_df.head()

Unnamed: 0,URL,excerpt
0,https://github.com/JimmySuen/integral-human-pose,If you find Integral Regression useful in your...
1,https://github.com/JimmySuen/integral-human-pose,"@article{sun2017integral,"
2,https://github.com/JimmySuen/integral-human-pose,"title={Integral human pose regression},"
3,https://github.com/JimmySuen/integral-human-pose,"author={Sun, Xiao and Xiao, Bin and Liang, Shu..."
4,https://github.com/JimmySuen/integral-human-pose,"journal={arXiv preprint arXiv:1711.08229},"


Each data set currently contains positive samples of its respective trait. However, negative samples are necessary to distinguish the positive against some sort of control. Per category, negative samples include those from the other categories and also text samples completely unrelated to repository information. For example, in the description classifier, positive samples would be those that were labelled as a description, and negative samples would include those labelled as a installation, invocation, or citation in addition to nonpertinent text such as the Treebank corpus.

As there are many more negative samples than there are positive samples, randomly selected negative samples will be used. The aim is for about 60% positive and 40% negative. Of the 40% negative, 10% for each outside category and 10% for random, e.g. Treebank, text. 

*Question: Treebank sentences are already tokenized / split by word. Does nltk have sentences not already split or is it possible to utilize the already split state of the sentences for later tokenizer usage?*
## Description Classifier

In [6]:
neg_quant = int(len(description_df) / 6)
treebank_background = pd.DataFrame(list(map(lambda sent: ' '.join(sent), random.sample(list(treebank.sents()), neg_quant))), columns=["excerpt"]).assign(description=False)
description_corpus = pd.concat([description_df.assign(description=True), installation_df.sample(neg_quant).assign(description=False), invocation_df.sample(neg_quant).assign(description=False), citation_df.sample(neg_quant).assign(description=False),treebank_background], sort=False)
description_corpus.drop('URL', 1, inplace=True)
description_corpus.reset_index(drop=True, inplace=True)
description_corpus

Unnamed: 0,excerpt,description
0,Puppeteer is a Node library which provides a h...,True
1,The major contributors of this repository incl...,True
2,Integral Regression is initially described in ...,True
3,We build a 3D pose estimation system based mai...,True
4,The Integral Regression is also known as soft-...,True
5,This is an official implementation for Integra...,True
6,The original implementation is based on our in...,True
7,LibGEOS is a LGPL-licensed package for manipul...,True
8,"Among other things, it allows you to parse Wel...",True
9,This repository contains the experiments in th...,True


## Build basic pipeline
### Train-test split

In [7]:
X, y = description_corpus.excerpt, description_corpus.description
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [8]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

def display_accuracy_score(y_test, y_pred_class):
    score = accuracy_score(y_test, y_pred_class)
    print('accuracy score: %s' % '{:.2%}'.format(score))
    return score
def display_null_accuracy(y_test):
    value_counts = pd.value_counts(y_test)
    null_accuracy = max(value_counts) / float(len(y_test))
    print('null accuracy: %s' % '{:.2%}'.format(null_accuracy))
    return null_accuracy

def display_accuracy_difference(y_test, y_pred_class):
    null_accuracy = display_null_accuracy(y_test)
    accuracy_score = display_accuracy_score(y_test, y_pred_class)
    difference = accuracy_score - null_accuracy
    if difference > 0:
        print('model is %s more accurate than null accuracy' % '{:.2%}'.format(difference))
    elif difference < 0:
        print('model is %s less accurate than null accuracy' % '{:.2%}'.format(abs(difference)))
    elif difference == 0:
        print('model is exactly as accurate as null accuracy')
    return null_accuracy, accuracy_score

pipeline = make_pipeline(CountVectorizer(), LogisticRegression())
pipeline.fit(X_train, y_train)
print(pipeline)
y_pred_class = pipeline.predict(X_test)
print(confusion_matrix(y_test, y_pred_class))
print('-' * 75 + '\nClassification Report\n')
print(classification_report(y_test, y_pred_class))
display_accuracy_difference(y_test, y_pred_class)


Pipeline(memory=None,
         steps=[('countvectorizer',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('logisticregression',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,




(0.6428571428571429, 0.8174603174603174)

In [9]:
len(description_df)

304