In [3]:
import os
from datetime import datetime

import fasttext

# Absolute path of the current working directory (os.path.abspath('') resolves
# against os.getcwd()); data files and saved models are located relative to it.
DIR = os.path.abspath('')


def train(tf, vf, size='1M'):
    """Train an autotuned supervised fastText model and save it under ``DIR``.

    Args:
        tf: Path of the training file, passed to fastText as ``input``.
        vf: Path of the validation file, passed as ``autotuneValidationFile``
            so that hyper-parameters are tuned against it.
        size: Value passed as ``autotuneModelSize``; its lower-cased form is
            also embedded in the output file name.

    Returns:
        Path of the saved model, ``commits-<size>-<YYYYmmddHHMM>.bin`` in ``DIR``.
    """
    # Minute-resolution timestamp keeps successive runs in separate files.
    stamp = datetime.now().strftime('%Y%m%d%H%M')
    filename = f'commits-{size.lower()}-{stamp}.bin'
    model_path = os.path.join(DIR, filename)

    # Supervised training with hyper-parameter autotuning on the validation file.
    classifier = fasttext.train_supervised(
        input=tf,
        autotuneValidationFile=vf,
        autotuneModelSize=size,
    )
    classifier.save_model(model_path)
    return model_path

In [6]:
# Training and validation sets live under <DIR>/data.
tf = os.path.join(DIR, 'data', 'input.txt')
vf = os.path.join(DIR, 'data', 'valid.txt')

# Train with the default size ('1M'); `m` is the path of the saved model file.
m = train(tf, vf)
# Size of the saved file: bytes / 1024 / 1024, i.e. mebibytes despite the 'MB' label.
f'{os.stat(m).st_size / 1024 / 1024} MB'

'0.9604415893554688 MB'

In [7]:
# Reload the model from the file written by train() and evaluate it.
# NOTE(review): vf is the same file that was passed as autotuneValidationFile,
# so these numbers are likely optimistic; a separate held-out set would give a
# fairer estimate.
model = fasttext.load_model(m)
# test() returns (number of examples, precision@1, recall@1).
model.test(vf)



(1689, 0.8052101835405565, 0.8052101835405565)

In [12]:
def threshold_test(threshold=0.8, files=None, classifier=None):
    """Return the fraction of commit messages predicted below a confidence threshold.

    Every line of every input file is treated as one commit message: it is
    stripped, lower-cased and passed to ``classifier.predict``. The line counts
    as "below" when the probability of its top predicted label is lower than
    ``threshold``, or when no prediction is returned at all.

    Args:
        threshold: Minimum top-label probability for a prediction to count as
            confident.
        files: Iterable of paths to read, one commit message per line.
            Defaults to ``data/snoke`` and ``data/firstorder`` under ``DIR``.
        classifier: Object exposing ``predict(text) -> (labels, probabilities)``.
            Defaults to the notebook-level ``model``.

    Returns:
        ``below / total`` as a float, or ``0.0`` when the files contain no
        lines at all (instead of raising ``ZeroDivisionError``).
    """
    if files is None:
        files = (
            os.path.join(DIR, 'data', 'snoke'),
            os.path.join(DIR, 'data', 'firstorder'),
        )
    if classifier is None:
        # Fall back to the model loaded by an earlier notebook cell.
        classifier = model

    count = 0
    below = 0
    for path in files:
        # Read as UTF-8 explicitly so the result does not depend on the
        # platform's default encoding.
        with open(path, encoding='utf-8') as commits:
            for commit in commits:
                count += 1
                _labels, probs = classifier.predict(commit.strip().lower())
                # predict() returns parallel sequences of labels and
                # probabilities; compare the first (top) probability as a
                # scalar. A line with no prediction is not confident.
                if len(probs) == 0 or probs[0] < threshold:
                    below += 1

    # Guard against empty input files.
    return below / count if count else 0.0

In [14]:
# Confidence cut-off to evaluate.
threshold = 0.9
# threshold_test returns a fraction (below / count), not a percentage;
# it is scaled by 100 only when printed.
percentage = threshold_test(threshold)

print(f'{percentage * 100}% below {threshold} confidence')

35.6% below 0.9 confidence
