# Import all the things! sklearn and glob namely.

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.datasets import load_files
import glob

## Load in the data from the 'data' folder. load_files uses the name of each subfolder as the target label for every file inside that subfolder.

In [2]:
# Undecodable bytes are replaced rather than raising, so a single badly
# encoded source file cannot abort the load.
dataset = load_files(
    'data',
    encoding='UTF-8',
    decode_error='replace',
)

## Create a vectorizer to transform the text in the files into token counts and train it for prediction.

In [3]:
# Count single tokens and pairs of consecutive tokens (1- and 2-grams).
vec = CountVectorizer(
    ngram_range=(1, 2),
)
# Learn the vocabulary from the training files and build their count matrix.
X_train_counts = vec.fit_transform(dataset.data)

## The tfidf transformer turns the raw token counts into tf-idf weights instead of using straight counts. Each token's count is down-weighted when the token appears in many files (inverse document frequency), and each file's row is normalized so that long files do not dominate simply because of their length. A token here is a word or a pair of consecutive words, because the vectorizer was set to use up to 2 words as a token.

In [4]:
# Learn the idf weights from the training counts first, then apply them to
# the same counts to get the training feature matrix.
tf_transformer = TfidfTransformer().fit(X_train_counts)
X_train_tfidf = tf_transformer.transform(X_train_counts)

## The SGDClassifier plays the same role as the Naive Bayes classifier: it is the model that predicts the language of new data fed into it. Scikit-learn's documentation describes it as "a linear support vector machine (SVM), which is widely regarded as one of the best text classification algorithms (although it’s also a bit slower than naïve Bayes)." It did indeed surpass the Naive Bayes by 5-10% accuracy.

In [5]:
# random_state is fixed so repeated runs train the same model.
sgd = SGDClassifier(alpha=1e-3, random_state=42)
sgd.fit(X_train_tfidf, dataset.target)

# This is accuracy on the training data itself, not on held-out files.
train_accuracy = sgd.score(X_train_tfidf, dataset.target)
print("Train classifier Score: {}".format(train_accuracy))

Train classifier Score: 0.9866496401207336


## Preparing a few functions to read in test data for the classifier.

In [6]:
def get_file(code):
    """Read a source file and return its text wrapped in a one-element list.

    The text is returned as a list because the caller passes it straight to
    the vectorizer's ``transform``, which takes a collection of documents.

    Args:
        code: Path of the file to read.

    Returns:
        A list containing a single string: the full contents of the file.
    """
    # Decode the same way the training data was loaded (UTF-8, replacing
    # undecodable bytes), and use a context manager so the file handle is
    # always closed.
    with open(code, encoding='UTF-8', errors='replace') as f:
        return [f.read()]

def get_lang(code_file):
    """Predict the language of the code stored in ``code_file``.

    The file text goes through the already-fitted count vectorizer and tf-idf
    transformer, and the classifier's predicted label index is mapped back to
    its name in ``dataset.target_names``.

    Args:
        code_file: Path of the file to classify.

    Returns:
        The predicted target name.
    """
    counts = vec.transform(get_file(code_file))
    features = tf_transformer.transform(counts)

    # Only one document was passed in, so take the first (only) prediction.
    label_index = sgd.predict(features)[0]
    return dataset.target_names[label_index]

def run_tests(tests):
    """Classify every file in ``tests``.

    Args:
        tests: Iterable of file paths.

    Returns:
        A list of predicted target names, in the same order as ``tests``.
    """
    return [get_lang(path) for path in tests]

## Tests is a list of the test files in the test folder. Results is the output of running those tests through the classifier. Test_expected holds the actual code type of each test to check the results of prediction.

In [7]:
# glob.glob returns paths in arbitrary, OS-dependent order. The predictions
# are later matched to test_expected by position, so sort the paths to get a
# deterministic order. test_expected lists the test ids in lexicographic
# (string) order, which is the order sorted() produces.
# NOTE(review): assumes each file in 'test' is named after its test id -- confirm.
tests = sorted(glob.glob('test/*'))
results = run_tests(tests)
test_expected = [('1', 'clojure'), ('10', 'javascript'), ('11', 'javascript'), ('12', 'javascript'), ('13', 'ruby'),
                 ('14', 'ruby'), ('15', 'ruby'), ('16', 'haskell'), ('17', 'haskell'), ('18', 'haskell'),
                 ('19', 'scheme'), ('2', 'clojure'), ('20', 'scheme'), ('21', 'scheme'), ('22', 'java'),
                 ('23', 'java'), ('24', 'scala'), ('25', 'scala'), ('26', 'tcl'), ('27', 'tcl'), ('28', 'php'),
                 ('29', 'php'), ('3', 'clojure'), ('30', 'php'), ('31', 'ocaml'), ('32', 'ocaml'), ('4', 'clojure'),
                 ('5', 'python'), ('6', 'python'), ('7', 'python'), ('8', 'python'), ('9', 'javascript')]


# Puts the results and target together and compares them. Outputs print statements for each test* and calculates the percent it got right at the bottom as well as noting which languages it failed to recognize correctly.
### *tcl files were not included in training, and so have been omitted in the testing.

In [9]:

# Pair each (test id, expected language) entry with the prediction at the
# same position.
both = list(zip(test_expected, results))
count = 0       # number of correct predictions
scored = 0      # number of tests that count towards the percentage
failed_on = []  # expected language of every wrong prediction

for (test_id, expected), predicted in both:
    # tcl files were not included in training, so those tests are neither
    # reported nor scored.
    if expected == 'tcl':
        continue

    scored += 1
    if expected == predicted:
        correct = 'CORRECT'
        count += 1
    else:
        correct = 'WRONG'
        failed_on.append(expected)

    print('Test {}: Expected: {}; Predicted: {}; {}'.format(test_id,
                                                            expected,
                                                            predicted,
                                                            correct))

# Derive the denominator from the tests actually scored instead of a
# hard-coded 30, so the percentage stays right when tests are added or
# removed; guard against an empty test set.
percent_correct = count / scored * 100 if scored else 0.0
print('\nPercent correct: %{}'.format(percent_correct))
print('Failed on: ', failed_on)

Test 1: Expected: clojure; Predicted: clojure; CORRECT
Test 10: Expected: javascript; Predicted: javascript; CORRECT
Test 11: Expected: javascript; Predicted: javascript; CORRECT
Test 12: Expected: javascript; Predicted: javascript; CORRECT
Test 13: Expected: ruby; Predicted: ruby; CORRECT
Test 14: Expected: ruby; Predicted: ruby; CORRECT
Test 15: Expected: ruby; Predicted: ruby; CORRECT
Test 16: Expected: haskell; Predicted: haskell; CORRECT
Test 17: Expected: haskell; Predicted: haskell; CORRECT
Test 18: Expected: haskell; Predicted: haskell; CORRECT
Test 19: Expected: scheme; Predicted: scheme; CORRECT
Test 2: Expected: clojure; Predicted: clojure; CORRECT
Test 20: Expected: scheme; Predicted: scheme; CORRECT
Test 21: Expected: scheme; Predicted: scheme; CORRECT
Test 22: Expected: java; Predicted: java; CORRECT
Test 23: Expected: java; Predicted: java; CORRECT
Test 24: Expected: scala; Predicted: scala; CORRECT
Test 25: Expected: scala; Predicted: scala; CORRECT
Test 28: Expected: p