In [1]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import train_test_split
import glob

# Polyglot

Classifying code snippets by programming language.
#### Task 
Create a classifier that takes a snippet of code and guesses the programming language it is written in.

Creating a dictionary with keys as file extensions and values as the name of the programming language. 

In [2]:
# One (file extension, language name) pair per row; several extensions may map
# to the same language (for example '.py' and '.python3' are both 'Python').
_extension_language_pairs = [
    ('.clojure', 'Clojure'),
    ('.hs', 'Haskell'),
    ('.java', 'Java'),
    ('.javascript', 'JavaScript'),
    ('.ocaml', 'OCaml'),
    ('.php', 'PHP'),
    ('.py', 'Python'),
    ('.jruby', 'Ruby'),
    ('.scala', 'Scala'),
    ('.racket', 'Scheme'),
    ('.python3', 'Python'),
    ('.hack', 'PHP'),
    ('.yarv', 'Ruby'),
    ('.tcl', 'TCL'),
    ('.js', 'JavaScript'),
    ('.gcc', 'C'),
    ('.csharp', 'C#'),
    ('.sbcl', 'Common Lisp'),
]
keys = [extension for extension, _ in _extension_language_pairs]
values = [language for _, language in _extension_language_pairs]
dictionary = dict(_extension_language_pairs)
# Perl ('.pm', '.pl', '.t') is not included in the table.

Creating a function that reads all files in the folder and keeps only those whose file extension is listed above. This function returns the list of file contents and the list of the corresponding programming-language names.

In [3]:
def read_files():
    """Load the training corpus from ``extensions/*/*``.

    Every path matched by the glob is compared against each suffix in the
    module-level ``keys`` list. A file is read (decoded as Latin-1) once per
    matching suffix and labelled with ``dictionary[suffix]``; paths that match
    no suffix are ignored.

    Returns:
        A ``(texts, labels)`` tuple of parallel lists: the file contents and
        the corresponding programming-language names.
    """
    corpus = []
    language_names = []
    for path in glob.glob('extensions/*/*'):
        matching_suffixes = [suffix for suffix in keys if path.endswith(suffix)]
        for suffix in matching_suffixes:
            with open(path, encoding='latin_1') as handle:
                content = handle.read()
            corpus.append(content)
            language_names.append(dictionary[suffix])
    return corpus, language_names

In [4]:
# X holds the file contents and y the matching language labels returned by read_files().
X, y = read_files()

In [5]:
#X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.9, random_state = 0)

Training the model with data:

In [6]:
# Word-count features (max_df=0.8 drops terms that occur in more than 80% of
# the documents) feed a multinomial naive Bayes classifier.
# Other steps that are currently disabled: TfidfTransformer, a character-level
# TfidfVectorizer, RandomForestClassifier and SGDClassifier.
pipeline_steps = [
    ('vect', CountVectorizer(max_df=0.8)),
    ('clf', MultinomialNB()),
]
text_clf = Pipeline(pipeline_steps)
# The model is fitted on the whole corpus (X, y); the train/test split variant
# (X_train, y_train) is disabled.
text_clf.fit(X, y)

Pipeline(steps=[('vect', CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=0.8, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None, stop_words=None,
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)), ('clf', MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))])

In [7]:
# Accuracy on (X, y), the same data the pipeline was fitted on, so this is a
# training-set score rather than a held-out estimate.
#text_clf.score(X_train, y_train)
text_clf.score(X, y)

0.97922848664688422

In [8]:
#text_clf.score(X_test, y_test)

Creating a function that takes a snippet of code as a string and returns the programming language of the code.

In [9]:
def classify_code(string):
    """Guess the programming language of a code snippet.

    Uses the module-level ``text_clf`` pipeline, which is already fitted on
    ``(X, y)``, instead of building and re-training an identical pipeline on
    every call.

    Args:
        string: Either a single snippet as a ``str`` or a list of snippet
            strings (the one-element-list form is what the cells below pass).

    Returns:
        The predicted language name for the first (or only) snippet.
    """
    # The vectorizer expects an iterable of documents; a bare string is
    # wrapped so that it is treated as exactly one document.
    snippets = [string] if isinstance(string, str) else string
    return text_clf.predict(snippets)[0]

Creating a function that reads test files that contain snippets of code.

In [10]:
def get_test_files():
    """Read every file matched by ``test/*`` and return their contents.

    Files are decoded as Latin-1, the same encoding ``read_files`` uses for
    the training corpus, so test snippets are decoded consistently with the
    training data and a non-UTF-8 byte cannot raise ``UnicodeDecodeError``.

    Returns:
        A list with one string per file, in the order ``glob.glob`` yields
        the paths.
    """
    snippets = []
    for path in glob.glob('test/*'):
        with open(path, encoding='latin_1') as handle:
            snippets.append(handle.read())
    return snippets

Predicting the result:

In [11]:
# Read the test snippets once instead of re-reading every file on each
# iteration, and iterate over whatever was found rather than assuming that
# exactly 32 files exist.
code = get_test_files()
for num, snippet in enumerate(code, start=1):
    predicted = text_clf.predict([snippet])
    print(num, predicted)

1 ['PHP']
2 ['Clojure']
3 ['Clojure']
4 ['Clojure']
5 ['Python']
6 ['Python']
7 ['Python']
8 ['Python']
9 ['JavaScript']
10 ['JavaScript']
11 ['JavaScript']
12 ['JavaScript']
13 ['Ruby']
14 ['Python']
15 ['Ruby']
16 ['Haskell']
17 ['Haskell']
18 ['TCL']
19 ['Scheme']
20 ['Scheme']
21 ['Scheme']
22 ['Java']
23 ['TCL']
24 ['Scala']
25 ['Scala']
26 ['TCL']
27 ['TCL']
28 ['TCL']
29 ['JavaScript']
30 ['JavaScript']
31 ['OCaml']
32 ['OCaml']


Result: 25 out of 32 snippets of code were guessed correctly (78%). 

## Testing 

These snippets of code were taken from https://highlightjs.org/

In [12]:
# Clojure sample, wrapped in a one-element list before being passed to classify_code.
my_snippet_1 = ["""(def ^:dynamic chunk-size 17)
(defn next-chunk [rdr]
  (let [buf (char-array chunk-size)
        s (.read rdr buf)]
  (when (pos? s)
    (java.nio.CharBuffer/wrap buf 0 s))))
(defn chunk-seq [rdr]
  (when-let [chunk (next-chunk rdr)]
    (cons chunk (lazy-seq (chunk-seq rdr)))))"""]
classify_code(my_snippet_1)

'Clojure'

In [13]:
# Python sample. The literal is a raw string so that the backslash in
# 'Gre\'ater' reaches the classifier exactly as written in the sample instead
# of being consumed as a Python escape sequence.
my_snippet_2 = [r"""@requires_authorization
def somefunc(param1='', param2=0):
    r'''A docstring'''
    if param1 > param2: # interesting
        print 'Gre\'ater'
    return (param2 - param1 + 1 + 0b10l) or None
class SomeClass:
    pass
>>> message = '''interpreter
... prompt'''"""]
classify_code(my_snippet_2)

'Python'

In [14]:
# JavaScript sample. The literal is a raw string: in a normal string Python
# would turn the regex's \b into backspace characters (and \- is an invalid
# escape sequence), so the classifier would not see the sample as written.
my_snippet_3 = [r"""function $initHighlight(block, cls) {
  try {
    if (cls.search(/\bno\-highlight\b/) != -1)
      return process(block, true, 0x0F) +
             ` class="${cls}"`;
  } catch (e) {
    /* handle exception */
  }
  for (var i = 0 / 2; i < classes.length; i++) {
    if (checkCondition(classes[i]) === undefined)
      console.log('undefined');
  }
}
export  $initHighlight;"""]
classify_code(my_snippet_3)

'JavaScript'

In [15]:
# Ruby sample, wrapped in a one-element list before being passed to classify_code.
my_snippet_4 = ["""# The Greeter class
class Greeter
  def initialize(name)
    @name = name.capitalize
  end
  def salute
    puts "Hello #{@name}!"
  end
end
g = Greeter.new("world")
g.salute"""]
classify_code(my_snippet_4)

'Ruby'

In [16]:
# Haskell sample, wrapped in a one-element list before being passed to classify_code.
my_snippet_5 = ["""{-# LANGUAGE TypeSynonymInstances #-}
module Network.UDP
( DataPacket(..)
, openBoundUDPPort
, openListeningUDPPort
, pingUDPPort
, sendUDPPacketTo
, recvUDPPacket
, recvUDPPacketFrom
) where
import qualified Data.ByteString as Strict (ByteString, concat, singleton)
import qualified Data.ByteString.Lazy as Lazy (ByteString, toChunks, fromChunks)
import Data.ByteString.Char8 (pack, unpack)
import Network.Socket hiding (sendTo, recv, recvFrom)
import Network.Socket.ByteString (sendTo, recv, recvFrom)
-- Type class for converting StringLike types to and from strict ByteStrings
class DataPacket a where
  toStrictBS :: a -> Strict.ByteString
 fromStrictBS :: Strict.ByteString -> a
"""]
classify_code(my_snippet_5)

'Haskell'

In [17]:
# Scheme sample ('Scheme' is the label given to '.racket' files in the extension table).
my_snippet_6 = [""";; Calculation of Hofstadter's male and female sequences as a list of pairs
(define (hofstadter-male-female n)
(letrec ((female (lambda (n)
           (if (= n 0)
           1
           (- n (male (female (- n 1)))))))
     (male (lambda (n)
         (if (= n 0)
             0
             (- n (female (male (- n 1))))))))
  (let loop ((i 0))
    (if (> i n)
    '()
    (cons (cons (female i)
            (male i))
      (loop (+ i 1)))))))
(hofstadter-male-female 8)
(define (find-first func lst)
(call-with-current-continuation
 (lambda (return-immediately)
   (for-each (lambda (x)
       (if (func x)
           (return-immediately x)))
         lst)
   #f)))
"""]
classify_code(my_snippet_6)

'Common Lisp'

In [18]:
# Java sample, wrapped in a one-element list before being passed to classify_code.
my_snippet_7 = ["""/**
 * @author John Smith <john.smith@example.com>
*/
package l2f.gameserver.model;
public abstract class L2Char extends L2Object {
  public static final Short ERROR = 0x0001;
  public void moveTo(int x, int y, int z) {
    _ai = null;
    log("Should not be called");
    if (1 > 5) { // wtf!?
      return;
    }
  }
}
"""]
classify_code(my_snippet_7)

'Java'

In [19]:
# Scala sample, wrapped in a one-element list before being passed to classify_code.
my_snippet_8 = ["""/**
 * A person has a name and an age.
 */
case class Person(name: String, age: Int)
abstract class Vertical extends CaseJeu
case class Haut(a: Int) extends Vertical
case class Bas(name: String, b: Double) extends Vertical
sealed trait Ior[+A, +B]
case class Left[A](a: A) extends Ior[A, Nothing]
case class Right[B](b: B) extends Ior[Nothing, B]
case class Both[A, B](a: A, b: B) extends Ior[A, B]
"""]
classify_code(my_snippet_8)

'Scala'

In [20]:
# TCL sample. The literal is a raw string so that the sample's own escapes
# (\t inside the puts string and the \1 back-reference in the regexp) are kept
# as backslash sequences instead of becoming a tab and a \x01 control character.
my_snippet_9 = [r"""package json
source helper.tcl
# randomness verified by a die throw
set ::rand 4
proc give::recursive::count {base p} { ; # 2 mandatory params
    while {$p > 0} {
        set result [expr $result * $base]; incr p -1
    }
    return $result
}
set a 'a'; set b "bcdef"; set lst [list "item"]
puts [llength $a$b]

set ::my::tid($id) $::my::tid(def)
lappend lst $arr($idx) $::my::arr($idx) $ar(key)
lreplace ::my::tid($id) 4 4
puts $::rand ${::rand} ${::AWESOME::component::variable}
puts "$x + $y is\t [expr $x + $y]"
proc isprime x {
    expr {$x>1 && ![regexp {^(oo+?)\1+$} [string repeat o $x]]}
}
"""]
classify_code(my_snippet_9)

'TCL'

In [21]:
# PHP sample. The literal is a raw string because the namespace separator in
# Location\Web would otherwise be parsed as the invalid escape sequence \W,
# which newer Python versions warn about; the string value itself is unchanged.
my_snippet_10 = [r"""require_once 'Zend/Uri/Http.php';
namespace Location\Web;
interface Factory
{
    static function _factory();
}
abstract class URI extends BaseURI implements Factory
{
    abstract function test();
    public static $st1 = 1;
  
"""]
classify_code(my_snippet_10)

'PHP'

In [22]:
# OCaml sample, wrapped in a one-element list before being passed to classify_code.
my_snippet_11 = ["""(* This is a
multiline, (* nested *) comment *)
type point = { x: float; y: float };;
let some_string = "this is a string";;
let rec length lst =
    match lst with
      [] -> 0
    | head :: tail -> 1 + length tail
  ;;
exception Test;;
type expression =
"""]
classify_code(my_snippet_11)

'Common Lisp'

In [23]:
# C-family sample (it uses <iostream> and cout; 'C' is the only C-family label
# in the extension table besides 'C#'). The literal is a raw string so that
# '\n' and "\\\\" stay as written in the sample instead of being turned into a
# real newline and two backslashes by Python.
my_snippet_12 = [r"""#include <iostream>

int main(int argc, char *argv[]) {

  /* An annoying "Hello World" example */
  for (auto i = 0; i < 0xFFFF; i++)
    cout << "Hello, World!" << endl;

  char c = '\n';
  unordered_map <string, vector<string> > m;
  m["key"] = "\\\\"; // this is an error

  return -2e3 + 12l;
}
"""]
classify_code(my_snippet_12)

'C'

In [24]:
# C# sample, wrapped in a one-element list before being passed to classify_code.
my_snippet_13 = ["""using System;

#pragma warning disable 414, 3021

/// <summary>Main task</summary>
async Task<int> AccessTheWebAsync()
{
    Console.WriteLine("Hello, World!");
    string urlContents = await getStringTask;
    return urlContents.Length;
}
"""]
classify_code(my_snippet_13)

'C#'

In [25]:
# Common Lisp sample, wrapped in a one-element list before being passed to classify_code.
my_snippet_14 = ["""(etypecase ${1:key-form}
  (${2:match} ${3:result}))
"""]
classify_code(my_snippet_14)

'Common Lisp'

In [26]:
# Perl sample. The Perl extensions are commented out of the extension table,
# so the training labels contain no 'Perl' class for the classifier to predict.
my_snippet_15 = ["""# loads object
sub load
{
  my $flds = $c->db_load($id,@_) || do {
    Carp::carp "Can`t load (class: $c, id: $id): '$!'"; return undef
  };
  my $o = $c->_perl_new();
  $id12 = $id / 24 / 3600;
  $o->{'ID'} = $id12 + 123;
  #$o->{'SHCUT'} = $flds->{'SHCUT'};
  my $p = $o->props;
  my $vt;
  $string =~ m/^sought_text$/;
  $items = split //, 'abc';
  $string //= "bar";
  for my $key (keys %$p)
  {


"""]
classify_code(my_snippet_15)

'Clojure'