In [107]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_validate

import pickle
import gzip

In [92]:
with open("labels.pkl", "rb") as f:
    labels = pickle.load(f)
labels

['C#', 'Haskell', 'Java', 'Python', 'Scala', 'TypeScript']

In [93]:
df = pd.read_pickle("dataset.pkl")
df["target"].value_counts() / len (df)

2    0.179456
5    0.176331
4    0.174139
0    0.169762
1    0.153057
3    0.147255
Name: target, dtype: float64

In [94]:
df.head()

Unnamed: 0,text,target
0,/*,4
1,"* Copyright 2017 MongoDB, Inc.",4
2,*,4
3,"* Licensed under the Apache License, Version ...",4
4,* you may not use this file except in complia...,4


In [95]:
docs = df["text"]
y = df["target"]

In [96]:
vectorizer = TfidfVectorizer(ngram_range=(1,4), analyzer="char")

In [99]:
clf = RandomForestClassifier(max_depth=50)

In [110]:
model = make_pipeline(vectorizer, clf)

In [111]:
cv_res = cross_validate(model, docs, y, scoring="accuracy", cv=5, n_jobs=5)

In [112]:
cv_res



{'fit_time': array([69.30797696, 75.06799793, 79.50800252, 75.48804307, 71.70800161]),
 'score_time': array([4.74000525, 4.47999787, 4.76800275, 4.54396057, 4.60399294]),
 'test_score': array([0.81360776, 0.81826732, 0.80792187, 0.81706699, 0.79720001]),
 'train_score': array([0.87771417, 0.87350106, 0.88704806, 0.88184013, 0.8779707 ])}

In [113]:
model.fit(docs, y)

Pipeline(memory=None,
     steps=[('tfidfvectorizer', TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 4), norm='l2', preprocessor=None, smooth_i...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

In [118]:
with gzip.open('model.pkl.gz', 'wb') as f:
    pickle.dump([model, labels], f)