In [8]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score,learning_curve  
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB

from xgboost import XGBClassifier

from sklearn.metrics import classification_report

from sklearn.metrics import confusion_matrix

**Data preparation**

In [9]:
df= pd.read_csv('data.csv')

In [10]:
X=[]
y=[]

len(df)

16272

In [11]:
df = df.dropna() # remove nan from Data Frame

In [12]:
len(df)

16268

In [13]:
import seaborn as sns

x=df['language'].value_counts()
print(len(x))

19


In [None]:
for language in df['language']:
    for code_snippet in df['file_body']:
        X.append(code_snippet)
        y.append(language)

In [None]:
print ("Vectorization started")
cv = TfidfVectorizer(input ='X', stop_words = {'english'},lowercase=True,analyzer ='word',max_features =10000,min_df=10)

X1 = cv.fit_transform(X)

Vectorization started


In [None]:
X1.shape

(2000000, 8443)

In [None]:
cv.get_feature_names()

['00',
 '000',
 '0000',
 '000000',
 '0000000',
 '00000000000000000000000000000000',
 '00000000000000000000000000000000000000000000000000000000000',
 '00000000000000000000000000000001',
 '00000000000000000000000000000010',
 '00000000000000000000000000000011',
 '00000000000000000000000000001101',
 '00000000000000000000000000001110',
 '00000000000000000000000000001111',
 '00000000000000000000000001110000',
 '00000000000000000000ffffc0a80001',
 '00002',
 '00007',
 '00011111100000111111100000000011111110000000111111000000000',
 '00011111100000111111100000000111111100000000000000000000000',
 '00011111111111111110000000000111111100000000000000000000000',
 '00011111111111111111000000000111111100000000000000000000000',
 '0007',
 '000i',
 '001',
 '0010010010',
 '002',
 '003',
 '0030278255',
 '004',
 '005',
 '01',
 '010',
 '0100100',
 '0100101101',
 '011',
 '01110000111000001110000111000000',
 '01110001111000001110000000000000',
 '01110001111000001111001111000000',
 '01110001111011100111111110011

In [None]:
#Lets split the data into train-test
X_train, X_test, y_train, y_test = train_test_split(X1, y, test_size=0.20, random_state=42)

In [None]:
print (X_train.shape)

(1600000, 8443)


In [None]:
clf = RandomForestClassifier()
y_pred = clf.fit(X_train, y_train).predict(X_test)


In [None]:
def plot_confusion_matrix(cm, classes,
                          normalize=True,
                          title='Confusion matrix',
                          cmap=plt.cm.Blues):
    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        print("Normalized confusion matrix")
    else:
        print('Confusion matrix, without normalization')


    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes, rotation=90)
    plt.yticks(tick_marks, classes)

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, format(cm[i, j], fmt),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted Label')
    plt.show()
    plt.savefig("RandomForestClassifierUsingCode.png", dpi=300)


In [None]:
confusion_matrix(y_test,y_pred1)

In [None]:
plot_confusion_matrix(cnf_matrix1, classes=labels,title='Confusion Matrix for XG BOOST Classifier',normalize=True)

In [None]:
classification_report = classification_report(y_test, y_pred, target_names=labels)
print (classification_report)

**Testowanie modelu**

In [None]:
fortran_snippet = str("""program hello
  ! This is a comment line; it is ignored by the compiler
  print *, 'Hello, World!'
end program hello""")

In [None]:
X_new_counts = cv.transform([fortran_snippet])

predicted = clf.predict(X_new_counts)
print('predicted as',predicted)

In [None]:
python3_snippet = str("""def all(iterable):
    for element in iterable:
        if not element:
            return False
    return True""")

In [None]:
X_new_counts = cv.transform([python3_snippet])

predicted = clf.predict(X_new_counts)
print('predicted as',predicted)

**XGBoost**

In [None]:
clf1 = XGBClassifier()
y_pred1 = clf1.fit(X_train, y_train).predict(X_test)

In [None]:
cnf_matrix1 =  confusion_matrix(y_test,y_pred1)

In [None]:
plot_confusion_matrix(cnf_matrix1, classes=labels,title='Confusion Matrix for XG BOOST Classifier',normalize=True)

In [None]:
fortran_snippet = str("""program hello
  ! This is a comment line; it is ignored by the compiler
  print *, 'Hello, World!'
end program hello""")

In [None]:
X_new_counts = cv.transform([fortran_snippet])

predicted = clf1.predict(X_new_counts)
print('predicted as',predicted)

**The multinomial Naive Bayes classifier**

In [None]:
clf2=MultinomialNB().fit(X_train, y_train).predict(X_test)

In [None]:
cnf_matrix2 =  confusion_matrix(y_test,pred2)

In [None]:
plot_confusion_matrix(cnf_matrix2, classes=labels,title='Confusion Matrix for Multinomial Naive Bayes',normalize=True)

In [None]:
fortran_snippet = str("""program hello
  ! This is a comment line; it is ignored by the compiler
  print *, 'Hello, World!'
end program hello""")

In [None]:
X_new_counts = cv.transform([fortran_snippet])

predicted = clf2.predict(X_new_counts)
print('predicted as',predicted)

**LightGBM Classifier**

In [None]:
import lightgbm as lgb
clf3 = lgb.LGBMClassifier()
pred3=clf3.fit(X_train, y_train).predict(X_test)

In [None]:
cnf_matrix3 =  confusion_matrix(y_test,pred3)

In [None]:
plot_confusion_matrix(cnf_matrix3, classes=labels,title='Confusion Matrix for Light GBM',normalize=True)

In [None]:
fortran_snippet = str("""program hello
  ! This is a comment line; it is ignored by the compiler
  print *, 'Hello, World!'
end program hello""")

In [None]:
X_new_counts = cv.transform([fortran_snippet])

predicted = clf3.predict(X_new_counts)
print('predicted as',predicted)