In [69]:
from pprint import pprint

import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt

import ry_prepare as p
import acquire_ry as a
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
import ry_wrangle as w

from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

In [2]:
df = a.get_github2(cached=True)

In [3]:
df.head()

Unnamed: 0,language,content
0,GLSL,EnvironmentalVisualEnhancements\nVisual enhanc...
1,PowerShell,dbachecks\n\ndbachecks is a framework created ...
2,Python,ESC-50: Dataset for Environmental Sound Classi...
3,JavaScript,Leaflet Environmental Layers (LEL)\n\n \n\n\n\...
4,PHP,Emoncms\n\n\nEmoncms is an open-source web app...


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 827 entries, 0 to 826
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   language  827 non-null    object
 1   content   827 non-null    object
dtypes: object(2)
memory usage: 19.4+ KB


In [5]:
df = p.prep_data(df, 'content', extra_words=[], exclude_words=[])
df.head()

Unnamed: 0,language,content,clean,stemmed,lemmatized,words,doc_length
2,Python,ESC-50: Dataset for Environmental Sound Classi...,esc50 dataset environmental sound classificati...,esc50 dataset environment sound classif overvi...,esc50 dataset environmental sound classificati...,"[esc50, dataset, environmental, sound, classif...",1313
3,JavaScript,Leaflet Environmental Layers (LEL)\n\n \n\n\n\...,leaflet environmental layers lel leaflet plugi...,leaflet environment layer lel leaflet plugin c...,leaflet environmental layer lel leaflet plugin...,"[leaflet, environmental, layer, lel, leaflet, ...",1098
5,Python,\n\nEARS: Environmental Audio Recognition Syst...,ears environmental audio recognition system ea...,ear environment audio recognit system ear proo...,ear environmental audio recognition system ear...,"[ear, environmental, audio, recognition, syste...",401
6,Python,Mycodo\nEnvironmental Regulation System\nLates...,mycodo environmental regulation system latest ...,mycodo environment regul system latest version...,mycodo environmental regulation system latest ...,"[mycodo, environmental, regulation, system, la...",2447
7,Python,NO LONGER SUPPORTED\nEBOWLA\nUSAGE: ./ebowla.p...,longer supported ebowla usage ebowlapy exedlls...,longer support ebowla usag ebowlapi exedllshel...,longer supported ebowla usage ebowlapy exedlls...,"[longer, supported, ebowla, usage, ebowlapy, e...",203


In [6]:
train_exp, X_train, y_train, X_validate, y_validate, X_test, y_test = w.split(df,'language')
train_exp

Unnamed: 0,language,content,clean,stemmed,lemmatized,words,doc_length
26,HTML,Environmental_Data_Analytics\nData analytics c...,environmentaldataanalytics data analytics cour...,environmentaldataanalyt data analyt cours duke...,environmentaldataanalytics data analytics cour...,"[environmentaldataanalytics, data, analytics, ...",74
36,Java,MinCED - Mining CRISPRs in Environmental Datas...,minced mining crisprs environmental datasets m...,minc mine crispr environment dataset minc prog...,minced mining crisprs environmental datasets m...,"[minced, mining, crisprs, environmental, datas...",219
35,Python,Open Simulation Interface (OSI)\n\nThe Open Si...,open simulation interface osi open simulation ...,open simul interfac osi open simul interfac 1 ...,open simulation interface osi open simulation ...,"[open, simulation, interface, osi, open, simul...",259
156,Python,Community Water Model (CWatM)\nIIASA\n13rd Oct...,community water model cwatm iiasa 13rd october...,commun water model cwatm iiasa 13rd octob 2020...,community water model cwatm iiasa 13rd october...,"[community, water, model, cwatm, iiasa, 13rd, ...",245
546,HTML,EnvironmentalGame\nThis game was developed for...,environmentalgame game developed ' environment...,environmentalgam game develop ' environ scienc...,environmentalgame game developed ' environment...,"[environmentalgame, game, developedenvironment...",9
...,...,...,...,...,...,...,...
807,JavaScript,environmental-issues.\n,environmentalissues,environmentalissu,environmentalissues,[environmentalissues],1
192,Java,EnvironmentalProject\n,environmentalproject,environmentalproject,environmentalproject,[environmentalproject],1
2,Python,ESC-50: Dataset for Environmental Sound Classi...,esc50 dataset environmental sound classificati...,esc50 dataset environment sound classif overvi...,esc50 dataset environmental sound classificati...,"[esc50, dataset, environmental, sound, classif...",1313
308,Python,Image Dehazing via Joint Estimation of Transmi...,image dehazing via joint estimation transmitta...,imag dehaz via joint estim transmitt map envir...,image dehazing via joint estimation transmitta...,"[image, dehazing, via, joint, estimation, tran...",167


In [7]:
#df.language.value_counts().head(15)

In [8]:
#df.info()

In [9]:
#remove_lang_list = ['CoffeeScript','Other','Go','Stata','Lua','Kotlin','Objective-C','Shell','Vim script','Batchfile','Swift','PowerShell','Vue', 'Mathematica','TypeScript']

In [10]:
#df = df[~df.language.isin(remove_lang_list)]

In [11]:
print(X_train.shape)
print(X_validate.shape)
print(X_test.shape)

(234, 6)
(79, 6)
(79, 6)


In [12]:
from sklearn.feature_extraction.text import CountVectorizer

# Create CountVectorizer, which create bag-of-words model.
# stop_words : Specify language to remove stopwords. 
# min_df: ignore terms that have a document frequency strictly 
# lower than the given threshold. This value is also called cut-off in the literature. 
# If float, the parameter represents a proportion of documents, integer absolute counts. 
# ngram_range: the lower and upper boundary of the range of n-values for 
# different word n-grams or char n-grams to be extracted. 

vectorizer = CountVectorizer(stop_words='english', 
                             min_df=20, 
                             ngram_range=(1,2), 
                             binary=True)

# Learn vocabulary in sentences. 
vectorizer.fit(X_train.clean)

# Get dictionary. 
vectorizer.get_feature_names()

['access',
 'add',
 'allows',
 'analysis',
 'api',
 'app',
 'application',
 'available',
 'based',
 'build',
 'cd',
 'change',
 'check',
 'class',
 'clone',
 'code',
 'command',
 'config',
 'configuration',
 'contains',
 'control',
 'copy',
 'create',
 'created',
 'creating',
 'current',
 'data',
 'database',
 'default',
 'dependencies',
 'description',
 'designed',
 'details',
 'developed',
 'development',
 'different',
 'directory',
 'documentation',
 'download',
 'end',
 'environment',
 'environmental',
 'example',
 'features',
 'file',
 'files',
 'folder',
 'following',
 'free',
 'git',
 'git clone',
 'github',
 'help',
 'image',
 'include',
 'including',
 'information',
 'input',
 'install',
 'installation',
 'installed',
 'instructions',
 'interface',
 'license',
 'like',
 'line',
 'list',
 'local',
 'location',
 'look',
 'make',
 'make sure',
 'model',
 'module',
 'monitoring',
 'multiple',
 'need',
 'needs',
 'new',
 'note',
 'number',
 'object',
 'open',
 'order',
 'output',
 

In [13]:
# Transform each sentences in vector space.
bow = vectorizer.transform(X_train.clean)

In [14]:
# this is just to see the array of 0's and 1's
bow_array = bow.toarray()
bow_array[3]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 0,
       0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 1, 0])

In [15]:
# Show sentences and vector space representation.
# purely to visualize what's happening.
for i, v in zip(X_train.clean, bow_array):
    print(i)
    print(v)

environmentaldataanalytics data analytics course duke university course code env 872l user kateri salk instructions using repository fork repository github account clone forked repository onto local drive pull updates repository add repository upstream remote git remote add upstream httpsgithubcomkaterisalkenvironmentaldataanalytics verify repository upstream remote git remote v repository listed origin repository listed upstream pull updates repository git pull upstream master git fetch upstream git merge upstreammaster conflict arises merge update files liking stage commit testing merge error taylor
[0 1 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0]
minced mining crisprs environmental datasets minced program find clustered regularly interspaced short

 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
electric imp environmental data streaming hub electric imp environmental data streaming electric imp nora motion environmental data streaming tutorials find code materials tutorials well tutorials wiki lot internetconnected devices price going usually cheaper harder program make secure electric imp platform paired hardware makes connecting internet quickly securely seamlessly piece cake hard network work without change thing way focus data want collect send ' super cool looking read
[0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0]
national co2 emissions fossilfuel burning cement manufacture gas flaring 17512014 contributors ta boden rj andres carbon dioxide information analysis center en

In [16]:
X_bow = bow

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(stop_words='english', min_df=20, 
                             ngram_range=(1,2), 
                             binary=True)

tfidf_sparse_matrix = tfidf.fit_transform(X_train.clean)
tfidf_sparse_matrix

<234x142 sparse matrix of type '<class 'numpy.float64'>'
	with 4862 stored elements in Compressed Sparse Row format>

In [18]:
pd.DataFrame(tfidf_sparse_matrix.todense(), columns=tfidf.get_feature_names())

Unnamed: 0,access,add,allows,analysis,api,app,application,available,based,build,...,uses,using,value,values,version,want,way,web,work,working
0,0.0,0.276654,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.0,0.200326,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
1,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.0,0.117245,0.0,0.0,0.156506,0.179983,0.200383,0.0,0.000000,0.000000
2,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.175979,0.186493,...,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
3,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.200548,0.000000
4,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
229,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
230,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.000000,0.000000
231,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.146907,0.149583,0.000000,...,0.0,0.105442,0.0,0.0,0.140751,0.000000,0.000000,0.0,0.000000,0.175556
232,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,0.000000,...,0.0,0.156853,0.0,0.0,0.000000,0.000000,0.000000,0.0,0.212915,0.000000


In [19]:
# Get vocabularies.
tfidf.vocabulary_

{'data': 26,
 'code': 15,
 'user': 131,
 'instructions': 61,
 'using': 133,
 'repository': 98,
 'github': 51,
 'clone': 14,
 'local': 67,
 'add': 1,
 'git': 49,
 'update': 127,
 'files': 45,
 'environmental': 41,
 'program': 89,
 'want': 137,
 'read': 97,
 'tool': 125,
 'installation': 59,
 'need': 76,
 'install': 58,
 'dependencies': 29,
 'source': 116,
 'directory': 36,
 'simple': 114,
 'cd': 10,
 'make': 70,
 'run': 104,
 'help': 52,
 'page': 85,
 'version': 136,
 'way': 138,
 'note': 79,
 'folder': 46,
 'number': 80,
 'example': 42,
 'output': 84,
 'file': 44,
 'time': 124,
 'license': 63,
 'free': 48,
 'software': 115,
 'public': 95,
 'details': 32,
 'copy': 21,
 'open': 82,
 'interface': 62,
 'based': 8,
 'requirements': 101,
 'test': 123,
 'development': 34,
 'environment': 40,
 'information': 56,
 'documentation': 37,
 'usage': 128,
 'python': 96,
 'line': 65,
 'set': 112,
 'use': 129,
 'sudo': 119,
 'build': 9,
 'pip': 86,
 'git clone': 50,
 'pip install': 87,
 'model': 72,
 '

In [20]:
# Transform to document-term matrix
vector_spaces = tfidf.transform(X_train.clean)
vector_spaces.toarray()

array([[0.        , 0.27665357, 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ],
       ...,
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.17555646],
       [0.        , 0.        , 0.        , ..., 0.        , 0.21291492,
        0.        ],
       [0.        , 0.        , 0.        , ..., 0.        , 0.        ,
        0.        ]])

In [21]:
# Show sentences and vector space representation.
# 
# (A, B) C
# A : Document Index
# B : Specific word-vector index
# C : TF-IDF score
for i, v in zip(X_train.clean, vector_spaces):
    print(i)
    print(v)

environmentaldataanalytics data analytics course duke university course code env 872l user kateri salk instructions using repository fork repository github account clone forked repository onto local drive pull updates repository add repository upstream remote git remote add upstream httpsgithubcomkaterisalkenvironmentaldataanalytics verify repository upstream remote git remote v repository listed origin repository listed upstream pull updates repository git pull upstream master git fetch upstream git merge upstreammaster conflict arises merge update files liking stage commit testing merge error taylor
  (0, 133)	0.20032584697705205
  (0, 131)	0.2791028227330898
  (0, 127)	0.34710454361729953
  (0, 98)	0.247413767015319
  (0, 67)	0.3293833913226699
  (0, 61)	0.3215611267079073
  (0, 51)	0.32539702637573514
  (0, 49)	0.28161256203573326
  (0, 45)	0.23384173274879666
  (0, 26)	0.19053482266623098
  (0, 15)	0.22767988999020847
  (0, 14)	0.28953651678151426
  (0, 1)	0.2766535650016892
mince

In [22]:
X_tfidf = tfidf_sparse_matrix

# Modeling Logistic Regression

In [None]:
def logistic_regression(X_train, y_train):
    '''
    This function takes in X_train (features using for model) and y_train (target 'win') and performs logistic
    regression giving us accuracy of the model and the classification report
    '''
    # Calling out funtion
    logit = LogisticRegression()

    # Fit the training data set
    logit = logit.fit(X_train, y_train)

    # Make predictions
    y_pred = logit.predict(X_train)

    #Accuracy of model
    score = logit.score(X_train, y_train)

    print(f'The logistic regression models accuracy is {round(score * 100,2)}%\n')     
    print(f'Confusion Matrix\n\n {confusion_matrix(y_train, y_pred)}\n') 
    
    # Coefficients for each feature  
    coef_df = pd.DataFrame(logit.coef_)

    print(f'Classification Report\n {classification_report(y_train, y_pred)}')

In [54]:
y_train.value_counts()

language  
Python        83
JavaScript    62
HTML          49
Java          40
dtype: int64

In [56]:
train_exp.language.value_counts()

Python        83
JavaScript    62
HTML          49
Java          40
Name: language, dtype: int64

In [57]:
y_train.value_counts()

language  
Python        83
JavaScript    62
HTML          49
Java          40
dtype: int64

In [58]:
len(train_exp)

234

In [59]:
# Getting baseline
baseline = len(train_exp[train_exp.language == 'Python']) / len(train_exp)
round(baseline,2)

0.35

In [53]:
y = y_train

X_bow
X_tfidf

<234x142 sparse matrix of type '<class 'numpy.float64'>'
	with 4862 stored elements in Compressed Sparse Row format>

In [55]:
from sklearn.linear_model import LogisticRegression

lm = LogisticRegression().fit(X_bow, y)

X_train['predicted'] = lm.predict(X_bow)
# test['predicted'] = lm.predict(X_test)

  return f(**kwargs)


In [60]:
from sklearn.metrics import confusion_matrix, classification_report

confusion_matrix(y.language, X_train.predicted)

array([[41,  2,  5,  1],
       [12, 26,  0,  2],
       [ 8,  1, 50,  3],
       [ 4,  3,  1, 75]])

In [61]:
confusion_matrix(y.language, X_train.predicted)

array([[41,  2,  5,  1],
       [12, 26,  0,  2],
       [ 8,  1, 50,  3],
       [ 4,  3,  1, 75]])

In [28]:
pd.crosstab(X_train.predicted, y_train.language)

language,HTML,Java,JavaScript,Python
predicted,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HTML,41,12,8,4
Java,2,26,1,3
JavaScript,5,0,50,1
Python,1,2,3,75


In [62]:
print(classification_report(y.language, X_train.predicted))

              precision    recall  f1-score   support

        HTML       0.63      0.84      0.72        49
        Java       0.81      0.65      0.72        40
  JavaScript       0.89      0.81      0.85        62
      Python       0.93      0.90      0.91        83

    accuracy                           0.82       234
   macro avg       0.82      0.80      0.80       234
weighted avg       0.84      0.82      0.82       234



In [63]:
lm_tfidf = LogisticRegression().fit(X_tfidf, y)
X_train['pred_tfidf'] = lm_tfidf.predict(X_tfidf)

  return f(**kwargs)


In [65]:
pd.crosstab(y.language, X_train.pred_tfidf)

pred_tfidf,HTML,Java,JavaScript,Python
language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HTML,30,0,4,15
Java,13,11,2,14
JavaScript,8,1,39,14
Python,4,0,2,77


In [67]:
print(classification_report(y.language, X_train.pred_tfidf))

              precision    recall  f1-score   support

        HTML       0.55      0.61      0.58        49
        Java       0.92      0.28      0.42        40
  JavaScript       0.83      0.63      0.72        62
      Python       0.64      0.93      0.76        83

    accuracy                           0.67       234
   macro avg       0.73      0.61      0.62       234
weighted avg       0.72      0.67      0.65       234



# Decesion Tree

In [153]:
def decesion_tree(X_train, y_train, k):
    '''
    This function requires X_train, y_train and k (max_depth). A confusion matrix, models accuracy and 
    classification report are outputed
    '''
    # Creating the decision tree object
    clf = DecisionTreeClassifier(max_depth=k, random_state=123)

    # Fitting the data to the trained data
    clf.fit(X_bow, y)

   # Array of the predicitons
    X_train['predicted'] = clf.predict(X_bow)

    # Crosstab confusion matrix
    pd.crosstab(y.language, X_train.predicted)
    
    # Classification report
    print(classification_report(y.language, X_train.predicted))
                                       
    clf_tfidf = clf.fit(X_tfidf, y)
    X_train['pred_tfidf'] = clf_tfidf.predict(X_tfidf)
                                       
    # Confusion matrix
    # Confusion matrix
    print('Accuracy: {:.2%}'.format(accuracy_score(y_train.language, X_train.pred_tfidf)))
    print(f'Confusion Matrix: \n\n {pd.crosstab(y.language, X_train.pred_tfidf)}\n' )
    print("K-Nearest Neighbor Classification Report:\n", classification_report(y.language, X_train.pred_tfidf))

In [154]:
decesion_tree(X_train, y_train, k=3)

              precision    recall  f1-score   support

        HTML       0.26      0.67      0.38        49
        Java       0.75      0.07      0.14        40
  JavaScript       0.78      0.34      0.47        62
      Python       0.66      0.60      0.63        83

    accuracy                           0.46       234
   macro avg       0.61      0.42      0.40       234
weighted avg       0.62      0.46      0.45       234

Accuracy: 48.29%
Confusion Matrix: 

 pred_tfidf  HTML  Java  JavaScript  Python
language                                  
HTML          46     0           0       3
Java          35     3           1       1
JavaScript    37     0          22       3
Python        38     0           3      42

K-Nearest Neighbor Classification Report:
               precision    recall  f1-score   support

        HTML       0.29      0.94      0.45        49
        Java       1.00      0.07      0.14        40
  JavaScript       0.85      0.35      0.50        62
      Py

In [78]:
# for classification you can change the algorithm to gini or entropy (information gain).  Default is gini.
clf = DecisionTreeClassifier(max_depth=3, random_state=123)

In [80]:
# Fitting the data to the trained data
clf.fit(X_bow, y)

DecisionTreeClassifier(max_depth=3, random_state=123)

In [81]:
# Array of the predicitons
X_train['predicted'] = clf.predict(X_bow)

In [83]:
# Crosstab confusion matrix
pd.crosstab(y.language, X_train.predicted)

predicted,HTML,Java,JavaScript,Python
language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HTML,41,2,5,1
Java,12,26,0,2
JavaScript,8,1,50,3
Python,4,3,1,75


In [86]:
print(classification_report(y.language, X_train.predicted))

              precision    recall  f1-score   support

        HTML       0.63      0.84      0.72        49
        Java       0.81      0.65      0.72        40
  JavaScript       0.89      0.81      0.85        62
      Python       0.93      0.90      0.91        83

    accuracy                           0.82       234
   macro avg       0.82      0.80      0.80       234
weighted avg       0.84      0.82      0.82       234



In [87]:
clf_tfidf = clf.fit(X_tfidf, y)
X_train['pred_tfidf'] = clf_tfidf.predict(X_tfidf)

In [88]:
pd.crosstab(y.language, X_train.pred_tfidf)

pred_tfidf,HTML,Java,JavaScript,Python
language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HTML,46,0,0,3
Java,35,3,1,1
JavaScript,37,0,22,3
Python,38,0,3,42


In [89]:
print(classification_report(y.language, X_train.pred_tfidf))

              precision    recall  f1-score   support

        HTML       0.29      0.94      0.45        49
        Java       1.00      0.07      0.14        40
  JavaScript       0.85      0.35      0.50        62
      Python       0.86      0.51      0.64        83

    accuracy                           0.48       234
   macro avg       0.75      0.47      0.43       234
weighted avg       0.76      0.48      0.48       234



# Random Forest

In [147]:
def random_forest(X_train, y_train, k):
    # Random forest object
    rf = RandomForestClassifier(n_estimators=100, max_depth=k, random_state=123)

    # Fitting the data to the trained data
    rf.fit(X_bow, y)

    # Array of the predicitons
    X_train['predicted'] = rf.predict(X_bow)

    # Crosstab confusion matrix
    pd.crosstab(y.language, X_train.predicted)
    
    rf_tfidf = rf.fit(X_tfidf, y)
    X_train['pred_tfidf'] = rf_tfidf.predict(X_tfidf)

   # Confusion matrix
    print('Accuracy: {:.2%}'.format(accuracy_score(y_train.language, X_train.pred_tfidf)))
    print(f'Confusion Matrix: \n\n {pd.crosstab(y.language, X_train.pred_tfidf)}\n' )
    print("K-Nearest Neighbor Classification Report:\n", classification_report(y.language, X_train.pred_tfidf))

In [148]:
# K =4
random_forest(X_train, y_train, k=4)

  
  


Accuracy: 62.39%
Confusion Matrix: 

 pred_tfidf  HTML  Java  JavaScript  Python
language                                  
HTML          16     0          17      16
Java           0     3          16      21
JavaScript     0     0          47      15
Python         0     0           3      80

K-Nearest Neighbor Classification Report:
               precision    recall  f1-score   support

        HTML       1.00      0.33      0.49        49
        Java       1.00      0.07      0.14        40
  JavaScript       0.57      0.76      0.65        62
      Python       0.61      0.96      0.74        83

    accuracy                           0.62       234
   macro avg       0.79      0.53      0.51       234
weighted avg       0.75      0.62      0.56       234



In [137]:
# Random forest object
rf = RandomForestClassifier(n_estimators=100, max_depth=3, random_state=123)

In [138]:
# Fitting the data to the trained data
rf.fit(X_bow, y)

  


RandomForestClassifier(max_depth=3, random_state=123)

In [139]:
# Array of the predicitons
X_train['predicted'] = rf.predict(X_bow)

In [140]:
# Crosstab confusion matrix
pd.crosstab(y.language, X_train.predicted)

predicted,HTML,JavaScript,Python
language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
HTML,1,3,45
Java,0,4,36
JavaScript,0,26,36
Python,0,3,80


In [141]:
print(classification_report(y.language, X_train.predicted))

              precision    recall  f1-score   support

        HTML       1.00      0.02      0.04        49
        Java       0.00      0.00      0.00        40
  JavaScript       0.72      0.42      0.53        62
      Python       0.41      0.96      0.57        83

    accuracy                           0.46       234
   macro avg       0.53      0.35      0.29       234
weighted avg       0.54      0.46      0.35       234



  _warn_prf(average, modifier, msg_start, len(result))


In [142]:
rf_tfidf = rf.fit(X_tfidf, y)
X_train['pred_tfidf'] = rf_tfidf.predict(X_tfidf)

  """Entry point for launching an IPython kernel.


In [143]:
pd.crosstab(y.language, X_train.pred_tfidf)

pred_tfidf,HTML,Java,JavaScript,Python
language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HTML,11,0,2,36
Java,0,1,3,36
JavaScript,0,0,29,33
Python,0,0,1,82


In [144]:
print(classification_report(y.language, X_train.pred_tfidf))

              precision    recall  f1-score   support

        HTML       1.00      0.22      0.37        49
        Java       1.00      0.03      0.05        40
  JavaScript       0.83      0.47      0.60        62
      Python       0.44      0.99      0.61        83

    accuracy                           0.53       234
   macro avg       0.82      0.43      0.41       234
weighted avg       0.76      0.53      0.46       234



# KNN

In [145]:
def knn(X_train, y, k):
    # KNN object
    knn = KNeighborsClassifier(n_neighbors=k, weights='uniform')

    # Fit the model
    knn.fit(X_bow, y)

    # Make predictions
    X_train['predicted'] = knn.predict(X_bow)
    
    # Crosstab confusion matrix of predicted
    pd.crosstab(y.language, X_train.predicted)
    
    # Predicted Classification report
    print(classification_report(y.language, X_train.predicted))
    
    knn_tfidf = knn.fit(X_tfidf, y)
    X_train['pred_tfidf'] = knn_tfidf.predict(X_tfidf)

    # Confusion matrix
    print('Accuracy: {:.2%}'.format(accuracy_score(y_train.language, X_train.pred_tfidf)))
    print(f'Confusion Matrix: \n\n {pd.crosstab(y.language, X_train.pred_tfidf)}\n' )
    print("K-Nearest Neighbor Classification Report:\n", classification_report(y.language, X_train.pred_tfidf))

In [146]:
knn(X_train, y, k=5)

              precision    recall  f1-score   support

        HTML       0.39      0.59      0.47        49
        Java       0.31      0.50      0.38        40
  JavaScript       0.54      0.40      0.46        62
      Python       0.76      0.45      0.56        83

    accuracy                           0.47       234
   macro avg       0.50      0.49      0.47       234
weighted avg       0.55      0.47      0.49       234

Accuracy: 33.76%
Confusion Matrix: 

 pred_tfidf  HTML  Java  JavaScript  Python
language                                  
HTML          35    13           1       0
Java          12    25           2       1
JavaScript    19    36           5       2
Python        16    49           4      14

K-Nearest Neighbor Classification Report:
               precision    recall  f1-score   support

        HTML       0.43      0.71      0.53        49
        Java       0.20      0.62      0.31        40
  JavaScript       0.42      0.08      0.14        62
      Py

  


In [126]:
 # KNN object
knn = KNeighborsClassifier(n_neighbors=5, weights='uniform')

In [127]:
# Fitting the data to the trained data
knn.fit(X_bow, y)

  


KNeighborsClassifier()

In [128]:
# Array of the predicitons
X_train['predicted'] = knn.predict(X_bow)

In [129]:
# Crosstab confusion matrix
pd.crosstab(y.language, X_train.predicted)

predicted,HTML,Java,JavaScript,Python
language,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
HTML,29,11,4,5
Java,15,20,5,0
JavaScript,14,16,25,7
Python,16,18,12,37


In [130]:
print(classification_report(y.language, X_train.predicted))

              precision    recall  f1-score   support

        HTML       0.39      0.59      0.47        49
        Java       0.31      0.50      0.38        40
  JavaScript       0.54      0.40      0.46        62
      Python       0.76      0.45      0.56        83

    accuracy                           0.47       234
   macro avg       0.50      0.49      0.47       234
weighted avg       0.55      0.47      0.49       234



In [131]:
knn_tfidf = knn.fit(X_tfidf, y)
X_train['pred_tfidf'] = knn_tfidf.predict(X_tfidf)

  """Entry point for launching an IPython kernel.


In [132]:
print(f'Confusion Matrix: \n\n {pd.crosstab(y.language, X_train.pred_tfidf)}\n' )

Confusion Matrix: 

 pred_tfidf  HTML  Java  JavaScript  Python
language                                  
HTML          35    13           1       0
Java          12    25           2       1
JavaScript    19    36           5       2
Python        16    49           4      14



In [133]:
print(classification_report(y.language, X_train.pred_tfidf))

              precision    recall  f1-score   support

        HTML       0.43      0.71      0.53        49
        Java       0.20      0.62      0.31        40
  JavaScript       0.42      0.08      0.14        62
      Python       0.82      0.17      0.28        83

    accuracy                           0.34       234
   macro avg       0.47      0.40      0.31       234
weighted avg       0.53      0.34      0.30       234



In [134]:
print('Accuracy: {:.2%}'.format(accuracy_score(y_train.language, X_train.pred_tfidf)))

Accuracy: 33.76%
