In [1]:
import re, os
import unicodedata
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import nltk.sentiment

from wordcloud import WordCloud

from acquire_c import *
from prepare_c import *
from explore_c import *
from model_m import *

import warnings

# Silence every warning for the rest of the session so library chatter does
# not drown out the cell outputs.
# NOTE(review): this also hides deprecation warnings -- consider narrowing it
# when upgrading dependencies.
warnings.simplefilter('ignore')

# Default size (in inches) for every figure drawn in this notebook.
plt.rc('figure', figsize=(13, 7))

# matplotlib 3.6 renamed its bundled seaborn styles to 'seaborn-v0_8-*' and
# newer releases no longer accept the old spelling, so use whichever name this
# installation actually provides instead of failing on a fresh environment.
_DARKGRID_STYLES = ('seaborn-darkgrid', 'seaborn-v0_8-darkgrid')
plt.style.use(next(
    (style for style in _DARKGRID_STYLES if style in plt.style.available),
    'seaborn-darkgrid',  # neither is listed: fall back to the original request
))

## Acquire data and find the dominant language in each row

In [2]:
# Fetch the cleaned README dataframe for JavaScript repositories.
# `lang_threshold` is optional -- per the original note its default is 75 --
# and is raised to 100 here; `z_cutoff` is set to 0.5.
# The recorded output suggests the result is cached to a CSV named after
# these arguments.
df = get_readme_data(
    lang='javascript',
    lang_threshold=100,
    z_cutoff=0.5,
)

Did not find the file javascript_clean_readme_100_z0_5.csv
cleaning data, hold your horses....
cleaning the orginial data
Removing words who's zscore falls below the cutoff, this will take a moment
calculating word counts, please wait...
Before: 32439 words in the dataframe
After: 1139 words will remain
Removing the words from the column
stemming the reduced cleaned data
lemmatizing the reduced claned data
Removing words who's zscore falls below the cutoff, this will take a moment
calculating word counts, please wait...
Before: 102284 words in the dataframe
After: 2927 words will remain
Removing the words from the column
stemming the reduced cleaned data
lemmatizing the reduced claned data
saved file: javascript_clean_readme_100_z0_5.csv


In [15]:
# Target language and the name given to the complementary class.
lang = 'javascript'
not_lang = 'not_' + lang

# Left disabled by the author and kept for reference; the dataframe shown by
# df.head() already carries a `label` column.
# df['label']  = df.prog_lang.apply(lambda x: lang_or_not(x, lang))

# Exploration helper built on the `label` and `cleaned` columns.
java_obj = NLP_explore(
    df,
    'label',
    'cleaned',
    lang,
    not_lang,
)

## Modeling

In [16]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import KFold, cross_val_score, cross_validate
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


In [17]:
# Peek at the first five rows to see which columns are available for modeling.
df.head(n=5)

Unnamed: 0,prog_lang,original,cleaned,label,stemmed,lemmatized
18,Python,game this game is done by python,game game done python,python,game game done python,game game done python
23,Python,Attendance-provider Make a attendance in a exc...,make excel file screenshot google,python,make excel file screenshot googl,make excel file screenshot google
24,Python,Open-cv-tutorial All the function for open cv,function open,python,function open,function open
27,Python,Python Text App Using Twilio API With a free T...,python text app using api free account get tex...,python,python text app use api free account get text ...,python text app using api free account get tex...
29,Python,Real-Time Voice Cloning This repository is an ...,realtime repository implementation works realt...,python,realtim repositori implement work realtim feel...,realtime repository implementation work realti...


In [220]:
# Fixed seed for the estimators that draw random numbers (decision tree and
# random forest). Without it their scores drift from run to run, which makes
# iterations impossible to compare.
# NOTE(review): any shuffling done inside NLP_model is outside this cell and
# is not controlled by this seed.
RANDOM_STATE = 42

# Each display name sits next to its estimator so the two lists derived below
# cannot fall out of step when a model is added, removed or reordered.
_named_classifiers = [
    ('K Nearest Neighbors', KNeighborsClassifier(n_neighbors=6)),
    ('Decision Tree', DecisionTreeClassifier(max_depth=7,
                                             random_state=RANDOM_STATE)),
    ('Random Forest', RandomForestClassifier(n_estimators=10,
                                             random_state=RANDOM_STATE)),
    ('Gaussian N-Bayes', GaussianNB()),
    ('Multinomial N-Bayes', MultinomialNB(alpha=.5)),
]

# `names` and `classifiers` keep their original order and contents.
names = [label for label, estimator in _named_classifiers]
classifiers = [estimator for label, estimator in _named_classifiers]

# NOTE(review): the data was acquired with lang='javascript' (lower case)
# while 'JavaScript' is passed here -- confirm which spelling NLP_model expects.
model_obj = NLP_model(df, classifiers, names, lang = 'JavaScript')

In [221]:
# Score every configured classifier with splits=10 and keep what comes back.
# NOTE(review): the recorded output of this cell ends with an estimator repr
# rather than a table, so metrics() may return the best model instead of a
# DataFrame -- confirm before relying on the name `metric_df`.
metric_df = model_obj.metrics(splits=10)
metric_df

Have not run count_vectorize method yet, running now...
Creating vectorized dataframe now. Vectorization may take a while, please wait...
All done! Moving on to modeling, this may take a while...
K Nearest Neighbors: Validate accuracy: 0.8206901098901099
Decision Tree: Validate accuracy: 0.8643688311688311
Random Forest: Validate accuracy: 0.8999472527472527
Gaussian N-Bayes: Validate accuracy: 0.8932528471528471
Multinomial N-Bayes: Validate accuracy: 0.9159434565434564


Unnamed: 0,model,average_accuracy%
4,Multinomial N-Bayes,91.59
2,Random Forest,89.99
3,Gaussian N-Bayes,89.33
1,Decision Tree,86.44
0,K Nearest Neighbors,82.07


Best model: MultinomialNB(alpha=0.5)
Validate score: 0.931
Test Score: 92.4%


MultinomialNB(alpha=0.5)

## Modeling Performance:
### JavaScript
##### Hyperparams:
- KNeighborsClassifier(n_neighbors = 3)
- DecisionTreeClassifier(max_depth = 5)
- RandomForestClassifier(max_depth = 5, n_estimators = 10, max_features = 1)
- GaussianNB()
- MultinomialNB()


> #### First Iteration (75% lang threshold, no zscore)
> - K Nearest Neighbors: Accuracy: 0.4437958746786057
> - Decision Tree: Accuracy: 0.5243970413600353
> - Random Forest: Accuracy: 0.3019957833163259
> - Gaussian N-Bayes: Accuracy: 0.43499919463886333
> - Multinomial N-Bayes: Accuracy: 0.5169951603916912

> #### Second Iteration (75% lang threshold, zscore .5)
> - K Nearest Neighbors: Accuracy: 0.42000479731350443
> - Decision Tree: Accuracy: 0.6441992484208843
> - Random Forest: Accuracy: 0.6461981290477333
> - Gaussian N-Bayes: Accuracy: 0.5742384264811705
> - Multinomial N-Bayes: Accuracy: 0.6247701287279124

> #### Third Iteration (100% lang threshold, zscore .5)
> - K Nearest Neighbors: Validate accuracy: 0.8817125081859856
> - Decision Tree: Validate accuracy: 0.9279783272817043
> - Random Forest: Validate accuracy: 0.8400252694089291
> - Gaussian N-Bayes: Validate accuracy: 0.9172856871827753
> - Multinomial N-Bayes: Validate accuracy: 0.9334353652863924

> #### Fourth Iteration (100% lang threshold, zscore .5, KFolds = 10)
> - K Nearest Neighbors: Validate accuracy: 0.8886948083454633
> - Decision Tree: Validate accuracy: 0.9277563718354882
> - Random Forest: Validate accuracy: 0.8400267336434817
> - Gaussian N-Bayes: Validate accuracy: 0.9172855362426388
> - Multinomial N-Bayes: Validate accuracy: 0.9351794769339079

##### Hyperparams:
- MultinomialNB(alpha = .5)

> #### Best Model Iteration: (100% lang threshold, zscore .5, KFolds = 10)
> - Multinomial N-Bayes: Validate accuracy: 0.9412896842385668

### Python:
##### Hyperparams:
- KNeighborsClassifier(n_neighbors = 6),
- DecisionTreeClassifier(max_depth = 7),
- RandomForestClassifier(n_estimators = 10),
- GaussianNB(),
- MultinomialNB(alpha = .5)

> #### First iteration: (75% threshold, no zscore)
> - K Nearest Neighbors: Validate accuracy: 0.8206901098901099
> - Decision Tree: Validate accuracy: 0.8648681318681319
> - Random Forest: Validate accuracy: 0.9007505494505494
> - Gaussian N-Bayes: Validate accuracy: 0.8932528471528471
> - Multinomial N-Bayes: Validate accuracy: 0.9159434565434564

> #### Second iteration: (75% threshold, zscore .5)
> - K Nearest Neighbors: Validate accuracy: 0.8206901098901099
> - Decision Tree: Validate accuracy: 0.8635692307692308
> - Random Forest: Validate accuracy: 0.8984497502497503
> - Gaussian N-Bayes: Validate accuracy: 0.8932528471528471
> - Multinomial N-Bayes: Validate accuracy: 0.9159434565434564

> #### Third iteration: (100% threshold, zscore .5)
> - K Nearest Neighbors: Validate accuracy: 0.7704337190524877
> - Decision Tree: Validate accuracy: 0.8218125615128089
> - Random Forest: Validate accuracy: 0.8699010061823358
> - Gaussian N-Bayes: Validate accuracy: 0.8718856602295204
> - Multinomial N-Bayes: Validate accuracy: 0.885873792437866

> #### Fourth iteration: (90% threshold, zscore .5)
> - K Nearest Neighbors: Validate accuracy: 0.7991583768344331
> - Decision Tree: Validate accuracy: 0.8479582389441545
> - Random Forest: Validate accuracy: 0.8916349847335763
> - Gaussian N-Bayes: Validate accuracy: 0.8852309662168818
> - Multinomial N-Bayes: Validate accuracy: 0.9072623855018221

##### Hyperparams:
- MultinomialNB(alpha = .5)

> #### Best Model Iteration: (75% threshold, zscore .5)
> - Multinomial N-Bayes: Validate accuracy: 0.9138447552447554
> - Test Score: 92.4%

In [8]:
# Show the term-frequency table; in the recorded output it is indexed by word
# with raw_count, frequency and augmented_frequency columns.
model_obj.tf()

Unnamed: 0,raw_count,frequency,augmented_frequency
data,25128,0.008218,1.000000
use,20312,0.006643,0.808341
gt,19874,0.006500,0.790911
yes,19795,0.006474,0.787767
project,18964,0.006202,0.754696
...,...,...,...
invalidation,100,0.000033,0.003980
convolution,100,0.000033,0.003980
ranking,100,0.000033,0.003980
multilingual,100,0.000033,0.003980


In [9]:
# Show the TF-IDF table; in the recorded output it has one column per term
# and one row per entry of the dataframe index.
model_obj.tf_idf()

Unnamed: 0,00,01,02,03,04,05,0527,0528,06,07,...,youll,youre,youtube,youtubedl,youve,zappa,zero,zip,zoom,zsh
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.200226,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12502,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
12503,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
12504,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.078921,0.0,0.0,0.025644,0.0,0.0,0.0,0.0,0.0
12505,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
