In [13]:
import re, os
import unicodedata
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import nltk.sentiment

from wordcloud import WordCloud

from acquire_c import *
from prepare_c import *
from explore_c import *
from model_m import *

import warnings
warnings.simplefilter('ignore')

plt.rc('figure', figsize=(13, 7))
plt.style.use('seaborn-darkgrid')

## Acquire data and find the dominant language in each row

In [14]:
# You can pass a threshold argument but the default is 75
df = get_readme_data(lang = 'python', lang_threshold= 75, z_cutoff=0.5)

In [15]:
lang = 'python'
not_lang = f'not_{lang}'
# df['label']  = df.prog_lang.apply(lambda x: lang_or_not(x, lang))
java_obj = NLP_explore(df, 'label', 'cleaned', lang, not_lang)

## Modeling

In [16]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import KFold, cross_val_score, cross_validate
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer


In [17]:
df.head()

Unnamed: 0,prog_lang,original,cleaned,label,stemmed,lemmatized
18,Python,game this game is done by python,game game done python,python,game game done python,game game done python
23,Python,Attendance-provider Make a attendance in a exc...,make excel file screenshot google,python,make excel file screenshot googl,make excel file screenshot google
24,Python,Open-cv-tutorial All the function for open cv,function open,python,function open,function open
27,Python,Python Text App Using Twilio API With a free T...,python text app using api free account get tex...,python,python text app use api free account get text ...,python text app using api free account get tex...
29,Python,Real-Time Voice Cloning This repository is an ...,realtime repository implementation works realt...,python,realtim repositori implement work realtim feel...,realtime repository implementation work realti...


In [59]:
class NLP_model():
    ''' Creates classification models using a variety of Sklearn models.

        Methods:
        ----------------------------------------------------------------
        > split: preforms train/test split. Can also preform X/y split if given a target array.
        
        > tf: gets the term frequency of the lemmatized column of the dataframe.
        
        > tf_idf: gets the term frequency-inverse document frequency 
        ----------------------------------------------------------------
        
        Arguments:
            - data: Pandas DataFrame
            - classifiers: List of classification models
            - names: Names of classification models
            - lang: Specifies a language to create a lang/not_lang label from
            - top_langs: Specifies the top n langs to create labels for, non-top_langs will be labeled 'other'
    '''
    def __init__(self, data:pd.DataFrame, classifiers: list, names: list, lang = None, top_langs = None):
        ''' Passes dataframe, list of actual classifiers and their names, as well as checks 
            for kwargs lang or top_lang
            Creates a zip of classifiers and their names
        '''
        # Creating class instance of df
        self.df = data.copy(deep = True)
        
        #Checking for individual language specified or n_langs and creating label column
        # For individual lang specification
        if lang != None and top_langs == None: # Checking for lang
            self.lang = lang # assigning lang attribute
            # creating label column
            self.df['label'] = self.df.prog_lang.apply(lambda x: x.lower() if x == self.lang else f'not_{self.lang.lower()}')
        if top_langs != None and lang == None: # Checking for top_langs
            self.top_langs = self.df.prog_lang.value_counts()[:top_langs] # getting top n langs
            # Creating labels column from top n languages            
            self.df['label'] = self.df.prog_lang.apply(lambda x: x.lower() if x in self.top_langs else 'other')
        if lang != None and top_langs != None:
            raise AttributeError('Must specify either lang or top_langs, cant create labels for both.')
        if top_langs != None and top_langs < 2:
            raise AttributeError("Must specify more than one lang, if you want to check for a single language, use lang argument instead.")
        
        # Clean dataframe
        self.df.lemmatized = self.df.lemmatized.apply(basic_clean)
        
        # Creating class attributes
        self.classifiers = classifiers
        self.names = names
        
        models = {'models': (names, classifiers)} # creating dict models and names
        self.models = models
        
    def split(self, df, target = None):
        '''
        This function takes in a dataframe and, optionally, a target_var array. Performs a train,
        test split with no stratification. Returns train and test dfs.
        '''
        
        # Checking for y specified
        if target is None: # if no y, preform regular train, validate, test split
            train, test = train_test_split(df, test_size=.2, 
                                          random_state=1312)
            
            self.train, self.test = train, test # setting self versions of each df
            return train, test
        
        # If y is specified preform X/y train, validate, test split
        else:
            X_train, X_test, y_train, y_test = train_test_split(df, target, test_size=.2, random_state=1312)
            self.X_train, self.X_test,\
            self.y_train, self.y_test = X_train, X_test, y_train, y_test # attributes for each X/y df and array
            
            return X_train, X_test, y_train, y_test
    
    
    def tf(self):
        ''' Gets the term frequency of lematized column in the df and returns
            a dataframe with raw value_counts, frequency, and augmented frequency
        '''
        
        # For each lemmatized doc, append to series
        docs = [] # init empty series for split documents
        words = [] # init empty series for unique words
        for doc in self.df['lemmatized'].values:
            for word in doc.split(): # iterating through each word in a split doc
                words.append(word) # add to words
        
        word_ser = pd.Series(words) # turn w
        
        # Creating a df from unique words containing raw term count, 
        tf_df = (pd.DataFrame({'raw_count': word_ser.value_counts()})) # raw counts of each term
        tf_df['frequency'] = tf_df.raw_count / tf_df.raw_count.sum() # frequency of each term
        tf_df['augmented_frequency'] = tf_df.frequency / tf_df.frequency.max() # augmented freq of words
        
        return tf_df
    
    def tf_idf(self):
        ''' Gets tf_idf and returns the dataframe of TfidVectorizer
        '''
        tfidf = TfidfVectorizer() # Make the opbject
        bag_of_words = tfidf.fit_transform(self.df['lemmatized'].values) # Fit_transform on lemmatized
        tfidf_df = pd.DataFrame(bag_of_words.todense(), columns=tfidf.get_feature_names()) # Wrapping in a dataframe
        return tfidf_df
    
    def count_vectorize(self, ngram_range = (1,1)):
        ''' Preforms a count vectorizeation with ngrams of n length.
            WARNING: If not cached on system can take a long time to process, 
            creates a cacehd csv for faster use in future iterations.
        '''
        # Checking for cached vectorized csv
        print('''Creating vectorized dataframe now. Vectorization may take a while, please wait...''')
        
        # Using Bag of Words count vectorizer for hexamers
        cv = CountVectorizer(ngram_range=(1,1)) # make the object
        vectors = cv.fit_transform(self.df.lemmatized.values) # fit_transform on lemmatized col
        self.vocab_count = cv.vocabulary_
        
        # Wraps vectorized array in a dataframe with feature names as the columns
        vector_df = pd.DataFrame(vectors.todense(), columns = cv.get_feature_names())
                
        # assigning vectorized dataframe as an attribute
        self.vectorized = vector_df.copy()
        
        return vector_df
        
    
    def metrics(self, metric_type = 'accuracy', splits = 3):
        ''' Checks for and encodes label column
            Creates a metrics df measuring metric_type, accuracy by default.
            Preforms a kfold a number of times determined by splits.
        '''
        try: # checking if label exists, if not raise KeyError, didnt specify a lang or top_langs
            self.df['label']
        except KeyError:
            return KeyError('Must specify language target in class to create models')
        
        try: # Checking if vectorization has already run, if yes there will be an attribute vectorized df
            self.vectorized
        except AttributeError: # If no vectorized attribute exists get vectorized df calling self.count_vectorize
            print('Have not run count_vectorize method yet, running now...')
            self.vectorized = self.count_vectorize()
            print('All done! Moving on to modeling, this may take a while...')
        target = 'label' # Setting target to label
        
        # checking for lang or top_langs
        if self.df[target].nunique() == 2: # If one lang chosen
            s = self.df[target].replace([f'{self.lang.lower()}', f'not_{self.lang.lower()}'], [1,0]) # Endode lang as 1 not_lang as 0
        else: # if top_langs
            lang_list = [l.lower() for l in list(self.top_langs.index)] # getting a list of all lower case langs in top lang
            lang_list.append('other') # appending 'other' label
            
            lang_encode = list(range(1, len(self.top_langs)+1)) # list of numbers to encode top_langs as
            lang_encode.append(0) # appending 0 for other
            s = self.df[target].replace(lang_list, lang_encode) # encoding top_langs
            
        
        X_train, X_test, y_train, y_test = self.split(self.vectorized, s)
        
        result = [] # init empty results list
        for model in self.models['models']: # iterate through zipped models
            kfold = KFold(n_splits = splits) # number of kfolds set to splits
            scores = cross_validate(classifier, X_train, y_train, cv = kfold, scoring = metric_type, return_estimator=True) # cross validate on each kfold
            result.append(scores) # append to results
            
            msg = "{0}: Validate accuracy: {1}".format(name, scores['test_score'].mean())
            print(msg)
        
        estimators = [res['estimator'] for res in result]
        results = [res['test_score'] for res in result]
        avg_res = [round(res['test_score'].mean(), 4) * 100 for res in result] # list comp to get mean of cross val tests for each model
        metrics_df = pd.DataFrame(data = zip(self.names, avg_res), columns = ['model', f'average_{metric_type}%']) # wrap zipped model names and results in dataframe
        
        
        return metrics_df.sort_values(by = [f'average_{metric_type}%'], ascending = False), zip(estimators, results) # return sorted by metric
        

In [60]:
names = ['K Nearest Neighbors', 'Decision Tree', 'Random Forest', 
         'Gaussian N-Bayes', 'Multinomial N-Bayes']

classifiers = [
    KNeighborsClassifier(n_neighbors = 6),
    DecisionTreeClassifier(max_depth = 7),
    RandomForestClassifier(n_estimators = 10),
    GaussianNB(),
    MultinomialNB(alpha = .5)
    ]

model_obj = NLP_model(df, classifiers, names, lang = 'Python')

In [70]:
model_obj.models['models'][1]

[KNeighborsClassifier(n_neighbors=6),
 DecisionTreeClassifier(max_depth=7),
 RandomForestClassifier(n_estimators=10),
 GaussianNB(),
 MultinomialNB(alpha=0.5)]

In [64]:
metric_df, models_zip = model_obj.metrics(splits = 10)
metric_df

Have not run count_vectorize method yet, running now...
Creating vectorized dataframe now. Vectorization may take a while, please wait...
All done! Moving on to modeling, this may take a while...


ValueError: too many values to unpack (expected 2)

In [48]:
list(models_zip)

[([KNeighborsClassifier(n_neighbors=6),
   KNeighborsClassifier(n_neighbors=6),
   KNeighborsClassifier(n_neighbors=6),
   KNeighborsClassifier(n_neighbors=6),
   KNeighborsClassifier(n_neighbors=6),
   KNeighborsClassifier(n_neighbors=6),
   KNeighborsClassifier(n_neighbors=6),
   KNeighborsClassifier(n_neighbors=6),
   KNeighborsClassifier(n_neighbors=6),
   KNeighborsClassifier(n_neighbors=6)],
  array([0.8031968 , 0.81018981, 0.83316683, 0.82617383, 0.82617383,
         0.807     , 0.826     , 0.817     , 0.823     , 0.835     ])),
 ([DecisionTreeClassifier(max_depth=7),
   DecisionTreeClassifier(max_depth=7),
   DecisionTreeClassifier(max_depth=7),
   DecisionTreeClassifier(max_depth=7),
   DecisionTreeClassifier(max_depth=7),
   DecisionTreeClassifier(max_depth=7),
   DecisionTreeClassifier(max_depth=7),
   DecisionTreeClassifier(max_depth=7),
   DecisionTreeClassifier(max_depth=7),
   DecisionTreeClassifier(max_depth=7)],
  array([0.85714286, 0.86013986, 0.86113886, 0.86513487, 

IndexError: list index out of range

## Modeling Performance:
### JavaScript
##### Hyperparams:
- KNeighborsClassifier(n_neighbors = 3),
- DecisionTreeClassifier(max_depth = 5),
- RandomForestClassifier(max_depth = 5, n_estimators = 10, max_features = 1 ),
- GaussianNB(),
- MultinomialNB()


> #### First iteration: (75% threshold, no zscore)
> - K Nearest Neighbors: Accuracy: 0.4437958746786057
> - Decision Tree: Accuracy: 0.5243970413600353
> - Random Forest: Accuracy: 0.3019957833163259
> - Gaussian N-Bayes: Accuracy: 0.43499919463886333
> - Multinomial N-Bayes: Accuracy: 0.5169951603916912

> #### Second Iteration (75% lang threshold, zscore .5)
> - K Nearest Neighbors: Accuracy: 0.42000479731350443
> - Decision Tree: Accuracy: 0.6441992484208843
> - Random Forest: Accuracy: 0.6461981290477333
> - Gaussian N-Bayes: Accuracy: 0.5742384264811705
> - Multinomial N-Bayes: Accuracy: 0.6247701287279124

> #### Third Iteration (100% lang threshold, zscore .5)
> - K Nearest Neighbors: Validate accuracy: 0.8817125081859856
> - Decision Tree: Validate accuracy: 0.9279783272817043
> - Random Forest: Validate accuracy: 0.8400252694089291
> - Gaussian N-Bayes: Validate accuracy: 0.9172856871827753
> - Multinomial N-Bayes: Validate accuracy: 0.9334353652863924

> #### Fourth Iteration (100% lang threshold, zscore .5, KFolds = 10)
> - K Nearest Neighbors: Validate accuracy: 0.8886948083454633
> - Decision Tree: Validate accuracy: 0.9277563718354882
> - Random Forest: Validate accuracy: 0.8400267336434817
> - Gaussian N-Bayes: Validate accuracy: 0.9172855362426388
> - Multinomial N-Bayes: Validate accuracy: 0.9351794769339079

##### Hyperparams:
- MultinomialNB(alpha = .5)

> #### Best Model Iteration: (100% lang threshold, zscore .5, KFolds = 10)
> - Multinomial N-Bayes: Validate accuracy: 0.9412896842385668

### Python:
##### Hyperparams:
- KNeighborsClassifier(n_neighbors = 6),
- DecisionTreeClassifier(max_depth = 7),
- RandomForestClassifier(n_estimators = 10),
- GaussianNB(),
- MultinomialNB(alpha = .5)

> #### First iteration: (75% threshold, no zscore)
> - K Nearest Neighbors: Validate accuracy: 0.8206901098901099
> - Decision Tree: Validate accuracy: 0.8648681318681319
> - Random Forest: Validate accuracy: 0.9007505494505494
> - Gaussian N-Bayes: Validate accuracy: 0.8932528471528471
> - Multinomial N-Bayes: Validate accuracy: 0.9159434565434564

> #### Second iteration: (75% threshold, zscore .5)
> - K Nearest Neighbors: Validate accuracy: 0.8206901098901099
> - Decision Tree: Validate accuracy: 0.8635692307692308
> - Random Forest: Validate accuracy: 0.8984497502497503
> - Gaussian N-Bayes: Validate accuracy: 0.8932528471528471
> - Multinomial N-Bayes: Validate accuracy: 0.9159434565434564

> #### Third iteration: (100% threshold, zscore .5)
> - K Nearest Neighbors: Validate accuracy: 0.7704337190524877
> - Decision Tree: Validate accuracy: 0.8218125615128089
> - Random Forest: Validate accuracy: 0.8699010061823358
> - Gaussian N-Bayes: Validate accuracy: 0.8718856602295204
> - Multinomial N-Bayes: Validate accuracy: 0.885873792437866

> #### Fourth iteration: (90% threshold, zscore .5)
> - K Nearest Neighbors: Validate accuracy: 0.7991583768344331
> - Decision Tree: Validate accuracy: 0.8479582389441545
> - Random Forest: Validate accuracy: 0.8916349847335763
> - Gaussian N-Bayes: Validate accuracy: 0.8852309662168818
> - Multinomial N-Bayes: Validate accuracy: 0.9072623855018221

##### Hyperparams:
- MultinomialNB(alpha = .5)

> #### Best Model Iteration: (75% threshold, zscore .5)
> - Multinomial N-Bayes: Validate accuracy: 0.9138447552447554

In [8]:
model_obj.tf()

Unnamed: 0,raw_count,frequency,augmented_frequency
data,25128,0.008218,1.000000
use,20312,0.006643,0.808341
gt,19874,0.006500,0.790911
yes,19795,0.006474,0.787767
project,18964,0.006202,0.754696
...,...,...,...
invalidation,100,0.000033,0.003980
convolution,100,0.000033,0.003980
ranking,100,0.000033,0.003980
multilingual,100,0.000033,0.003980


In [9]:
model_obj.tf_idf()

Unnamed: 0,00,01,02,03,04,05,0527,0528,06,07,...,youll,youre,youtube,youtubedl,youve,zappa,zero,zip,zoom,zsh
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.200226,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12502,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
12503,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
12504,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.078921,0.0,0.0,0.025644,0.0,0.0,0.0,0.0,0.0
12505,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.000000,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0
