In [1]:
import re, os
import unicodedata
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
import nltk.sentiment

from wordcloud import WordCloud

from acquire_c import *
from prepare_c import *
from explore_c import *

import warnings
warnings.simplefilter('ignore')

plt.rc('figure', figsize=(13, 7))
plt.style.use('seaborn-darkgrid')

## Acquire data and find the dominant language in each row

In [2]:
# You can pass a threshold argument but the default is 75
df = get_readme_data()
ongoing_stopwords = ['1', '2', '3']

In [None]:
def lang_or_not(s:str, lang:str) -> str:
    '''Takes string and returns if it is java or not java
    '''
    if s.lower() == lang:
        return lang
    return f'not_{lang}'

In [None]:
lang = 'javascript'
not_lang = f'not_{lang}'
df['label']  = df.prog_lang.apply(lambda x: lang_or_not(x, lang))
java_obj = NLP_explore(df, 'label', 'cleaned', lang, not_lang)

## Explore

### Look at word freqencies for JavaScript
|          |   word_count |
|:---------|-------------:|
| data     |        25128 |
| use      |        20312 |
| gt       |        19874 |
| yes      |        19795 |
| code     |        18020 |
| python   |        17961 |
| using    |        17762 |
| top      |        16057 |
| project  |        15087 |
| 1        |        13589 |
| run      |        13366 |
| api      |        12797 |
| unknown  |        12742 |
| github   |        12660 |
| file     |        12109 |
| learning |        11736 |
| open     |        11354 |
| app      |        11260 |
| create   |        10836 |
| 2        |        10439 |

In [None]:
# print(pd.DataFrame({'word_count': java_obj.all_freq}).head(20).to_markdown())

## Look at some word count visualizations

In [None]:
# Looking at just JavaScript hplot
java_obj.hplot_word_freq_viz(n=5, sort=lang)
# Looking at just JavaScript bplot stacked
java_obj.stacked_bplot_freq(n=5, sort=not_lang)

In [None]:
# Looking at just not_JavaScript hplot
java_obj.hplot_word_freq_viz(n=5, sort=lang)
# Looking at just not_JavaScript bplot stacked
java_obj.stacked_bplot_freq(n=5, sort=not_lang)

In [None]:
# Looking at just all hplot
java_obj.hplot_word_freq_viz(n=5)
# Looking at just all bplot stacked
java_obj.stacked_bplot_freq(n=5)

## Look at N-Grams Visualizations

### Look at Bigrams

In [None]:
java_bigram = java_obj.n_gram(top_n= 10, col=lang)

In [None]:
not_java_bigram = java_obj.n_gram(top_n = 10, col=not_lang)

In [None]:
both_bigrams = java_obj.n_gram(top_n=10)

### Look at trigrams

In [None]:
java_trigram = java_obj.n_gram(n=3, top_n=10, col=lang)

In [None]:
not_java_trigram = java_obj.n_gram(n=3, top_n=10, col=not_lang)

In [None]:
all_trigram = java_obj.n_gram(n=3, top_n=10)

### Plot some wordclouds

In [None]:
java_obj.plot_wordcloud(col=lang, save=True)

In [None]:
java_obj.plot_wordcloud(col=not_lang, save=True)

In [None]:
java_obj.plot_wordcloud(save=True)

## Add some sentiment analysis and some features

In [None]:
# Add sentiment analysis
java_obj.add_sentiment_analysis()
# Add features
java_obj.add_features()

java_obj.df.head()

## Sentiment analysis bivariate plots

In [None]:
java_obj.sentiment_bivariate_plots()

## Sentiment distribution plots

In [None]:
java_obj.sentiment_distributions()

In [None]:
%%html
<style>
table {margin-left: 0 !important;}
</style>

## Modeling

In [200]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn import svm
from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import MultinomialNB
from sklearn.gaussian_process.kernels import RBF
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.model_selection import KFold, cross_val_score, cross_validate
from sklearn.model_selection import train_test_split

In [301]:
class NLP_model():
    ''' Creates classification models using a variety of Sklearn models.

        Models:
        ----------------------------------------------------------------
        KNeighborsClassifier, DecisionTreeClassifier, svm, GaussianNB, 
        MultinomialNB, GaussianProcessClassifier, MLPClassifier, RandomForestClassifier, AdaBoostClassifier
        ----------------------------------------------------------------
        
        Arguments:
            - data: Pandas DataFrame
            - classifiers: List of classification models
            - names: Names of classification models
            - lang: Specifies a language to create a lang/not_lang label from
            - top_langs: Specifies the top n langs to create labels for, non-top_langs will be labeled 'other'
    '''
    def __init__(self, data:pd.DataFrame, classifiers: list, names: list, lang = None, top_langs = None):
        ''' Passes dataframe, list of actual classifiers and their names, as well as checks 
            for kwargs lang or top_lang
            Creates a zip of classifiers and their names
        '''
        
        df = data.copy(deep = True)
        
        #Checking for individual language specified or n_langs and creating label column
        # For individual lang specification
        if lang and not top_langs: # Checking for lang
            self.lang = lang # assigning lang attribute
            # creating label column
            df['label'] = df.prog_lang.apply(lambda x: x.lower() if x == self.lang else f'not_{self.lang.lower()}')
        if top_langs and not langs: # Checking for top_langs
            self.top_langs = df.prog_lang.value_counts()[:n_langs] # getting top n langs
            # Creating labels column from top n languages            
            df['label'] = df.prog_lang.apply(lambda x: x.lower() if x in self.top_langs else 'other')
        if lang and top_langs:
            raise AttributeError('Must specify either lang or top_langs, cant create labels for both')
        
        # Clean dataframe
        df.lemmatized = df.lemmatized.apply(basic_clean)
        
        # Creating class attributes
        self.classifiers = classifiers.copy()
        self.names = names.copy()
        
        # Creating class instance of df
        self.df = df.copy(deep = True)
        
        self.models = zip(self.names, self.classifiers) # zipping models and names
        
        
    def split(self, target = None):
        '''
        This function takes in a dataframe and, optionally, a target_var array. Performs a train, validate, 
        test split with no stratification. Returns train, validate, and test dfs.
        '''
        
        # Checking for y specified
        if target is None: # if no y, preform regular train, validate, test split
            train_validate, test = train_test_split(self.df, test_size=.2, 
                                                    random_state=1312)
            train, validate = train_test_split(train_validate, test_size=.3, 
                                                    random_state=1312)
            
            self.train, self.validate, self.test = train, validate, test # setting self versions of each df
            return train, validate, test
        
        # If y is specified preform X/y train, validate, test split
        else:
            X_train_validate, X_test, y_train_validate, y_test = train_test_split(self.df, target,
                                                                            test_size=.2, 
                                                                            random_state=1312)
            X_train, X_validate, y_train, y_validate = train_test_split(X_train_validate, y_train_validate,
                                                                            test_size=.3, 
                                                                            random_state=1312)
            self.X_train, self.X_validate, self.X_test,\
            self.y_train, self.y_validate, self.y_test = X_train, X_validate, X_test, y_train,\
                                                        y_validate, y_test # attributes for each X/y df and array
            
            return X_train, X_validate, X_test, y_train, y_validate, y_test
    
    
    def tf(self):
        ''' Gets the term frequency of lematized column in the df
        '''
        
        # For each lemmatized doc, append to series
        docs = [] # init empty series for split documents
        words = [] # init empty series for unique words
        for doc in self.df['lemmatized'].values:
            for word in doc.split(): # iterating through each word in a split doc
                words.append(word) # add to words
        
        word_ser = pd.Series(words) # turn w
        
        # Creating a df from unique words containing raw term count, 
        tf_df = (pd.DataFrame({'raw_count': word_ser.value_counts()})) # raw counts of each term
        tf_df['frequency'] = tf_df.raw_count / tf_df.raw_count.sum() # frequency of each term
        tf_df['augmented_frequency'] = tf_df.frequency / tf_df.frequency.max() # augmented freq of words
        
        return tf_df
    
    
    def tf_idf(self):
        return 'Yet to make method'
    
        
    def metrics(self, metric_type = 'accuracy', splits = 3):
        ''' Creates a metrics df measuring metric_type, accuracy by default.
            Preforms a kfold a number of times determined by splits
        '''
        try: # checking if label exists, if not, didnt specify a lang or top_langs
            self.df['label']
        except KeyError:
            return KeyError('Must specify language target in class to create model')
            
        target = 'label' # Setting target to label
        
        result = [] # init empty results list
        for i, (self.name, self.classifier) in enumerate(self.models): # iterate through zipped models
            kfold = KFold(n_splits = splits) # number of kfolds set to splits
            scores = cross_validate(self.classifier, self.df.drop(columns = [target]), self.df[target], cv = kfold, scoring = metric_type) # cross validate on each kfold
            result.append(scores) # append to results
            
        results = [res['test_score'].mean() for res in result] # list comp to get mean of cross val tests for each model
        metrics_df = pd.DataFrame(data = zip(self.names, results), columns = ['model', metric_type]) # wrap zipped model names and results in dataframe
        return metrics_df.sort_values(by = [metric_type], ascending = False) # return sorted by metric

In [302]:
names = ['K Nearest Neighbors', 'Decision Tree', 'Random Forest', 'AddaBoost', 
         'Gaussian N-Bayes', 'Multinomial N-Bayes']

classifiers = [
    KNeighborsClassifier(n_neighbors = 3),
    DecisionTreeClassifier(max_depth = 5),
    RandomForestClassifier(max_depth = 5, n_estimators = 10, max_features = 1 ),
    AdaBoostClassifier(),
    GaussianNB(),
    MultinomialNB()
    ]

model_obj = NLP_model(df, classifiers, names, lang = 'Python')

In [305]:
list(model_obj.models)

[]

In [304]:
def classifier_models(X_data, y_data, classifier_names, classifier_models):
    '''
        Takes two arrays:
        - X_data = data without the target_var included
        - y_data = an array of the target_var
        - List of model names 
        - List of the classifiers themselves
        
        Preforms K-fold and cross-validation and returns a metrics dataframe with the model name and accuracy score. 
    '''
    # Zipping models and Classifiers
    models = zip(classifier_names, classifier_models)

    # Init empty lists
    names = [] 
    result = []
    coeff = []

    # Cross-validating accuracy for each model based on Train subset
    for i, (name, model) in enumerate(models):
        kfold = KFold(n_splits = 10)
        scores = cross_validate(model, X_data, y_data, cv = kfold, scoring = 'accuracy', return_estimator=True)
        result.append(scores)
        names.append(name)
        try:
            coeff.append(model.coeff_)
        except AttributeError:
            coeff.append(None)
        msg = "{0}: Accuracy: {1}, Coeff: {2}".format(name, scores['test_score'].mean(), coeff[i])
        print(msg)
        
    results = [res['test_score'].mean() for res in result]
    metrics_df = pd.DataFrame(data = zip(names, results), columns = ['Model', 'Accuracy'])
    return metrics_df.sort_values(by = ['Accuracy'], ascending = False)