## acquire.py

In [1]:
# Imports

import pandas as pd
import numpy as np
from requests import get
from bs4 import BeautifulSoup
import os
import time

# scroll down for exercise functions

###### project functions #########
def make_soup(url):
    '''
    Request `url` with a custom User-Agent header and return the response
    text parsed into a BeautifulSoup object (using the 'html.parser' backend).
    '''
    # identify the scraper to the server via the User-Agent header
    page = get(url, headers={'User-Agent': 'Codeup Data Science'})
    # hand the raw HTML text to BeautifulSoup and return the parsed tree
    return BeautifulSoup(page.text, 'html.parser')

def github_HP_urls():
    '''
    Walk GitHub repository-search result pages 1 through 100 for the query
    'harry potter' and return one flat list of the 'href' values found on
    the result anchors (class 'v-align-middle').

    hrefs are de-duplicated within each page (not across pages), and the
    function sleeps 5 seconds after every page request.
    '''
    urls = []

    for p in range(1, 101):
        # search url for the current result page
        url = f'https://github.com/search?p={p}&q=harry+potter&type=Repositories'

        # anchors on this result page that carry the repository links
        anchors = make_soup(url).find_all('a', class_='v-align-middle')

        # a set drops duplicate hrefs that appear on the same page
        hrefs_on_page = {anchor.get('href') for anchor in anchors}

        # add this page's links straight onto the flat result list
        urls.extend(list(hrefs_on_page))

        # pause between page requests
        time.sleep(5)

    return urls



def github_urls_single_page():
    '''
    Scrape the first GitHub repository-search result page for the query
    'environmental' and return a list of the unique 'href' values found on
    the result anchors (class 'v-align-middle').
    '''
    # first page of the search results
    search_page = 'https://github.com/search?o=desc&p=1&q=environmental&s=&type=Repositories'

    # request + parse the page, then collect the anchors that hold the links
    anchors = make_soup(search_page).find_all('a', class_='v-align-middle')

    # de-duplicate the hrefs through a set before returning them as a list
    unique_hrefs = {anchor.get('href') for anchor in anchors}
    return list(unique_hrefs)
# this gets 1st 10 urls, will need to get next 10 pages

def get_github_HPresults(cached=False):
    '''
    Return a df with one row per scraped repository and the columns
    'language' and 'content'.

    cached == False (default): does a fresh scrape of the repositories
    returned by github_HP_urls(), keeps only those pages that have BOTH a
    readme article and a majority-language label, writes the resulting df to
    'readmes.json' and returns it.
    cached == True: returns the df read back from 'readmes.json'.
    '''
    # option to read in a json file instead of scraping
    if cached:
        return pd.read_json('readmes.json')

    # base_url is prefixed to the relative hrefs returned by the search scrape
    base_url = 'https://github.com'
    readme_class = "markdown-body entry-content container-lg"
    language_class = "text-gray-dark text-bold mr-1"

    # list of dictionaries, one per usable repository page
    readmes = []

    for url in github_HP_urls():
        # an anchor without an href yields None; nothing to request for it
        if not url:
            continue

        # Make request and soup object using helper
        soup = make_soup(base_url + url)

        article = soup.find('article', class_=readme_class)
        # NOTE: this is the majority language, not all of the languages used
        language_tag = soup.find('span', class_=language_class)

        # Skip pages missing either piece. Previously a page with a language
        # but no readme reused the readme text of the prior repository (or
        # raised NameError if it was the first page visited).
        if article is None or language_tag is None:
            continue

        readmes.append({'language': language_tag.text, 'content': article.text})

    # convert our list of dictionaries to a df
    df = pd.DataFrame(readmes)

    # Write df to a json file for faster access
    df.to_json('readmes.json')

    return df
    # 339 observations with 50 pgs
    # ... observations with 100 pgs

## prepare.py

In [2]:
import unicodedata
import re
import json

import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

import pandas as pd

import acquire



def basic_clean(text):
    '''
    Normalize a text string: lowercase it, strip non-ASCII characters
    (after NFKD decomposition, so accented letters keep their base letter),
    and drop every character that is not a lowercase letter, digit,
    single quote or whitespace.
    '''
    # lowercase first, then decompose accented characters into base + mark
    decomposed = unicodedata.normalize('NFKD', text.lower())
    # round-trip through ascii to discard anything that is not plain ascii
    ascii_only = decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')
    # keep only letters, numbers, whitespace and single quotes
    return re.sub(r"[^a-z0-9'\s]", '', ascii_only)

def tokenize(text):
    '''
    Tokenize text with NLTK's ToktokTokenizer and return the result as a
    single string (return_str=True) rather than a list of tokens.
    '''
    # build the tokenizer, run it, and hand back the tokenized string
    toktok = nltk.tokenize.ToktokTokenizer()
    tokenized_text = toktok.tokenize(text, return_str=True)
    return tokenized_text

def stem(text):
    '''
    Run NLTK's PorterStemmer over every whitespace-separated word in text
    and return the stems joined back together with single spaces.
    '''
    stemmer = nltk.porter.PorterStemmer()
    # stem word by word, then rebuild the string
    return ' '.join([stemmer.stem(token) for token in text.split()])

def lemmatize(text):
    '''
    Run NLTK's WordNetLemmatizer over every whitespace-separated word in
    text and return the lemmas joined back together with single spaces.
    '''
    lemmatizer = nltk.stem.WordNetLemmatizer()
    # lemmatize word by word, then rebuild the string
    return ' '.join([lemmatizer.lemmatize(token) for token in text.split()])

def remove_stopwords(text, extra_words=None, exclude_words=None):
    '''
    Removes stopwords from text and returns the remaining words joined by
    single spaces.

    text: string, split on whitespace into words
    extra_words: optional iterable of additional words to treat as stopwords
    exclude_words: optional iterable of words to keep even though they are
                   stopwords; words that are not stopwords are ignored
    '''
    # a set gives constant-time membership checks while filtering
    stopword_set = set(stopwords.words('english'))
    # add additional stopwords
    stopword_set.update(extra_words or ())
    # take the words to keep back out; unlike list.remove, this does not
    # raise when a word was never a stopword in the first place
    stopword_set.difference_update(exclude_words or ())
    # filter the words and produce a string without stopwords
    return ' '.join(w for w in text.split() if w not in stopword_set)

### instructor version
def prep_article_data(df, column, extra_words=[], exclude_words=[]):
    '''
    Add three derived text columns to df (in place) from the text column
    named by `column`, then return a view of selected columns.

    'clean'      : basic_clean -> tokenize -> remove_stopwords -> lemmatize
    'stemmed'    : basic_clean -> stem
    'lemmatized' : basic_clean -> lemmatize

    extra_words / exclude_words are forwarded to remove_stopwords.
    Returns df[['topic', 'title', column, 'stemmed', 'lemmatized', 'clean']],
    so df must already have 'topic' and 'title' columns.
    '''
    # fully prepared text: cleaned, tokenized, stopwords removed, lemmatized
    normalized = df[column].apply(basic_clean)
    tokenized = normalized.apply(tokenize)
    without_stopwords = tokenized.apply(remove_stopwords,
                                        extra_words=extra_words,
                                        exclude_words=exclude_words)
    df['clean'] = without_stopwords.apply(lemmatize)

    # stemmed version of the basic-cleaned text
    stemmed_source = df[column].apply(basic_clean)
    df['stemmed'] = stemmed_source.apply(stem)

    # lemmatized version of the basic-cleaned text
    lemmatized_source = df[column].apply(basic_clean)
    df['lemmatized'] = lemmatized_source.apply(lemmatize)

    output_columns = ['topic', 'title', column, 'stemmed', 'lemmatized', 'clean']
    return df[output_columns]

###### my GitHub version
def prep_data(df, column, extra_words=[], exclude_words=[]):
    '''
    This function takes in a df and the string name for a text column, with
    the option to pass lists for extra_words and exclude_words (forwarded to
    remove_stopwords).

    Adds to df (in place):
      'clean'      : basic_clean -> tokenize -> remove_stopwords
      'stemmed'    : the 'clean' text, stemmed
      'lemmatized' : the 'clean' text, lemmatized

    Returns a new df that additionally has
      'words'      : list of words taken from each 'lemmatized' doc
      'doc_length' : number of entries in 'words'
    restricted to rows whose 'language' is JavaScript, Java, HTML or Python.
    '''
    # basic clean, tokenize and remove stopwords once; the stemmed and
    # lemmatized columns are both derived from this same text
    cleaned = df[column].apply(basic_clean)\
                        .apply(tokenize)\
                        .apply(remove_stopwords,
                               extra_words=extra_words,
                               exclude_words=exclude_words)
    df['clean'] = cleaned
    df['stemmed'] = cleaned.apply(stem)
    df['lemmatized'] = cleaned.apply(lemmatize)

    # list of words for each doc
    # NOTE(review): the `\s.\s` alternative deletes a single character AND
    # the whitespace on both sides, which joins the neighbouring words
    # ('foo x bar' -> 'foobar'); left unchanged here -- confirm intent.
    words = [re.sub(r'([^a-z0-9\s]|\s.\s)', '', doc).split() for doc in df.lemmatized]

    # Attach the lists using df's own index. Concatenating a frame with a
    # fresh 0..n-1 index would misalign rows whenever df's index is not the
    # default one.
    df = df.assign(words=pd.Series(words, index=df.index, dtype=object))
    # add column with number of words in readme content
    df['doc_length'] = [len(wordlist) for wordlist in words]

    # removing unpopular languages
    language_list = ['JavaScript', 'Java', 'HTML', 'Python']
    return df[df.language.isin(language_list)]

ModuleNotFoundError: No module named 'acquire'

## model.py

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import ComplementNB, MultinomialNB
import pandas as pd
import warnings
warnings.filterwarnings("ignore")

######################## Logistic Regression ##########################

def logistic_regression(X_train, y_train, X_train_bow, X_train_tfidf):
    '''
    Fit one logistic regression on the bag-of-words features and a second,
    independent one on the TF-IDF features, then print accuracy, a confusion
    matrix and a classification report for each on the training data.

    X_train: df that receives the 'pred_bow' and 'pred_tfidf' prediction
             columns (modified in place)
    y_train: target; read both as a whole and as y_train.language
    X_train_bow / X_train_tfidf: feature matrices the models are fit on

    Returns (lm_bow, lm_tfidf), the two fitted models.
    '''
    # One estimator per feature set: fit() returns the estimator itself, so
    # fitting a single object twice would leave both names pointing at the
    # model trained on the TF-IDF features.
    lm_bow = LogisticRegression().fit(X_train_bow, y_train)
    lm_tfidf = LogisticRegression().fit(X_train_tfidf, y_train)
    # make predictions
    X_train['pred_bow'] = lm_bow.predict(X_train_bow)
    X_train['pred_tfidf'] = lm_tfidf.predict(X_train_tfidf)

    # X_bow results
    print('X_bow Accuracy: {:.0%}\n'.format(accuracy_score(y_train, X_train.pred_bow)))
    print('-----------------------')
    print(f'X_bow Confusion Matrix: \n\n {pd.crosstab(y_train.language, X_train.pred_bow)}\n' )
    print('-----------------------')
    print("X_bow Logistic Regression Classification Report:\n", classification_report(y_train, X_train.pred_bow))

    # TF-IDF results
    print('-----------------------')
    print('TF-IDF Accuracy: {:.0%}\n'.format(accuracy_score(y_train, X_train.pred_tfidf)))
    print('-----------------------')
    print(f'TF-IDF Confusion Matrix: \n\n {pd.crosstab(y_train.language, X_train.pred_tfidf)}\n' )
    print('-----------------------')
    print("TF-IDF Logistic Regression Classification Report:\n", classification_report(y_train, X_train.pred_tfidf))
    return lm_bow, lm_tfidf


######################## Random Forest ##########################

def random_forest(X_train, y_train, X_train_bow, X_train_tfidf):
    '''
    Fit one random forest on the bag-of-words features and a second,
    independent one on the TF-IDF features, then print accuracy, a confusion
    matrix and a classification report for each on the training data.

    X_train: df that receives the 'pred_bow' and 'pred_tfidf' prediction
             columns (modified in place)
    y_train: target; fit on as a whole, scored via y_train.language
    X_train_bow / X_train_tfidf: feature matrices the models are fit on

    Returns (rf_bow, rf_tfidf), the two fitted models.
    '''
    # One forest per feature set: fit() returns the estimator itself, so
    # fitting a single object twice would leave both names pointing at the
    # model trained on the TF-IDF features.
    rf_bow = RandomForestClassifier(n_estimators=500, max_depth=5, random_state=123)
    rf_tfidf = RandomForestClassifier(n_estimators=500, max_depth=5, random_state=123)
    # Fitting each forest to its own train features
    rf_bow.fit(X_train_bow, y_train)
    rf_tfidf.fit(X_train_tfidf, y_train)
    # make predictions
    X_train['pred_bow'] = rf_bow.predict(X_train_bow)
    X_train['pred_tfidf'] = rf_tfidf.predict(X_train_tfidf)

    # BOW results
    print('X_bow Accuracy: {:.0%}\n'.format(accuracy_score(y_train.language, X_train.pred_bow)))
    print('-----------------------')
    print(f'X_bow Confusion Matrix: \n\n {pd.crosstab(y_train.language, X_train.pred_bow)}\n' )
    print('-----------------------')
    print("X_bow Random Forest Classification Report:\n", classification_report(y_train.language, X_train.pred_bow))

    # TF-IDF results
    print('-----------------------')
    print('TF-IDF Accuracy: {:.0%}\n'.format(accuracy_score(y_train.language, X_train.pred_tfidf)))
    print('-----------------------')
    print(f'TF-IDF Confusion Matrix: \n\n {pd.crosstab(y_train.language, X_train.pred_tfidf)}\n' )
    print('-----------------------')
    print("TF-IDF Random Forest Classification Report:\n", classification_report(y_train.language, X_train.pred_tfidf))
    return rf_bow, rf_tfidf


######################## Complement Naive Bayes ##########################

def complement_naive_bayes(X_train, y_train, X_train_tfidf):
    '''
    Fit a ComplementNB (alpha=1.0) model on the TF-IDF features, store its
    training predictions in X_train['pred_tfidf'] (in place), print accuracy,
    confusion matrix and classification report, and return the fitted model.
    '''
    # build and fit in one step; fit hands back the estimator itself
    cnb_tfidf = ComplementNB(alpha=1.0).fit(X_train_tfidf, y_train)
    X_train['pred_tfidf'] = cnb_tfidf.predict(X_train_tfidf)

    rule = '-----------------------'

    # TF-IDF results
    tfidf_accuracy = accuracy_score(y_train, X_train.pred_tfidf)
    print('TF-IDF Accuracy: {:.0%}\n'.format(tfidf_accuracy))
    print(rule)
    tfidf_confusion = pd.crosstab(y_train.language, X_train.pred_tfidf)
    print(f'TF-IDF Confusion Matrix: \n\n {tfidf_confusion}\n')
    print(rule)
    tfidf_report = classification_report(y_train, X_train.pred_tfidf)
    print("TF-IDF Complement Niave Bayes Classification Report:\n", tfidf_report)

    return cnb_tfidf
 
######################## Validate Logistic Regression ##########################

def validate_logistic_regression(X_validate, y_validate, X_val_bow, X_val_tfidf, lm_bow, lm_tfidf):
    '''
    Score two already-fitted logistic regression models on validation data.

    Stores the predictions of lm_bow / lm_tfidf in X_validate['pred_bow'] and
    X_validate['pred_tfidf'] (in place), then prints accuracy, a confusion
    matrix and a classification report for each. Returns nothing.
    '''
    # predictions from each fitted model on its own feature matrix
    X_validate['pred_bow'] = lm_bow.predict(X_val_bow)
    X_validate['pred_tfidf'] = lm_tfidf.predict(X_val_tfidf)

    rule = '-----------------------'

    # X_bow results
    bow_accuracy = accuracy_score(y_validate, X_validate.pred_bow)
    print('X_bow Accuracy: {:.0%}\n'.format(bow_accuracy))
    print(rule)
    bow_confusion = pd.crosstab(y_validate.language, X_validate.pred_bow)
    print(f'X_bow Confusion Matrix: \n\n {bow_confusion}\n')
    print(rule)
    bow_report = classification_report(y_validate, X_validate.pred_bow)
    print("X_bow Logistic Regression Classification Report:\n", bow_report)

    # TF-IDF results
    print(rule)
    tfidf_accuracy = accuracy_score(y_validate, X_validate.pred_tfidf)
    print('TF-IDF Accuracy: {:.0%}\n'.format(tfidf_accuracy))
    print(rule)
    tfidf_confusion = pd.crosstab(y_validate.language, X_validate.pred_tfidf)
    print(f'TF-IDF Confusion Matrix: \n\n {tfidf_confusion}\n')
    print(rule)
    tfidf_report = classification_report(y_validate, X_validate.pred_tfidf)
    print("TF-IDF Logistic Regression Classification Report:\n", tfidf_report)

######################## Validate Random Forest ##########################

def validate_random_forest(X_validate, y_validate, X_val_bow, X_val_tfidf, rf_bow, rf_tfidf):
    '''
    Score two already-fitted random forest models on validation data.

    Stores the predictions of rf_bow / rf_tfidf in X_validate['pred_bow'] and
    X_validate['pred_tfidf'] (in place), then prints accuracy, a confusion
    matrix and a classification report for each, all measured against
    y_validate.language. Returns nothing.
    '''
    # predictions from each fitted model on its own feature matrix
    X_validate['pred_bow'] = rf_bow.predict(X_val_bow)
    X_validate['pred_tfidf'] = rf_tfidf.predict(X_val_tfidf)

    rule = '-----------------------'

    # X_bow results
    bow_accuracy = accuracy_score(y_validate.language, X_validate.pred_bow)
    print('X_bow Accuracy: {:.0%}\n'.format(bow_accuracy))
    print(rule)
    bow_confusion = pd.crosstab(y_validate.language, X_validate.pred_bow)
    print(f'X_bow Confusion Matrix: \n\n {bow_confusion}\n')
    print(rule)
    bow_report = classification_report(y_validate.language, X_validate.pred_bow)
    print("X_bow Random Forest Classification Report:\n", bow_report)

    # TF-IDF results
    print(rule)
    tfidf_accuracy = accuracy_score(y_validate.language, X_validate.pred_tfidf)
    print('TF-IDF Accuracy: {:.0%}\n'.format(tfidf_accuracy))
    print(rule)
    tfidf_confusion = pd.crosstab(y_validate.language, X_validate.pred_tfidf)
    print(f'TF-IDF Confusion Matrix: \n\n {tfidf_confusion}\n')
    print(rule)
    tfidf_report = classification_report(y_validate.language, X_validate.pred_tfidf)
    print("TF-IDF Random Forest Classification Report:\n", tfidf_report)

######################## Validate Complement Naive Bayes ##########################

def validate_complement_naive_bayes(X_validate, y_validate, X_val_tfidf, cnb_tfidf):
    '''
    Score an already-fitted Complement Naive Bayes model on validation data.

    Stores the model's predictions in X_validate['pred_tfidf'] (in place),
    then prints accuracy, a confusion matrix and a classification report.
    Returns nothing.
    '''
    # predictions from the fitted model on the TF-IDF features
    X_validate['pred_tfidf'] = cnb_tfidf.predict(X_val_tfidf)

    rule = '-----------------------'

    # TF-IDF results
    tfidf_accuracy = accuracy_score(y_validate, X_validate.pred_tfidf)
    print('TF-IDF Accuracy: {:.0%}\n'.format(tfidf_accuracy))
    print(rule)
    tfidf_confusion = pd.crosstab(y_validate.language, X_validate.pred_tfidf)
    print(f'TF-IDF Confusion Matrix: \n\n {tfidf_confusion}\n')
    print(rule)
    tfidf_report = classification_report(y_validate, X_validate.pred_tfidf)
    print("TF-IDF Complement Niave Bayes Classification Report:\n", tfidf_report)

######################## Test  ##########################

def test_random_forest(X_test, y_test, X_test_tfidf, rf_tfidf):
    '''
    Score an already-fitted random forest model on the test data.

    Stores the model's predictions in X_test['pred_tfidf'] (in place), then
    prints accuracy, a confusion matrix and a classification report, all
    measured against y_test.language. Returns nothing.
    '''
    # predictions from the fitted model on the TF-IDF features
    X_test['pred_tfidf'] = rf_tfidf.predict(X_test_tfidf)

    rule = '-----------------------'

    # TF-IDF results
    tfidf_accuracy = accuracy_score(y_test.language, X_test.pred_tfidf)
    print('TF-IDF Accuracy: {:.0%}\n'.format(tfidf_accuracy))
    print(rule)
    tfidf_confusion = pd.crosstab(y_test.language, X_test.pred_tfidf)
    print(f'TF-IDF Confusion Matrix: \n\n {tfidf_confusion}\n')
    print(rule)
    tfidf_report = classification_report(y_test.language, X_test.pred_tfidf)
    print("TF-IDF Random Forest Classification Report:\n", tfidf_report)

def test_logistic_regression(X_test, y_test, X_test_bow, lm_bow):
    '''
    Score an already-fitted logistic regression model on the test data.

    Stores the model's predictions in X_test['pred_bow'] (in place), then
    prints accuracy, a confusion matrix and a classification report.
    Returns nothing.
    '''
    # predictions from the fitted model on the bag-of-words features
    X_test['pred_bow'] = lm_bow.predict(X_test_bow)

    rule = '-----------------------'

    # X_bow results
    bow_accuracy = accuracy_score(y_test, X_test.pred_bow)
    print('X_bow Accuracy: {:.0%}\n'.format(bow_accuracy))
    print(rule)
    bow_confusion = pd.crosstab(y_test.language, X_test.pred_bow)
    print(f'X_bow Confusion Matrix: \n\n {bow_confusion}\n')
    print(rule)
    bow_report = classification_report(y_test, X_test.pred_bow)
    print("X_bow Logistic Regression Classification Report:\n", bow_report)