In [26]:
from string import digits
import acquire
import requests
from requests import get
from bs4 import BeautifulSoup
import os
import pandas as pd

import re
import unicodedata
import nltk

from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords
from wordcloud import WordCloud


%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns

## Acquire Function

In [13]:
def make_soup(url):
    '''
    This helper function takes in a url and requests and parses HTML
    returning a soup object.
    '''
    response = get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup

def get_readme_articles(urls, cached=False):
    '''
    This function takes in a list of GitHub Repo urls and a parameter
    with default cached == False which scrapes the title, text, and language for each url, 
    creates a list of dictionary of features,converts list to df, and returns df.
    If cached == True, the function returns a df from a json file.
    '''
    if cached == True:
        df = pd.read_json('project_readme.json')
        
    # cached == False completes a fresh scrape for df     
    else:

        # Create an empty list to hold dictionaries
        text = []

        # Loop through each url in our list of urls
        for url in urls:

            # Make request and soup object using helper
            soup = make_soup(url)

            # Save the title of each repo in variable title
            title = soup.select('h1', class_="Label Label--outline v-align-middle")[0].text

            # Save the text in each repo to variable text
            content = soup.select('article', class_="markdown-body entry-content container-lg")[0].text
            
            # Save the language of each repo in variable language
            language = soup.select('li.d-inline:nth-child(1) > a:nth-child(1)')[0].text

            # Create a dictionary holding the title and content for each blog
            repo = {'title': title, 'content': content, 'language': language}

            # Add each dictionary to the articles list of dictionaries
            text.append(repo)
            
        # convert our list of dictionaries to a df
        df = pd.DataFrame(text)

        # Write df to a json file for faster access
        df.to_json('project_readme.json')
    
    return df

## Prepare Function

In [40]:
def clean_data(df):
    def basic_clean(text):
        text = (unicodedata.normalize('NFKD', text.lower())
                .encode('ascii', 'ignore') # ascii to reduce noise
                .decode('utf-8', 'ignore') # decode using utf-8
               )
        return re.sub(r"[^a-z0-9\s]", '', text)

    def tokenize(string):
        '''
        This function takes in a string and
        returns a tokenized string.
        '''
        # Create tokenizer.
        tokenizer = nltk.tokenize.ToktokTokenizer()

        # Use tokenizer
        string = tokenizer.tokenize(string, return_str=True)

        return string

    def lemmatize(string):
        '''
        This function takes in string for and
        returns a string with words lemmatized.
        '''
        # Create the lemmatizer.
        wnl = nltk.stem.WordNetLemmatizer()

        # Use the lemmatizer on each word in the list of words we created by using split.
        lemmas = [wnl.lemmatize(word) for word in string.split()]

        # Join our list of words into a string again and assign to a variable.
        string = ' '.join(lemmas)

        return string

    def remove_stopwords(string, extra_words=[], exclude_words=[]):
        '''
        This function takes in a string, optional extra_words and exclude_words parameters
        with default empty lists and returns a string.
        '''
        # Create stopword_list.
        stopword_list = stopwords.words('english')

        # Remove 'exclude_words' from stopword_list to keep these in my text.
        stopword_list = set(stopword_list) - set(exclude_words)
        # Add in 'extra_words' to stopword_list.
        stopword_list = stopword_list.union(set(extra_words))

        # Split words in string.
        words = string.split()

        # Create a list of words from my string with stopwords removed and assign to variable.
        filtered_words = [word for word in words if word not in stopword_list]

        # Join words in the list back into strings and assign to a variable.
        string_without_stopwords = ' '.join(filtered_words)

        return string_without_stopwords
    
    df['title'] = df.title.apply(basic_clean)
    df['title'] = df.title.apply(tokenize)
    df['title'] = df.title.apply(lemmatize)
    df['language'] = df.language.apply(basic_clean)
    df['language'] = df.language.apply(tokenize)
    df['language'] = df.language.apply(lemmatize)
    #
    remove_digits = str.maketrans('', '', digits)
    df['language'] = df['language'].str.translate(remove_digits)
    #
    df['text_cleaned'] = df.content.apply(basic_clean)
    df['text_tokenized'] = df.text_cleaned.apply(tokenize)
    df['text_lemmatized'] = df.text_tokenized.apply(lemmatize)
    df['text_filtered'] = df.text_lemmatized.apply(remove_stopwords)
    # Add column with list of words
    words = [re.sub(r'([^a-z0-9\s]|\s.\s)', '', doc).split() for doc in df.text_filtered]
    df = pd.concat([df, pd.DataFrame({'words': words})], axis=1)
    # Adds colum with lenght of word list
    df['doc_length'] = [len(wordlist) for wordlist in df.words]
    return df.head()

#### Create the df

In [41]:
# Here cached == False, so the function will do a fresh scrape of the urls and write data to a json file.

urls = ['https://github.com/freeCodeCamp/freeCodeCamp',
        'https://github.com/996icu/996.ICU',
        'https://github.com/vuejs/vue',
        'https://github.com/CSolitaire/natural-language-processing-exercises',
        'https://github.com/facebook/react']

df = get_readme_articles(urls=urls, cached=False)
df

Unnamed: 0,title,content,language
0,\n\n\nfreeCodeCamp\n\n/\n\nfreeCodeCamp\n\n,\n\n\n\n\nfreeCodeCamp.org's open-source codeb...,\n\nJavaScript\n91.3%\n
1,\n\n\n996icu\n\n/\n\n996.ICU\n\n,996.ICU\nPlease note that there exists NO othe...,\n\nRust\n59.9%\n
2,\n\n\nvuejs\n\n/\n\nvue\n\n,\n\n\n\n\n\n\n\n\n\n\nSupporting Vue.js\nVue.j...,\n\nJavaScript\n97.7%\n
3,\n\n\nCSolitaire\n\n/\n\nnatural-language-proc...,Natural Language Processing\n\n\n\nSummary\nNL...,\n\nJupyter Notebook\n99.9%\n
4,\n\n\nfacebook\n\n/\n\nreact\n\n,React · \nReact is a JavaScript library for...,\n\nJavaScript\n95.0%\n


#### Clean df

In [42]:
df = clean_data(df)
df

Unnamed: 0,title,content,language,text_cleaned,text_tokenized,text_lemmatized,text_filtered,words,doc_length
0,freecodecamp freecodecamp,\n\n\n\n\nfreeCodeCamp.org's open-source codeb...,javascript,\n\n\n\n\nfreecodecamporgs opensource codebase...,freecodecamporgs opensource codebase and curri...,freecodecamporgs opensource codebase and curri...,freecodecamporgs opensource codebase curriculu...,"[freecodecamporgs, opensource, codebase, curri...",707
1,996icu 996icu,996.ICU\nPlease note that there exists NO othe...,rust,996icu\nplease note that there exists no other...,996icu\nplease note that there exists no other...,996icu please note that there exists no other ...,996icu please note exists official account app...,"[996icu, please, note, exists, official, accou...",456
2,vuejs vue,\n\n\n\n\n\n\n\n\n\n\nSupporting Vue.js\nVue.j...,javascript,\n\n\n\n\n\n\n\n\n\n\nsupporting vuejs\nvuejs ...,supporting vuejs\nvuejs is an mitlicensed open...,supporting vuejs vuejs is an mitlicensed open ...,supporting vuejs vuejs mitlicensed open source...,"[supporting, vuejs, vuejs, mitlicensed, open, ...",256
3,csolitaire naturallanguageprocessingexercises,Natural Language Processing\n\n\n\nSummary\nNL...,jupyter notebook,natural language processing\n\n\n\nsummary\nnl...,natural language processing\n\n\n\nsummary\nnl...,natural language processing summary nlp or nat...,natural language processing summary nlp natura...,"[natural, language, processing, summary, nlp, ...",207
4,facebook react,React · \nReact is a JavaScript library for...,javascript,react \nreact is a javascript library for ...,react \nreact is a javascript library for buil...,react react is a javascript library for buildi...,react react javascript library building user i...,"[react, react, javascript, library, building, ...",314


****
**Everything is good to go except the 'language' call**
****

In [None]:
# urls = ['https://github.com/freeCodeCamp/freeCodeCamp',
#         'https://github.com/996icu/996.ICU',
#         'https://github.com/vuejs/vue',
#         'https://github.com/EbookFoundation/free-programming-books',
#         'https://github.com/facebook/react']

In [2]:
def make_soup(url):
    '''
    This helper function takes in a url and requests and parses HTML
    returning a soup object.
    '''
    response = get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    return soup

In [11]:
# Soup Object
soup = make_soup('https://github.com/freeCodeCamp/freeCodeCamp')
soup1 = make_soup('https://github.com/996icu/996.ICU')
soup2 = make_soup('https://github.com/vuejs/vue')
soup3 = make_soup('https://github.com/EbookFoundation/free-programming-books')

In [6]:
# Language
soup.select('li.d-inline:nth-child(1) > a:nth-child(1)')[0].text

'\n\nJavaScript\n91.3%\n'

In [8]:
soup1.select('li.d-inline:nth-child(1) > a:nth-child(1)')[0].text

'\n\nRust\n59.9%\n'

In [10]:
soup2.select('li.d-inline:nth-child(1) > a:nth-child(1)')[0].text

'\n\nJavaScript\n97.7%\n'

In [12]:
soup3.select('li.d-inline:nth-child(1) > a:nth-child(1)')[0].text

IndexError: list index out of range

In [None]:
def no_digit(s):
    no_digits = []
    # Iterate through the string, adding non-numbers to the no_digits list
    for i in s:
        if not i.isdigit():
            no_digits.append(i)
        return no_digits