# Natural Language Processing Project

>Goals:
- Build a dataset of 100 GitHub repositories' README text
- Explore the text of the READMEs and find connections to programming language
- Build a classification ML model that predicts the programming language used in a repo based on README content.

In [14]:
import numpy as np
import pandas as pd

# acquire
from requests import get
from bs4 import BeautifulSoup
from time import sleep
import os

# prepare
import unicodedata
import re
import json
import nltk
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.corpus import stopwords

# explore
from sklearn.model_selection import train_test_split
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

# model
from pprint import pprint
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import confusion_matrix, classification_report

---
## Acquire

In [28]:
def make_soup(url, timeout=30):
    '''
    This helper function takes in a url, requests the page, and parses
    the HTML, returning a soup object.

    Parameters:
        url: address of the page to download.
        timeout: seconds to wait on the server before requests gives up
            and raises a timeout error. Without a timeout a stalled
            connection would block the scrape indefinitely.

    Returns a BeautifulSoup object built from the response text using
    the 'html.parser' parser. The HTTP status code is not checked, so an
    error page is parsed just like any other page.
    '''
    # Send a custom User-Agent header with the request.
    headers = {'User-Agent': 'Sir Galahad'}
    response = get(url, headers=headers, timeout=timeout)
    return BeautifulSoup(response.text, 'html.parser')

In [29]:
# Try the helper on GitHub's repository search sorted by stars (query: stars > 0)
# and display the parsed page.
make_soup('https://github.com/search?q=stars%3A%3E0&s=stars&type=Repositories')


<!DOCTYPE html>

<html lang="en">
<head>
<meta charset="utf-8"/>
<link href="https://github.githubassets.com" rel="dns-prefetch"/>
<link href="https://avatars0.githubusercontent.com" rel="dns-prefetch"/>
<link href="https://avatars1.githubusercontent.com" rel="dns-prefetch"/>
<link href="https://avatars2.githubusercontent.com" rel="dns-prefetch"/>
<link href="https://avatars3.githubusercontent.com" rel="dns-prefetch"/>
<link href="https://github-cloud.s3.amazonaws.com" rel="dns-prefetch"/>
<link href="https://user-images.githubusercontent.com/" rel="dns-prefetch"/>
<link crossorigin="anonymous" href="https://github.githubassets.com/assets/frameworks-53332a145deeeaaf8f21feef45138a08.css" integrity="sha512-UzMqFF3u6q+PIf7vRROKCCcSAIYk0CGPD1MvMAnv0X7Pqxc6MTt+l1mXE6StaVvPg+m/XgXexi1uO8P/zszHgA==" media="all" rel="stylesheet">
<link crossorigin="anonymous" href="https://github.githubassets.com/assets/site-eb896b2962500cd7d5d2488a2f376084.css" integrity="sha512-64lrKWJQDNfV0kiKLzdghJjhBJyD7

In [30]:
def get_language_urls(languages=None, pages=10):
    '''
    This function builds the GitHub search-result urls for the
    most-starred repositories of each programming language and
    returns them as a list. No request is made here; the urls are
    only assembled.

    Parameters:
        languages: iterable of language names to search for. Defaults
            to ['JavaScript', 'Python', 'Java', 'HTML'].
        pages: number of result pages to generate per language
            (pages 1 through `pages`). Defaults to 10.

    Returns a list of urls ordered by language, then by page number.
    '''
    from urllib.parse import quote

    if languages is None:
        languages = ['JavaScript', 'Python', 'Java', 'HTML']

    urls = []

    for language in languages:
        # Percent-encode the name so characters that are special in a
        # query string (e.g. the '+' in 'C++' or the '#' in 'C#') are
        # sent literally instead of being misread.
        encoded_language = quote(language, safe='')

        for page in range(1, pages + 1):
            # Result page `page` of repos in this language, sorted by stars.
            urls.append(
                f'https://github.com/search?l={encoded_language}&p={page}'
                '&q=stars%3A%3E0&s=stars&type=Repositories'
            )

    return urls

In [31]:
# Build the list of search-result page urls that will be scraped for repo links.
urls = get_language_urls()

In [32]:
urls

['https://github.com/search?l=JavaScript&p=1&q=stars%3A%3E0&s=stars&type=Repositories',
 'https://github.com/search?l=JavaScript&p=2&q=stars%3A%3E0&s=stars&type=Repositories',
 'https://github.com/search?l=JavaScript&p=3&q=stars%3A%3E0&s=stars&type=Repositories',
 'https://github.com/search?l=JavaScript&p=4&q=stars%3A%3E0&s=stars&type=Repositories',
 'https://github.com/search?l=JavaScript&p=5&q=stars%3A%3E0&s=stars&type=Repositories',
 'https://github.com/search?l=JavaScript&p=6&q=stars%3A%3E0&s=stars&type=Repositories',
 'https://github.com/search?l=JavaScript&p=7&q=stars%3A%3E0&s=stars&type=Repositories',
 'https://github.com/search?l=JavaScript&p=8&q=stars%3A%3E0&s=stars&type=Repositories',
 'https://github.com/search?l=JavaScript&p=9&q=stars%3A%3E0&s=stars&type=Repositories',
 'https://github.com/search?l=JavaScript&p=10&q=stars%3A%3E0&s=stars&type=Repositories',
 'https://github.com/search?l=Python&p=1&q=stars%3A%3E0&s=stars&type=Repositories',
 'https://github.com/search?l=Pytho

In [33]:
def get_all_urls(urls):
    '''
    This function takes in a list of GitHub search-result page urls,
    scrapes each page for the repository links it lists, and returns
    a list of full repository urls.

    Parameters:
        urls: iterable of search-result page urls to request.

    Returns a list of unique 'https://github.com/...' urls in the
    order they were first encountered. An empty input returns [].
    '''
    repo_urls = []

    for url in urls:
        # Make request and soup object using helper
        soup = make_soup(url)

        # Wait one second after each request before the next one
        # (presumably to stay polite to GitHub -- confirm the intent).
        sleep(1)

        # The anchors with this class hold the repository links.
        for link in soup.find_all('a', class_='v-align-middle'):
            href = link.get('href')
            # Skip anchors with no href; concatenating None would raise.
            if href:
                repo_urls.append('https://github.com' + href)

    # dict.fromkeys drops duplicates (within and across pages) while
    # keeping first-seen order, so the result is deterministic.
    return list(dict.fromkeys(repo_urls))

In [34]:
# Scrape every search-result page for the repository urls it lists.
all_urls = get_all_urls(urls)

In [35]:
len(all_urls)

160

In [9]:
def get_blog_articles(urls, cached=False):
    '''
    This function takes in a list of GitHub repository urls and a
    parameter with default cached == False which scrapes the programming
    language and README text for each url, creates a list of dictionaries
    with the language and content for each repo, converts the list to a
    df, writes it to 'github_repos.json', and returns the df.
    If cached is true, the function skips scraping and returns a df read
    from 'github_repos.json'.

    Repo pages where the language label or the README article cannot be
    found are skipped (with a printed message) rather than aborting the
    whole scrape.

    Returns a df with the columns 'language' and 'content'.
    '''
    if cached:
        return pd.read_json('github_repos.json')

    # cached is false: complete a fresh scrape for df

    # Create an empty list to hold dictionaries
    articles = []

    # Loop through each url in our list of urls
    for url in urls:

        # Make request and soup object using helper
        soup = make_soup(url)

        # Element holding the programming language of the repo
        language_tag = soup.find('span', class_='text-gray-dark text-bold mr-1')

        # Element holding the rendered README text of the repo
        content_tag = soup.find('article', class_="markdown-body entry-content container-lg")

        # find() returns None when nothing matches; calling .text on it
        # would raise AttributeError and lose everything scraped so far.
        if language_tag is None or content_tag is None:
            print(f'Skipping {url}: language or README not found')
            continue

        # Add a dictionary holding the language and content for this repo
        articles.append({'language': language_tag.text, 'content': content_tag.text})

    # Convert our list of dictionaries to a df; naming the columns keeps
    # the schema stable even when no repo could be scraped.
    df = pd.DataFrame(articles, columns=['language', 'content'])

    # Write df to a json file for faster access
    df.to_json('github_repos.json')

    return df

In [36]:
# Fresh scrape of every repo url (cached defaults to False); this also
# writes the result to 'github_repos.json'.
df = get_blog_articles(all_urls)

In [37]:
df.head()

Unnamed: 0,language,content
0,JavaScript,"\n\n\n\n\nBootstrap\n\n Sleek, intuitive, and..."
1,JavaScript,\n\n\n\n\n\n\n\n\n\n\nSupporting Vue.js\nVue.j...
2,JavaScript,\n\n\n\n\nfreeCodeCamp.org's open-source codeb...
3,JavaScript,Airbnb JavaScript Style Guide() {\nA mostly re...
4,JavaScript,Create React App \n\nCreate React apps with n...


In [11]:
# Same language lookup that get_blog_articles performs, run on a single page.
# NOTE(review): no visible top-level cell assigns `soup`; presumably it is
# left over from an earlier session -- confirm before re-running.
soup.find("span", class_ = "text-gray-dark text-bold mr-1").text

'JavaScript'

In [12]:
# Same README-text lookup that get_blog_articles performs, run on a single page.
# NOTE(review): no visible top-level cell assigns `soup`; presumably it is
# left over from an earlier session -- confirm before re-running.
soup.find('article', class_="markdown-body entry-content container-lg").text

"\n\n\n\n\nfreeCodeCamp.org's open-source codebase and curriculum\nfreeCodeCamp.org is a friendly community where you can learn to code for free. It is run by a donor-supported 501(c)(3) nonprofit to help millions of busy adults transition into tech. Our community has already helped more than 10,000 people get their first developer job.\nOur full-stack web development and machine learning curriculum is completely free and self-paced. We have thousands of interactive coding challenges to help you expand your skills.\nTable of Contents\n\nCertifications\nThe Learning Platform\nReporting Bugs and Issues\nReporting Security Issues and Responsible Disclosure\nContributing\nPlatform, Build and Deployment Status\nLicense\n\nCertifications\nfreeCodeCamp.org offers several free developer certifications. Each of these certifications involves building 5 required web app projects, along with hundreds of optional coding challenges to help you prepare for those projects. We estimate that each certif

---
## Prepare

---
## Explore

---
## Model

---
## Conclusion