In [1]:
from requests import get
from bs4 import BeautifulSoup
import os
import pandas as pd
import time
import prepare as p

In [2]:
"""
A module for obtaining repo readme and language data from the github API.
Before using this module, read through it, and follow the instructions marked
TODO.
After doing so, run it like this:
    python acquire.py
To create the `data.json` file that contains the data.
"""
import os
import json
from typing import Dict, List, Optional, Union, cast
import requests

from env import github_token, github_username

# TODO: Make a github personal access token.
#     1. Go here and generate a personal access token: https://github.com/settings/tokens
#        You do _not_ need to select any scopes, i.e. leave all the checkboxes unchecked
#     2. Save it in your env.py file under the variable `github_token`
# TODO: Add your github username to your env.py file under the variable `github_username`
# TODO: Add more repositories to the `REPOS` list below.

# Repositories to scrape, each written as "<owner>/<repository>".
REPOS = ["gocodeup/codeup-setup-script", "gocodeup/movies-application", "torvalds/linux"]

# Sent with every github API request: the personal access token for
# authorization and the account name as the user agent.
headers = {
    "Authorization": "token {}".format(github_token),
    "User-Agent": github_username,
}

# Stop right away when env.py still holds an empty token or an empty username.
if not (headers["Authorization"] != "token " and headers["User-Agent"] != ""):
    raise Exception(
        "You need to follow the instructions marked TODO in this script before trying to use it"
    )


def github_api_request(url: str, timeout: Optional[float] = 30.0) -> Union[List, Dict]:
    """
    Send an authenticated GET request to the github API and return the decoded
    JSON body.

    `url` is the full API url to request. `timeout` is the number of seconds to
    wait for the server before giving up (pass None to wait indefinitely).

    Raises an Exception carrying the status code and the response body whenever
    the status code is anything other than 200.
    """
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        # Inspect the status before decoding: an error body is not guaranteed
        # to be JSON, and a decode failure here would hide the real status code.
        try:
            error_body = json.dumps(response.json())
        except ValueError:
            error_body = response.text
        raise Exception(
            f"Error response from github api! status code: {response.status_code}, "
            f"response: {error_body}"
        )
    return response.json()


def get_repo_language(repo: str) -> Optional[str]:
    """
    Takes a repo name like "gocodeup/codeup-setup-script" and returns the value
    of the "language" key from the github API's description of that repo.

    The value is returned as-is, so it is None when the API reports a null
    language for the repo.

    Raises an Exception when the response is not a dictionary or has no
    "language" key.
    """
    url = f"https://api.github.com/repos/{repo}"
    repo_info = github_api_request(url)
    if not isinstance(repo_info, dict):
        raise Exception(
            f"Expecting a dictionary response from {url}, instead got {json.dumps(repo_info)}"
        )
    if "language" not in repo_info:
        raise Exception(
            "'language' key not found in response\n{}".format(json.dumps(repo_info))
        )
    return repo_info["language"]


def get_repo_contents(repo: str) -> List[Dict[str, str]]:
    """
    Takes a repo name like "gocodeup/codeup-setup-script" and returns the list
    that the github API sends back from that repo's `contents/` endpoint.

    Raises an Exception when the response is anything other than a list.
    """
    url = f"https://api.github.com/repos/{repo}/contents/"
    listing = github_api_request(url)
    # Reject everything that is not exactly a list before handing it back.
    if type(listing) is not list:
        raise Exception(
            f"Expecting a list response from {url}, instead got {json.dumps(listing)}"
        )
    return cast(List, listing)


def get_readme_download_url(files: List[Dict[str, str]]) -> str:
    """
    Takes in a response from the github api that lists the files in a repo and
    returns the url that can be used to download the repo's README file.

    The first entry whose name starts with "readme" (case-insensitive) and that
    has a usable download url wins. Returns an empty string when there is no
    such entry.
    """
    for file in files:
        if not file["name"].lower().startswith("readme"):
            continue
        # Entries without a download url (for example a directory that happens
        # to be named "readme...") cannot be fetched, so keep looking instead of
        # handing a null url back to the caller.
        download_url = file.get("download_url")
        if download_url:
            return download_url
    return ""


def process_repo(repo: str) -> Dict[str, str]:
    """
    Build the record for a single repo, given its name in the form
    "gocodeup/codeup-setup-script".

    The returned dictionary holds the repo name under "repo", the repo's
    language under "language" and the text of its README under
    "readme_contents" (an empty string when no README url was found).
    """
    readme_download_url = get_readme_download_url(get_repo_contents(repo))
    # Only hit the network when there is a README to download.
    readme_contents = (
        requests.get(readme_download_url).text if readme_download_url != "" else ""
    )
    language = get_repo_language(repo)
    return {"repo": repo, "language": language, "readme_contents": readme_contents}


def scrape_github_data() -> List[Dict[str, str]]:
    """
    Process every repo named in `REPOS`, in order, and return the list of
    resulting records.
    """
    processed = []
    for repo_name in REPOS:
        processed.append(process_repo(repo_name))
    return processed


if __name__ == "__main__":
    # Scrape every repo in REPOS and save the records as data.json.
    data = scrape_github_data()
    # The context manager closes the file even if serialization fails, so the
    # output is flushed and the handle is not leaked.
    with open("data.json", "w") as data_file:
        json.dump(data, data_file, indent=1)

In [3]:
#scrape_github_data()

In [4]:
def get_repo_urls() -> pd.DataFrame:
    """
    Return the repository links found on github's search results for "minecraft".

    The links are cached in the file `url_csv`. When that file already exists it
    is simply read back. Otherwise search result pages 1 through 150 are
    requested (sleeping 6 seconds after each request), the `href` of every
    `<a class="v-align-middle">` element is collected, and the links are
    written to `url_csv`.

    Either way the result is the DataFrame obtained by reading `url_csv`, so a
    fresh scrape and a cached run return the same shape of data.
    """
    if not os.path.isfile('url_csv'):
        all_links = []
        for i in range(1, 151):
            response = requests.get(f'https://github.com/search?p={i}&q=minecraft&type=Repositories').content
            bs = BeautifulSoup(response, 'html.parser')
            all_links.extend(l['href'] for l in bs.find_all('a', class_='v-align-middle'))
            # Pause between pages so the requests are spread out.
            time.sleep(6)
        pd.Series(all_links).to_csv('url_csv', index=False)
    # Read the cache back instead of returning the in-memory Series: previously
    # the scraping path returned nothing at all, while the cached path returned
    # this DataFrame.
    return pd.read_csv('url_csv')

In [5]:
#get_repo_urls()

In [6]:
#df['url'] = df['0'].str.replace('/','',n=1)

In [7]:
#df.url

In [8]:
#df.info()

In [9]:
#get_repo_urls()

In [10]:
#df = pd.read_csv('url_csv')

In [11]:
#df

In [12]:
#df.info()

In [13]:
#df['url'] = df['0'].str.replace('/','',n=1)

In [14]:
#df = df.drop(columns='0')

In [15]:
#df

In [16]:
#df.nunique()

In [17]:
#df.url

In [18]:
#REPOS = list(df.url)

In [19]:
#scraped_dict = scrape_github_data()

In [20]:
#scraped_dict

In [21]:
#scraped_df = pd.DataFrame(scraped_dict)

In [22]:
#scraped_df.info()

In [23]:
#scraped_df

In [24]:
#scraped_df.readme_contents

In [25]:
#scraped_df.dropna()

In [26]:
#chris_df = pd.read_csv('extra_hyperlink_list.csv')

In [27]:
#chris_df

In [28]:
#REPOS = list(chris_df.repo_link)

In [29]:
#REPOS

In [30]:
#scraped_dict = scrape_github_data()

In [31]:
#scraped_df = pd.DataFrame(scraped_dict)

In [32]:
#scraped_df.to_csv('scraped_data.csv')

In [33]:
#scraped_df

In [34]:
#url_list = pd.read_csv('large_url_csv.csv')

In [35]:
#url_list.info()

In [36]:
#url_list = list(url_list.repo_link)

In [37]:
#url_list

In [38]:
#REPOS = url_list

In [39]:
#scraped_dict = scrape_github_data()

In [40]:
#scraped_df = pd.DataFrame(scraped_dict)

In [41]:
#scraped_df

In [42]:
#scraped_df.to_csv('large_scraped_data.csv')

In [43]:
# Load the scrape results from large_scraped_data.csv (the file name used by
# the commented-out to_csv call in an earlier cell).
large_scraped_data = pd.read_csv('large_scraped_data.csv')

In [44]:
# Display the frame exactly as it was loaded.
large_scraped_data

Unnamed: 0.1,Unnamed: 0,repo,language,readme_contents
0,0,MinecraftForge/MinecraftForge,Java,
1,1,fogleman/Minecraft,Python,# Minecraft\n\nSimple Minecraft-inspired demo ...
2,2,itzg/docker-minecraft-server,Shell,[![Docker Pulls](https://img.shields.io/docker...
3,3,overviewer/Minecraft-Overviewer,Python,====================================\nMinecraf...
4,4,TheGreyGhost/MinecraftByExample,Java,MinecraftByExample [1.16.4]\n=================...
...,...,...,...,...
995,995,GeyserMC/PacketLib,Java,# PacketLib\nPacketLib is a library for packet...
996,996,Nic4Las/Minecraft-Enderite-Mod,Java,# Enderite Mod\n\n[![](http://cf.way2muchnoise...
997,997,grahamedgecombe/minecraft-installer,Groff,
998,998,espertus/blockly-lua,JavaScript,Blockly Lua allows you to program ComputerCraf...


In [45]:
# Show the column dtypes and non-null counts to see how much data is missing.
large_scraped_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   Unnamed: 0       1000 non-null   int64 
 1   repo             1000 non-null   object
 2   language         938 non-null    object
 3   readme_contents  943 non-null    object
dtypes: int64(1), object(3)
memory usage: 31.4+ KB


In [46]:
# Keep only the rows that have no missing value in any column.
large_scraped_data = large_scraped_data.dropna()

In [47]:
# Remove the 'Unnamed: 0' column, which .info() lists as an int64 column.
# errors='ignore' keeps this cell re-runnable: once the column is gone, running
# the cell again is a no-op instead of a KeyError.
large_scraped_data = large_scraped_data.drop(columns='Unnamed: 0', errors='ignore')

In [48]:
# Display the frame after the null rows and the 'Unnamed: 0' column were removed.
large_scraped_data

Unnamed: 0,repo,language,readme_contents
1,fogleman/Minecraft,Python,# Minecraft\n\nSimple Minecraft-inspired demo ...
2,itzg/docker-minecraft-server,Shell,[![Docker Pulls](https://img.shields.io/docker...
3,overviewer/Minecraft-Overviewer,Python,====================================\nMinecraf...
4,TheGreyGhost/MinecraftByExample,Java,MinecraftByExample [1.16.4]\n=================...
5,minecraft-dev/MinecraftDev,Kotlin,"<p align=""center""><a href=""https://minecraftde..."
...,...,...,...
994,Crossroads-Development/Crossroads,Java,# Crossroads\n\nA steampunk technology mod for...
995,GeyserMC/PacketLib,Java,# PacketLib\nPacketLib is a library for packet...
996,Nic4Las/Minecraft-Enderite-Mod,Java,# Enderite Mod\n\n[![](http://cf.way2muchnoise...
998,espertus/blockly-lua,JavaScript,Blockly Lua allows you to program ComputerCraf...


In [50]:
# Prepare the README text with prep_readme_data from the local `prepare` module
# (imported as `p`; its code is not part of this notebook).
# NOTE(review): judging by the output below, it prints a "Removed N stopwords"
# line per README and the result carries extra 'clean' and 'lemmatized'
# columns - confirm against prepare.py.
clean_df = p.prep_readme_data(large_scraped_data, 'readme_contents')

Removed 129 stopwords
---
Removed 4065 stopwords
---
Removed 191 stopwords
---
Removed 474 stopwords
---
Removed 142 stopwords
---
Removed 69 stopwords
---
Removed 7 stopwords
---
Removed 1267 stopwords
---
Removed 187 stopwords
---
Removed 31 stopwords
---
Removed 264 stopwords
---
Removed 201 stopwords
---
Removed 235 stopwords
---
Removed 232 stopwords
---
Removed 169 stopwords
---
Removed 215 stopwords
---
Removed 84 stopwords
---
Removed 64 stopwords
---
Removed 167 stopwords
---
Removed 134 stopwords
---
Removed 7 stopwords
---
Removed 119 stopwords
---
Removed 7 stopwords
---
Removed 170 stopwords
---
Removed 564 stopwords
---
Removed 1962 stopwords
---
Removed 70 stopwords
---
Removed 405 stopwords
---
Removed 561 stopwords
---
Removed 190 stopwords
---
Removed 610 stopwords
---
Removed 123 stopwords
---
Removed 64 stopwords
---
Removed 107 stopwords
---
Removed 224 stopwords
---
Removed 1042 stopwords
---
Removed 123 stopwords
---
Removed 202 stopwords
---
Removed 43 stopwords

In [51]:
# Display the prepared frame.
clean_df

Unnamed: 0,repo,language,readme_contents,clean,lemmatized
1,fogleman/Minecraft,Python,# Minecraft\n\nSimple Minecraft-inspired demo ...,minecraft simple minecraftinspired demo writte...,minecraft simple minecraftinspired demo writte...
2,itzg/docker-minecraft-server,Shell,[![Docker Pulls](https://img.shields.io/docker...,docker pullshttpsimgshieldsiodockerpullsitzgmi...,docker pullshttpsimgshieldsiodockerpullsitzgmi...
3,overviewer/Minecraft-Overviewer,Python,====================================\nMinecraf...,minecraft overviewer build status andrew brown...,minecraft overviewer build status andrew brown...
4,TheGreyGhost/MinecraftByExample,Java,MinecraftByExample [1.16.4]\n=================...,minecraftbyexample 1164 purpose minecraftbyexa...,minecraftbyexample 1164 purpose minecraftbyexa...
5,minecraft-dev/MinecraftDev,Kotlin,"<p align=""center""><a href=""https://minecraftde...",p aligncentera hrefhttpsminecraftdevorgimg src...,p aligncentera hrefhttpsminecraftdevorgimg src...
...,...,...,...,...,...
994,Crossroads-Development/Crossroads,Java,# Crossroads\n\nA steampunk technology mod for...,crossroads steampunk technology mod minecraft ...,crossroad steampunk technology mod minecraft i...
995,GeyserMC/PacketLib,Java,# PacketLib\nPacketLib is a library for packet...,packetlib packetlib library packetbased networ...,packetlib packetlib library packetbased networ...
996,Nic4Las/Minecraft-Enderite-Mod,Java,# Enderite Mod\n\n[![](http://cf.way2muchnoise...,enderite mod httpcfway2muchnoiseeuversions3992...,enderite mod httpcfway2muchnoiseeuversions3992...
998,espertus/blockly-lua,JavaScript,Blockly Lua allows you to program ComputerCraf...,blockly lua allows program computercraft turtl...,blockly lua allows program computercraft turtl...


In [52]:
# Save the prepared frame to clean_scraped_data.csv. The index is written as
# well (index=False is not passed), so a plain pd.read_csv of this file will
# show it as an extra unnamed column.
clean_df.to_csv('clean_scraped_data.csv')