In [4]:
import pandas as pd
import numpy as np

from requests import get
from bs4 import BeautifulSoup
import os
import json
from typing import Dict, List, Optional, Union, cast
import requests
import re
import time

from env import github_token, github_username

In [2]:
"""
A module for obtaining repo readme and language data from the github API.
Before using this module, read through it, and follow the instructions marked
TODO.
After doing so, run it like this:
    python acquire.py
To create the `data.json` file that contains the data.
"""

# TODO: Make a github personal access token.
#     1. Go here and generate a personal access token: https://github.com/settings/tokens
#        You do _not_ need select any scopes, i.e. leave all the checkboxes unchecked
#     2. Save it in your env.py file under the variable `github_token`
# TODO: Add your github username to your env.py file under the variable `github_username`
# TODO: Add more repositories to the `REPOS` list below.

REPOS = [
    "gocodeup/codeup-setup-script",
    "gocodeup/movies-application",
    "torvalds/linux",
]

headers = {"Authorization": f"token {github_token}", "User-Agent": github_username}

if headers["Authorization"] == "token " or headers["User-Agent"] == "":
    raise Exception(
        "You need to follow the instructions marked TODO in this script before trying to use it"
    )


def github_api_request(url: str) -> Union[List, Dict]:
    response = requests.get(url, headers=headers)
    response_data = response.json()
    if response.status_code != 200:
        raise Exception(
            f"Error response from github api! status code: {response.status_code}, "
            f"response: {json.dumps(response_data)}"
        )
    return response_data


def get_repo_language(repo: str) -> str:
    url = f"https://api.github.com/repos/{repo}"
    repo_info = github_api_request(url)
    if type(repo_info) is dict:
        repo_info = cast(Dict, repo_info)
        if "language" not in repo_info:
            raise Exception(
                "'language' key not round in response\n{}".format(json.dumps(repo_info))
            )
        return repo_info["language"]
    raise Exception(
        f"Expecting a dictionary response from {url}, instead got {json.dumps(repo_info)}"
    )


def get_repo_contents(repo: str) -> List[Dict[str, str]]:
    url = f"https://api.github.com/repos/{repo}/contents/"
    contents = github_api_request(url)
    if type(contents) is list:
        contents = cast(List, contents)
        return contents
    raise Exception(
        f"Expecting a list response from {url}, instead got {json.dumps(contents)}"
    )


def get_readme_download_url(files: List[Dict[str, str]]) -> str:
    """
    Takes in a response from the github api that lists the files in a repo and
    returns the url that can be used to download the repo's README file.
    """
    for file in files:
        if file["name"].lower().startswith("readme"):
            return file["download_url"]
    return ""


def process_repo(repo: str) -> Dict[str, str]:
    """
    Takes a repo name like "gocodeup/codeup-setup-script" and returns a
    dictionary with the language of the repo and the readme contents.
    """
    contents = get_repo_contents(repo)
    readme_download_url = get_readme_download_url(contents)
    if readme_download_url == "":
        readme_contents = ""
    else:
        readme_contents = requests.get(readme_download_url).text
    return {
        "repo": repo,
        "language": get_repo_language(repo),
        "readme_contents": readme_contents,
    }


def scrape_github_data() -> List[Dict[str, str]]:
    """
    Loop through all of the repos and process them. Returns the processed data.
    """
    return [process_repo(repo) for repo in REPOS]


if __name__ == "__main__":
    data = scrape_github_data()
    json.dump(data, open("data.json", "w"), indent=1)

In [4]:
#url = "https://github.com/topics/minecraft"
reqs = requests.get(url)
soup = BeautifulSoup(reqs.text, 'html.parser')
urls = []
for link in soup.find_all('a',class_="v-align-middle"):
    print(link.get('href'))

In [None]:
soup.find_all('div', class_="d-flex flex-1")

In [21]:
list_of_div_elements = soup.find_all('a', class_="data-hydro-click")
list_of_div_elements

[]

In [42]:
s = soup.find_all('h3', class_='f3 color-fg-muted text-normal lh-condensed')

In [54]:
s[0].find('a', class_="text-bold wb-break-word")['href']

'fogleman/Craft'

In [57]:
links =[]
all_minecraft_repos = soup.find_all('h3', class_='f3 color-fg-muted text-normal lh-condensed')
for each_minecraft_repo in all_minecraft_repos:
    #href_link = each_minecraft_repo.a.attrs["href"]
    href_link = each_minecraft_repo.find('a', class_="text-bold wb-break-word")['href'][1:]
    links.append(href_link)

In [58]:
links

['fogleman/Craft',
 'dnschneid/crouton',
 'PaperMC/Paper',
 'IdreesInc/Monocraft',
 'MinecraftForge/MinecraftForge',
 'nuno-faria/tiler',
 'cabaletta/baritone',
 'huanghongxun/HMCL',
 'cuberite/cuberite',
 'CaffeineMC/sodium-fabric',
 'MultiMC/Launcher',
 'GeyserMC/Geyser',
 'PojavLauncherTeam/PojavLauncher',
 'PrismarineJS/mineflayer',
 'pmmp/PocketMine-MP',
 'PrismLauncher/PrismLauncher',
 'EngineHub/WorldEdit',
 'IrisShaders/Iris',
 'feather-rs/feather',
 'Querz/mcaselector']

In [60]:
soup.find_all('a', class_="v-align-middle")

[]

In [92]:
url = 'https://github.com/search?3&q=minecraft&type=Repositories'

In [93]:
url

'https://github.com/search?3&q=minecraft&type=Repositories'

In [94]:
def get_minecraft_urls(url):
    reqs = requests.get(url)
    soup = BeautifulSoup(reqs.text, 'html.parser')
    urls = []
    for link in soup.find_all('a',class_="v-align-middle"):
        urls.append(link.get('href'))
    return urls

In [95]:
urls = get_minecraft_urls(url)
urls

['/MinecraftForge/MinecraftForge',
 '/fogleman/Minecraft',
 '/itzg/docker-minecraft-server',
 '/overviewer/Minecraft-Overviewer',
 '/TheGreyGhost/MinecraftByExample',
 '/minecraft-dev/MinecraftDev',
 '/jdah/minecraft-weekend',
 '/mouse0w0/MinecraftDeveloperGuide',
 '/Bukkit/Bukkit',
 '/TheRemote/MinecraftBedrockServer']

In [75]:
df_url = pd.DataFrame(urls, columns=['urls'])

In [77]:
#turn to csv
df_url.to_csv('df_urls.csv')

In [98]:
for i in range(3,53):
    url = f'https://github.com/search?{i}&q=minecraft&type=Repositories'
    url_list = []
    urls = get_minecraft_urls(url)
    url_list.append(urls)

In [100]:
url_list

[[]]

In [115]:
df.head()

Unnamed: 0,url
3,TheRemote/MinecraftBedrockServer
4,TheRemote/MinecraftBedrockServer


In [111]:
len(urls)

20

In [137]:
urls = []
for i in range(1,152):
    url = f'https://github.com/search?p={i}&q=minecraft&type=Repositories'
    reqs = requests.get(url)
    soup = BeautifulSoup(reqs.text, 'html.parser')
    time.sleep(5)
    
    for link in soup.find_all('a',class_="v-align-middle"):
        hyperlink = re.sub(r'/','', link.get('href'), count = 1)
        urls.append(hyperlink)
        time.sleep(1)

In [139]:
len(urls)

560

In [130]:
urls

['PrismarineJS/node-minecraft-protocol',
 'SpongePowered/SpongeAPI',
 'PrismarineJS/minecraft-data',
 'xPaw/PHP-Minecraft-Query',
 'samhogan/Minecraft-Unity3D',
 'GeyserMC/Geyser',
 'Hawstein/PyMinecraft',
 'jkutner/heroku-buildpack-minecraft',
 'walterhiggins/ScriptCraft',
 'doctorray117/minecraft-ondemand',
 'Ahtenus/minecraft-init',
 'Pierce01/MinecraftLauncher-core',
 'itzg/docker-minecraft-bedrock-server',
 'itsFrank/MinecraftHDL',
 'docker-archive/minecraft',
 'ldtteam/minecolonies',
 'YouHaveTrouble/minecraft-optimization',
 'dan200/ComputerCraft',
 'smith-j-travis/MinecraftAutoClicker',
 'toolbox4minecraft/amidst',
 'ReplayMod/ReplayMod',
 'pdinklag/MinecraftStats',
 'Wurst-Imperium/Wurst7',
 'MultiMC/Launcher',
 'pmmp/PocketMine-MP',
 'jdah/minecraft-again',
 'clear-code-projects/Minecraft-in-Python',
 'danba340/minecraft-freecodecamp',
 'LuckPerms/LuckPerms',
 'Advanced-Rocketry/AdvancedRocketry',
 'Minecraft-in-python/Minecraft',
 'overshard/docker-minecraft',
 'Fenixin/Mine

In [140]:
df2 = pd.DataFrame(columns=['repo_link'])
for i in range(len(urls)):
    df2.loc[i] = urls[i]

In [141]:
df2['repo_link'].nunique(), df.shape[0]

(556, 300)

In [6]:
#df = pd.read_csv(r'df_urls.csv', index_col=[0])

In [7]:
#df

Unnamed: 0,urls
0,/MinecraftForge/MinecraftForge
1,/fogleman/Minecraft
2,/itzg/docker-minecraft-server
3,/overviewer/Minecraft-Overviewer
4,/TheGreyGhost/MinecraftByExample
5,/minecraft-dev/MinecraftDev
6,/jdah/minecraft-weekend
7,/mouse0w0/MinecraftDeveloperGuide
8,/Bukkit/Bukkit
9,/TheRemote/MinecraftBedrockServer


In [3]:
url = f'https://github.com/search?q=minecraft'

In [8]:
df['urls'] = df['urls'].str.replace('/','',n=1)
df

Unnamed: 0,urls
0,MinecraftForge/MinecraftForge
1,fogleman/Minecraft
2,itzg/docker-minecraft-server
3,overviewer/Minecraft-Overviewer
4,TheGreyGhost/MinecraftByExample
5,minecraft-dev/MinecraftDev
6,jdah/minecraft-weekend
7,mouse0w0/MinecraftDeveloperGuide
8,Bukkit/Bukkit
9,TheRemote/MinecraftBedrockServer


In [9]:
"""
A module for obtaining repo readme and language data from the github API.
Before using this module, read through it, and follow the instructions marked
TODO.
After doing so, run it like this:
    python acquire.py
To create the `data.json` file that contains the data.
"""

# TODO: Make a github personal access token.
#     1. Go here and generate a personal access token: https://github.com/settings/tokens
#        You do _not_ need select any scopes, i.e. leave all the checkboxes unchecked
#     2. Save it in your env.py file under the variable `github_token`
# TODO: Add your github username to your env.py file under the variable `github_username`
# TODO: Add more repositories to the `REPOS` list below.

REPOS = list(df['urls'])

headers = {"Authorization": f"token {github_token}", "User-Agent": github_username}

if headers["Authorization"] == "token " or headers["User-Agent"] == "":
    raise Exception(
        "You need to follow the instructions marked TODO in this script before trying to use it"
    )


def github_api_request(url: str) -> Union[List, Dict]:
    response = requests.get(url, headers=headers)
    response_data = response.json()
    if response.status_code != 200:
        raise Exception(
            f"Error response from github api! status code: {response.status_code}, "
            f"response: {json.dumps(response_data)}"
        )
    return response_data


def get_repo_language(repo: str) -> str:
    url = f"https://api.github.com/repos/{repo}"
    repo_info = github_api_request(url)
    if type(repo_info) is dict:
        repo_info = cast(Dict, repo_info)
        if "language" not in repo_info:
            raise Exception(
                "'language' key not round in response\n{}".format(json.dumps(repo_info))
            )
        return repo_info["language"]
    raise Exception(
        f"Expecting a dictionary response from {url}, instead got {json.dumps(repo_info)}"
    )


def get_repo_contents(repo: str) -> List[Dict[str, str]]:
    url = f"https://api.github.com/repos/{repo}/contents/"
    contents = github_api_request(url)
    if type(contents) is list:
        contents = cast(List, contents)
        return contents
    raise Exception(
        f"Expecting a list response from {url}, instead got {json.dumps(contents)}"
    )


def get_readme_download_url(files: List[Dict[str, str]]) -> str:
    """
    Takes in a response from the github api that lists the files in a repo and
    returns the url that can be used to download the repo's README file.
    """
    for file in files:
        if file["name"].lower().startswith("readme"):
            return file["download_url"]
    return ""


def process_repo(repo: str) -> Dict[str, str]:
    """
    Takes a repo name like "gocodeup/codeup-setup-script" and returns a
    dictionary with the language of the repo and the readme contents.
    """
    contents = get_repo_contents(repo)
    readme_download_url = get_readme_download_url(contents)
    if readme_download_url == "":
        readme_contents = ""
    else:
        readme_contents = requests.get(readme_download_url).text
    return {
        "repo": repo,
        "language": get_repo_language(repo),
        "readme_contents": readme_contents,
    }


def scrape_github_data() -> List[Dict[str, str]]:
    """
    Loop through all of the repos and process them. Returns the processed data.
    """
    return [process_repo(repo) for repo in REPOS]


if __name__ == "__main__":
    data = scrape_github_data()
    json.dump(data, open("data.json", "w"), indent=1)

In [11]:
scraped_dict = scrape_github_data()

In [13]:
scraped_df = pd.DataFrame(scraped_dict)
scraped_df

Unnamed: 0,repo,language,readme_contents
0,MinecraftForge/MinecraftForge,Java,
1,fogleman/Minecraft,Python,# Minecraft\n\nSimple Minecraft-inspired demo ...
2,itzg/docker-minecraft-server,Shell,[![Docker Pulls](https://img.shields.io/docker...
3,overviewer/Minecraft-Overviewer,Python,====================================\nMinecraf...
4,TheGreyGhost/MinecraftByExample,Java,MinecraftByExample [1.16.4]\n=================...
5,minecraft-dev/MinecraftDev,Kotlin,"<p align=""center""><a href=""https://minecraftde..."
6,jdah/minecraft-weekend,C,"# Minecraft, but I made it in 48 hours*\n\n\* ..."
7,mouse0w0/MinecraftDeveloperGuide,,# 我的世界开发者中文指南\n![](https://img.shields.io/badg...
8,Bukkit/Bukkit,Java,Bukkit\n======\n\nA Minecraft Server API.\n\nW...
9,TheRemote/MinecraftBedrockServer,Shell,# Minecraft Bedrock Server\n\n[日本語版 README はこち...


In [14]:
df = pd.read_csv(r'scraped_data.csv', index_col=[0])
df

FileNotFoundError: [Errno 2] No such file or directory: 'scraped_data.csv'