In [1]:
import requests
import os
from dotenv import load_dotenv
load_dotenv()
import json
from bs4 import BeautifulSoup
import pandas as pd
from datetime import datetime

In [2]:
def gen_url(n):
    base_url = "https://api.github.com"
    owner = "ironhack-datalabs"
    repo = "datamad0820"
    query = f"?per_page=100&page={n}&state=all"
    endpoint = f"/repos/{owner}/{repo}/pulls"
    return f"{base_url}{endpoint}{query}"

load_dotenv()
authkey = os.getenv("AUTHKEY")
headers = {"Authorization": f"Bearer {authkey}"}

In [3]:
gen_url_res = []

page = 1
while True:
    url_res = requests.get(gen_url(page),headers=headers).json()
    if not url_res:
        break
    gen_url_res += url_res
    page += 1
 


In [4]:

#Obtenemos de la API la lista de las pull requests.
def getPullHtml(number):
    res = requests.get(f'https://github.com/ironhack-datalabs/datamad0820/pull/{number}')
    soup = BeautifulSoup(res.text, 'html.parser')
    return soup

#Creamos una lista con los memes por user.
def pullMemeLst(soup):
    images_lst = []
    images = soup.find_all("img")
    for x in images:
        if "user-images" in x['src']:
            images_lst.append(x['src'])
    return images_lst

#Buscamos los users con @, función que también nos sirve para obtener los users de los "join" porque en la corrección
#se les menciona con una @

#Ejemplo
#print(pullMentions(getPullHtml(385)))

#output: {'jorge-alamillos'}

def pullMentions(soup):
    user_mentions = set()
    names = soup.find_all("a")
    for x in names:
        if x.has_attr("class") and "user-mention" in x['class']:
            user_mentions.add(x.getText().split('@')[1])
    return user_mentions

#Pasamos las fechas del último commit y de la hora de cierre a formato datetime para operar y luego lo volvemos a
#pasar a json
def to_datetime(date_str):
    return datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ")

def dateStr(datetime):
    return datetime.strftime("%Y-%m-%dT%H:%M:%SZ")

#Para obtener el último commit realizamos una nueva llamada a la API que nos devuelve una lista con los commits, añadimos a 
#una lista vacía la información que nos interesa de toda la obtenida, y, como puede haber varios commits, para quedarnos
#con el último hacemos el max.

def lastCommitDate(pull_number):
    commit_list = requests.get(f"https://api.github.com/repos/ironhack-datalabs/datamad0820/pulls/{pull_number}/commits", headers=headers).json()

    dates = []

    for commit in commit_list:
        dates.append(to_datetime(commit['commit']['author']['date']))

    return dateStr(max(dates))



In [10]:

#Creamos varios json vacíos para empezar a meter la info y organizarla para después exportarla a MongoDB.
lst = {}
pulls = {}
labs = {}
names = {}
for pull in gen_url_res:
    number = pull["number"]
    creator = pull["user"]["login"]
    pull_id = pull["id"]
    lab = {}
    labTitle = pull["title"]
    if "[" in labTitle:
        labTitle = labTitle.split("[")[1]
    if "]" in labTitle:
        labTitle = labTitle.split("]")[0]
    labTitle = labTitle.replace(" ", "-").lower()
    lab = labTitle
    soup = getPullHtml(number)
    meme_lst= pullMemeLst(soup)
    
    if creator in lst:
        lst[creator] = lst[creator]+ meme_lst
    else:
        lst.update({creator : meme_lst})
    mentions = pullMentions(soup)
    for mention in mentions:
        if mention in lst:
            lst[mention] = lst[mention]+ meme_lst
        else:
            lst.update({mention : meme_lst})
    json_pull = {}
    json_pull["users"] = [creator]+list(mentions)
    json_pull["created_at"] = pull["created_at"]
    
    closed_at = pull["closed_at"]
    json_pull["closed_at"] = closed_at
    
    last_commit_date = lastCommitDate(number)
    json_pull["last_commit_time"] = last_commit_date
    
    if closed_at:
        closed_at_datetime = to_datetime(closed_at)
        last_commit_date_datetime = to_datetime(last_commit_date)
        json_pull['instructor_grade_time'] = (closed_at_datetime-last_commit_date_datetime).total_seconds() / 3600
        
    json_pull["memes_lst"] = meme_lst
    json_pull["state"] = pull['state']
    pulls[str(pull_id)] = json_pull
    if lab in labs:
        labs[lab] = labs[lab]+ [pull_id]
    else:
        labs.update({lab : [pull_id]})
         
    if creator in names:
        names[creator] = names[creator] + [lab]
    else:
        names.update({creator : [lab]})
    for mention in mentions:
        if mention in names:
            names[mention] = names[mention] + [lab]
        else:
            names.update({mention : [lab]})
            

In [None]:
#Exportación de datos en formato json

In [12]:
pullsToExport = []

for k, v in pulls.items():
    pullsToExport.append({**v, **{"pull_id": k}})
    
#número de pull requests obtenidas 
print(len(pullsToExport))

518


In [13]:
with open('pulls.json', 'w') as json_file:
    json.dump(pullsToExport, json_file)

In [15]:
labsToExport = []

for k, v in labs.items():
    v = [str(x) for x in v]
    labsToExport.append({"lab_id": k, "pulls_list": v})

In [16]:
#jsonlabs 

with open('labs.json', 'w') as json_file:
    json.dump(labsToExport, json_file)

In [17]:
lstToExport = []

for k, v in lst.items():
    lstToExport.append({"name": k, "memes_list": v})

In [18]:
#jsonlst 

with open('lst.json', 'w') as json_file:
    json.dump(lstToExport, json_file)

In [19]:
namesToExport = []

for k, v in names.items():
    namesToExport.append({"name": k, "labs": v})

In [20]:
#jsonnames

with open('names.json', 'w') as json_file:
    json.dump(namesToExport, json_file)

In [144]:
#Testfunction

def test(pull):
    lst = {}
    names = set()
    pulls = {}
    labs = {}
    number = pull["number"]
    lab = pull["title"].split("[")[1].split("]")[0]


    creator = pull["user"]["login"]
    pull_id = pull["id"]
    soup = getPullHtml(number)
    meme_lst= pullMemeLst(soup)
    names.add(creator)
    
    if creator in lst:
        lst[creator] = lst[creator]+ meme_lst
    else:
        lst.update({creator : meme_lst})
    mentions = pullMentions(soup)
    for mention in mentions:
        names.add(mention)
        if mention in lst:
            lst[mention] = lst[mention]+ meme_lst
        else:
            lst.update({mention : meme_lst})
    json_pull = {}
    json_pull["users"] = [creator]+list(mentions)
    json_pull["created_at"] = pull["created_at"]
    
    closed_at = pull["closed_at"]
    json_pull["closed_at"] = closed_at
    
    last_commit_date = lastCommitDate(number)
    json_pull["last_commit_time"] = last_commit_date
    
    if closed_at:
        closed_at_datetime = to_datetime(closed_at)
        last_commit_date_datetime = to_datetime(last_commit_date)
        json_pull['instructor_grade_time'] = (closed_at_datetime-last_commit_date_datetime).total_seconds() / 3600
    
    json_pull["memes_lst"] = meme_lst
    pulls[str(pull_id)] = json_pull
    if lab in labs:
        labs[lab] = labs[lab]+ [pull_id]
    else:
        labs.update({lab : [pull_id]})
        
    names = {}
    if creator in names:
        names[creator] = names[creator] + [lab]
    else:
        names.update({creator : lab})
    for mention in mentions:
        if mention in names:
            names[mention] = names[mention] + [lab]
        else:
            names.update({mention : lab})
    
    print(names)
    
for pull in gen_url_res:
    if pull['number'] == 526:
        test(pull)

{'bmedm': 'lab-tableau-data-visualization'}
{'bmedm': 'lab-tableau-data-visualization'}
{'bmedm': 'lab-tableau-data-visualization'}
{'bmedm': 'lab-tableau-data-visualization'}
{'bmedm': 'lab-tableau-data-visualization'}
