In [None]:
import json
import os
from datetime import datetime

import pandas as pd
import requests as req

# Utils 

## Constants

In [None]:
# GitHub GraphQL API endpoint that every request in this notebook is sent to.
DOMAIN = 'https://api.github.com/graphql'
# Personal access token, read from the GITHUB_TOKEN environment variable so the
# secret never has to be pasted into (and saved with) the notebook.
# Falls back to an empty string, which was the previous hard-coded value.
TOKEN = os.environ.get('GITHUB_TOKEN', '')
HEADERS = {
  'Authorization': f'bearer {TOKEN}',
  'Content-Type': 'application/json'
}
# Reference instant for age calculations, captured once when this cell runs.
TODAY = datetime.now()
# Target number of repositories to collect.
NUMBER_OF_REPOSITORIES = 200

## Functions

In [None]:
def doPost(data: dict, timeout: float = 60) -> dict:
  """POST a GraphQL payload to DOMAIN and return the decoded JSON response.

  Args:
    data: JSON-serialisable request body (the callers in this notebook pass a
      dict with 'query' and 'variables' keys).
    timeout: Seconds to wait for the server before giving up. Without a
      timeout, requests waits indefinitely and a stalled connection would
      hang the notebook.

  Returns:
    The response body parsed from JSON, for HTTP 200 responses only.

  Raises:
    Exception: If the response status code is not 200; the message carries
      the status code and the raw response text.
    requests.exceptions.RequestException: Propagated from requests on
      connection errors or when the timeout expires.
  """
  response = req.post(DOMAIN, headers=HEADERS, json=data, timeout=timeout)
  if response.status_code != 200:
    raise Exception(f'Erro ao fazer requisição: {response.status_code} \n {response.text}')

  return response.json()

def analisar_createdAt(repositories: list, today: datetime = None):
  """Compute the age in years of each repository from its 'createdAt' stamp.

  Args:
    repositories: Records accepted by pd.DataFrame that yield a 'createdAt'
      column of strings formatted as '%Y-%m-%dT%H:%M:%SZ'.
    today: Reference date to measure ages against. Defaults to the
      module-level TODAY, so existing callers behave as before; passing an
      explicit value makes the result reproducible.

  Returns:
    A list of floats, one per repository: whole elapsed days divided by 365
    (leap days are not accounted for), rounded to two decimals.

  Raises:
    KeyError: If the records have no 'createdAt' column.
    ValueError: If a stamp does not match the expected format.
  """
  reference = TODAY if today is None else today
  created_at = pd.DataFrame(repositories)['createdAt']
  return [
    round((reference - datetime.strptime(stamp, '%Y-%m-%dT%H:%M:%SZ')).days / 365, 2)
    for stamp in created_at
  ]

# REQUESTS

## QUERY

In [None]:
# GraphQL search query: returns one page of repositories matching $queryString.
# For each repository it asks for the name with owner, the star count and the
# number of merged/closed pull requests, plus the pageInfo cursor needed to
# request the following page ($cursor is passed as `after`).
query_to_repos = '''
query search($queryString: String!, $perPage: Int!, $cursor: String) {
  search(query: $queryString, type: REPOSITORY, first: $perPage, after: $cursor) {
    edges {
      node {
        ... on Repository {
          nameWithOwner
          stargazers {
            totalCount
          }
          pullRequests(states: [MERGED, CLOSED], first: 100) {
            totalCount
          }
        }
      }
    }
    pageInfo {
      endCursor
      hasNextPage
    }
  }
}
'''


# GraphQL query: fetches the 100 most recently created merged/closed pull
# requests of one repository ($owner/$name). For each pull request it asks for
# identification (number, title), timestamps, the review decision, counts of
# participants, comments, reviews and changed files, the added/deleted line
# counts and the body text. The query takes no cursor, so only this single
# page of pull requests is retrieved per repository.
query_to_pullRequests = '''
query getPullRequests($owner: String!, $name: String!) {
  repository(owner: $owner, name: $name) {
    pullRequests(states: [MERGED, CLOSED], first: 100, orderBy: {field: CREATED_AT, direction: DESC}) {
      edges {
        node {
          number
          title
          createdAt
          closedAt
          mergedAt
          reviewDecision
          participants {
            totalCount
          }
          comments {
            totalCount
          }
          reviews(first: 1) {
            totalCount
            edges {
              node {
                createdAt
                updatedAt
                state
              }
            }
          }
          files {
            totalCount
          }
          additions
          deletions
          bodyText
        }
      }
    }
  }
}
'''

In [None]:
# Page through the repository search until NUMBER_OF_REPOSITORIES repositories
# with at least 100 merged/closed pull requests have been collected, the search
# runs out of pages, or the API reports an error.
per_page = 1
cursor = None
repositories = []
query_string = "stars:>0"

while len(repositories) < NUMBER_OF_REPOSITORIES:
    variables = {
        "queryString": query_string,
        "perPage": per_page,
        "cursor": cursor
    }

    data = doPost(data={'query': query_to_repos, 'variables': variables})
    if 'errors' in data:
        print("GraphQL query failed:", data['errors'])
        break

    search = data['data']['search']
    for edge in search['edges']:
        # Stop at the configured target (previously a hard-coded 200 that could
        # silently disagree with NUMBER_OF_REPOSITORIES) so a page larger than
        # one result cannot overshoot it.
        if len(repositories) >= NUMBER_OF_REPOSITORIES:
            break

        # Keep only repositories with at least 100 merged/closed pull requests.
        if edge['node']['pullRequests']['totalCount'] >= 100:
            repositories.append(edge['node'])

    if not search['pageInfo']['hasNextPage']:
        break
    cursor = search['pageInfo']['endCursor']

print("Total repositories: ", len(repositories))
print("Cursor: ", cursor)
print("Per page: ", per_page)

# Persist the raw (still nested) search results.
data_brutus = pd.DataFrame(repositories)
data_brutus.to_csv('./dataset/dados_base.csv', index=False, sep=';')


## Data processing

In [None]:
# Flatten the nested search results into one row per repository.
# Both nested counts are read the same way: a missing or null connection
# yields 0 instead of raising (previously only 'stargazers' had a default).
dataFrame_tratado = pd.DataFrame({
    'Repositorio': [repo.get('nameWithOwner') for repo in repositories],
    'Estrelas': [(repo.get('stargazers') or {}).get('totalCount', 0) for repo in repositories],
    'nº Pull Requests': [(repo.get('pullRequests') or {}).get('totalCount', 0) for repo in repositories],
})

dataFrame_tratado.to_csv('./dataset/dados_tratados.csv', index=False, sep=';')

# Last expression of the cell, so the preview is actually rendered.
dataFrame_tratado.head()

## Requests to Pull Requests

In [None]:
# Collect, for every repository in the treated dataset, one row per pull
# request returned by query_to_pullRequests. The values are stored in the
# parallel lists below, which are later turned into DataFrame columns.
data_treats = pd.read_csv('./dataset/dados_tratados.csv', sep=';')

names = []
owners = []
title = []
number = []
createdAt = []
closedAt = []
mergedAt = []
reviewDecision = []
reviews = []
total_files = []
additions = []
deletions = []
bodyText = []
comments = []
participants = []

# Maps each pull-request field of the query to the list that collects it, so a
# row is always appended to every list together and the lists keep equal length.
pr_columns = {
    'title': title,
    'number': number,
    'createdAt': createdAt,
    'closedAt': closedAt,
    'mergedAt': mergedAt,
    'reviewDecision': reviewDecision,
    'reviews': reviews,
    'participants': participants,
    'comments': comments,
    'files': total_files,
    'additions': additions,
    'deletions': deletions,
    'bodyText': bodyText,
}
# Fields returned as nested connections, of which only totalCount is kept.
pr_count_fields = ('reviews', 'participants', 'comments', 'files')


def _pull_request_row(values):
    """Flatten one pull-request node into a {field: value} dict keyed like pr_columns."""
    row = {}
    for field in pr_columns:
        value = values[field]
        if field in pr_count_fields:
            # The API may answer null for a connection (presumably e.g. `files`
            # - verify against the GitHub schema); store None instead of failing.
            value = value['totalCount'] if value is not None else None
        row[field] = value
    return row


for nameWithOwner in data_treats['Repositorio']:
    print(nameWithOwner)
    owner, name = nameWithOwner.split('/')
    variables = {
        "owner": owner,
        "name": name,
    }

    try:
        data = doPost(data={'query': query_to_pullRequests, 'variables': variables})

        if 'errors' in data:
            print("GraphQL query failed:", data['errors'])

        # Build every row before touching the lists: a failure here can no
        # longer leave some lists one element longer than the others.
        rows = [
            _pull_request_row(edge['node'])
            for edge in data['data']['repository']['pullRequests']['edges']
        ]
    except Exception as ex:
        # Best effort: report the failure and keep a single all-None placeholder
        # row so the repository still appears in the final dataset.
        print(f"Could not collect pull requests for {nameWithOwner}: {ex!r}")
        rows = [dict.fromkeys(pr_columns)]

    for row in rows:
        names.append(name)
        owners.append(owner)
        for field, column in pr_columns.items():
            column.append(row[field])

In [None]:
# Star count of each repository, keyed by "owner/name".
# data_treats has one row per repository while the lists below have one entry
# per pull request, so assigning data_treats['Estrelas'] directly would align by
# row index and attach the stars to the wrong rows (and NaN to the rest).
stars_by_repo = dict(zip(data_treats['Repositorio'], data_treats['Estrelas']))

# Assemble the per-pull-request dataset in a single construction.
dataFrame_final = pd.DataFrame({
    'Proprietário': owners,
    'Repositorio': names,
    'Estrelas': [stars_by_repo.get(f'{owner}/{name}') for owner, name in zip(owners, names)],
    'Título': title,
    'Número': number,
    'Criado em': createdAt,
    'fechado em': closedAt,
    'merge em': mergedAt,
    'decisão da review': reviewDecision,
    'Total de Reviews': reviews,
    'Total de arquivos': total_files,
    'Adições': additions,
    'exclusões': deletions,
    'Texto de corpo': bodyText,
    'total de participantes': participants,
    'total de comentários': comments,
})

dataFrame_final.to_csv('./dataset/dados_tratados_pulRequests.csv', index=False, sep=';')


In [None]:
# Preview the first rows of the assembled pull-request dataset.
dataFrame_final.head()