In [None]:
import os

import requests
from pyspark.sql.functions import udf, regexp_replace, to_date, col, regexp_extract, date_format
from pyspark.sql.types import StructType, StringType, StructField, IntegerType, DateType


In [None]:
# GitHub account whose followers are extracted.
user = 'elonmuskceo'
# Personal access token, read from the GITHUB_TOKEN environment variable so the
# secret is never stored in the notebook. Empty means unauthenticated requests.
token = os.environ.get('GITHUB_TOKEN', '')
# Send the Authorization header only when a token is configured, instead of a
# header that carries an empty credential.
headers = {"Authorization": f"token {token}"} if token else {}
# Output location handed to the CSV writer and reader further down.
path = '/app/output/'

## Extração

In [None]:
def get_followers(user, headers, per_page=100, timeout=30):
    """Return every follower record of a GitHub user.

    Walks the paginated ``/users/{user}/followers`` endpoint until an empty
    page is returned.

    Args:
        user: GitHub login whose followers are listed.
        headers: HTTP headers sent with every request (e.g. Authorization).
        per_page: Number of records requested per page; larger pages mean
            fewer round trips.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A list with the follower objects (dicts) of all pages, in API order.

    Raises:
        Whatever ``response.raise_for_status()`` raises for a 4xx/5xx answer
        (``requests.HTTPError`` with the real library).
        ValueError: if a successful response does not contain a JSON list.
    """
    followers = []
    page = 1
    while True:
        response = requests.get(
            f'https://api.github.com/users/{user}/followers',
            headers=headers,
            params={'page': page, 'per_page': per_page},
            timeout=timeout,
        )
        # Error answers (bad credentials, rate limit, unknown user) carry a
        # non-empty JSON object; without this check its keys would be appended
        # to the result and the loop would never reach an empty page.
        response.raise_for_status()
        batch = response.json()
        if not isinstance(batch, list):
            raise ValueError(
                f"Unexpected followers payload for {user!r} on page {page}: {batch!r}"
            )
        if not batch:
            break
        followers.extend(batch)
        page += 1
    return followers

In [None]:
# Download the follower records of `user` from the GitHub API, page by page.
response_followers = get_followers(user, headers)

In [None]:
# Keep only the login of each follower record returned by the API.
users = [{'login': follower['login']} for follower in response_followers]
# An explicit schema replaces schema inference from dicts, which cannot work
# when the follower list is empty.
followers_df = spark.createDataFrame(
    [(entry['login'],) for entry in users],
    schema=StructType([StructField("login", StringType(), True)]),
)

In [None]:
# Render the follower logins as a table to sanity-check the extraction.
# NOTE(review): DataFrame.display() appears to come from the notebook runtime
# rather than open-source PySpark — confirm the target environment provides it.
followers_df.display()

login
rccomp
nrupatunga
sdlonn
banekondic1996
nateraw
matiasignacio28
Jarvangod
brunoabcabral
Walinestlouis
jesuscaesar


In [None]:

def get_github_user_info(login):
    """Fetch the public profile fields of one GitHub user.

    Args:
        login: GitHub login to look up; may be None for a null row.

    Returns:
        A 9-tuple ``(name, company, blog, email, bio, public_repos,
        followers, following, created_at)``. Every element is None when the
        login is missing, the request fails, or the API does not answer 200.
    """
    empty_profile = (None,) * 9
    # A null login would otherwise be formatted into the URL as the text "None".
    if not login:
        return empty_profile

    url = f"https://api.github.com/users/{login}"
    # Authenticate only when a token is configured instead of sending a header
    # with an empty credential.
    headers = {'Authorization': f'token {token}'} if token else {}
    try:
        # The timeout keeps one stalled connection from hanging the caller.
        response = requests.get(url, headers=headers, timeout=30)
    except requests.RequestException:
        # Same best-effort contract as a non-200 answer: report "no data".
        return empty_profile

    if response.status_code != 200:
        return empty_profile

    user_info = response.json()
    return tuple(
        user_info.get(field)
        for field in (
            'name',
            'company',
            'blog',
            'email',
            'bio',
            'public_repos',
            'followers',
            'following',
            'created_at',
        )
    )
    
# Spark UDF wrapping get_github_user_info; the struct fields follow the order
# of the tuple the function returns.
# The lookup performs network I/O, so it is flagged as non-deterministic:
# Spark treats UDFs as deterministic by default and may then duplicate or
# drop invocations while optimizing the query.
get_github_user_info_udf = udf(get_github_user_info, StructType([
    StructField("name", StringType(), True),
    StructField("company", StringType(), True),
    StructField("blog", StringType(), True),
    StructField("email", StringType(), True),
    StructField("bio", StringType(), True),
    StructField("public_repos", IntegerType(), True),
    StructField("followers", IntegerType(), True),
    StructField("following", IntegerType(), True),
    StructField("created_at", StringType(), True)
])).asNondeterministic()

In [None]:
# Add a struct column with the GitHub profile looked up for each follower login.
df_with_github_info = followers_df.withColumn(
    "github_info",
    get_github_user_info_udf(col("login")),
)

In [None]:
# Flatten the github_info struct: login first, then one top-level column per
# profile field, each named after the field it comes from.
df_with_github_info = df_with_github_info.select(
    col("login"),
    *[
        col("github_info").getItem(field_name).alias(field_name)
        for field_name in (
            "name",
            "company",
            "blog",
            "email",
            "bio",
            "public_repos",
            "followers",
            "following",
            "created_at",
        )
    ],
)


## Tratamentos

In [None]:
# Clean-up step: drop every "@" from company and rewrite created_at as a
# dd/MM/yyyy string.
df_with_github_info = (
    df_with_github_info
    .withColumn("company", regexp_replace(col("company"), "@", ""))
    .withColumn("created_at", date_format(to_date(col("created_at")), "dd/MM/yyyy"))
)

## Carga dos dados

In [None]:
# Export the cleaned frame as CSV with a header row, overwriting previous output at `path`.
df_with_github_info.write.mode("overwrite").option("header", True).csv(path)

## Testes

### Tratamento de remoção do @ da coluna company

In [None]:
# Load the exported CSV back for the checks below; multiLine is enabled so a
# quoted value containing line breaks is read as a single field.
df_test = spark.read.csv(path, header=True, multiLine=True)

In [None]:
# Report whether any exported company value still contains an "@".
print(
    "Existe pelo menos um '@' na coluna 'company'."
    if df_test.filter(col("company").like("%@%")).count() > 0
    else "Não existe '@' na coluna 'company'."
)

Não existe '@' na coluna 'company'.


### Verificação se a coluna created_at está no formato dd/mm/yyyy

In [None]:
# Two digits / two digits / four digits, anchored to the whole value.
pattern = r'^\d{2}/\d{2}/\d{4}$'
# The check passes only when the number of rows matching the pattern equals
# the total number of rows.
print(
    "A coluna 'created_at' está com as datas no formato dd/mm/yyyy."
    if df_test.filter(regexp_extract(col("created_at"), pattern, 0) != "").count() == df_test.count()
    else "A coluna 'created_at' não está com todas as datas no formato dd/mm/yyyy."
)

A coluna 'created_at' está com as datas no formato dd/mm/yyyy.
