In [None]:
import requests
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, StructField, IntegerType, Row

class DataExtractorIfood:
    """Driver-side extractor of a GitHub user's followers and their profiles.

    Every HTTP call is made from the driver process with ``requests``; the
    results are then turned into Spark DataFrames.

    NOTE(review): a later cell in this notebook defines another class with the
    same name, which replaces this one once that cell has run.
    """

    # Seconds to wait on GitHub before giving up; ``requests`` applies no
    # timeout unless one is passed explicitly.
    REQUEST_TIMEOUT = 10
    # Followers requested per page (GitHub documents 100 as the maximum).
    PAGE_SIZE = 100

    def __init__(self, spark: SparkSession, user: str, token: str):
        """Store the Spark session, the target user and the auth header.

        Args:
            spark: Session used to build the resulting DataFrames.
            user: GitHub login whose followers are extracted.
            token: GitHub token sent as ``Authorization: token <token>``.
        """
        self.spark = spark
        self.user = user
        self.headers = {"Authorization": f"token {token}"}

    def get_followers(self):
        """Return a single-column (``login``) DataFrame of the user's followers.

        Pages are requested until GitHub answers with an empty list.

        Raises:
            requests.HTTPError: if GitHub answers with an error status. Without
                this check an error payload (a non-empty dict) would never
                satisfy the "empty page" exit condition and the loop would
                not terminate.
        """
        url = f'https://api.github.com/users/{self.user}/followers'
        logins = []
        page = 1
        while True:
            response = requests.get(
                url,
                headers=self.headers,
                params={"page": page, "per_page": self.PAGE_SIZE},
                timeout=self.REQUEST_TIMEOUT,
            )
            response.raise_for_status()
            batch = response.json()
            if not batch:
                break
            logins.extend((follower['login'],) for follower in batch)
            page += 1

        # An explicit schema keeps this working when the user has no followers
        # (Spark cannot infer a schema from an empty list).
        login_schema = StructType([StructField("login", StringType(), True)])
        return self.spark.createDataFrame(logins, login_schema)

    def get_github_user_info(self, login):
        """Fetch the public profile of ``login``.

        Returns:
            A 9-tuple ``(name, company, blog, email, bio, public_repos,
            followers, following, created_at)``; every element is ``None``
            when GitHub does not answer with status 200.
        """
        fields = (
            'name', 'company', 'blog', 'email', 'bio',
            'public_repos', 'followers', 'following', 'created_at',
        )
        response = requests.get(
            f"https://api.github.com/users/{login}",
            headers=self.headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        if response.status_code != 200:
            return (None,) * len(fields)
        user_info = response.json()
        return tuple(user_info.get(field) for field in fields)

    def enrich_with_github_info(self, df):
        """Build a DataFrame with the profile of every login in ``df``.

        ``df`` is collected to the driver and one request is made per row, so
        this is sequential. The output does not carry the ``login`` column.
        """
        schema = StructType([
            StructField("name", StringType(), True),
            StructField("company", StringType(), True),
            StructField("blog", StringType(), True),
            StructField("email", StringType(), True),
            StructField("bio", StringType(), True),
            StructField("public_repos", IntegerType(), True),
            StructField("followers", IntegerType(), True),
            StructField("following", IntegerType(), True),
            StructField("created_at", StringType(), True)
        ])

        # The tuples returned by get_github_user_info are already in schema
        # order, so they can be handed to Spark positionally.
        records = [self.get_github_user_info(row['login']) for row in df.collect()]
        return self.spark.createDataFrame(records, schema)

    def execute_extract_api(self):
        """Run the full extraction: list followers, then fetch each profile."""
        followers_df = self.get_followers()
        return self.enrich_with_github_info(df=followers_df)

    def read_csv(self, path):
        """Read a CSV with a header row, letting Spark infer column types."""
        return self.spark.read.csv(path, header=True, inferSchema=True)

    def get_count_followers(self):
        """Return the ``followers`` value GitHub reports for ``self.user``.

        Returns ``None`` when the response body has no ``followers`` key.
        """
        response = requests.get(
            f"https://api.github.com/users/{self.user}",
            headers=self.headers,
            timeout=self.REQUEST_TIMEOUT,
        )
        return response.json().get('followers')


In [None]:
import requests
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StringType, StructField, IntegerType
from pyspark.sql.functions import udf


def get_github_user_info(login, token, timeout=10):
    """Fetch the public GitHub profile of ``login``.

    Args:
        login: GitHub login to look up. A null/empty login yields the
            all-``None`` tuple without calling the API.
        token: GitHub token sent as ``Authorization: token <token>``.
        timeout: Seconds to wait for GitHub. ``requests`` applies no timeout
            unless one is passed, so without it a stalled connection would
            block the caller indefinitely.

    Returns:
        A 9-tuple ``(name, company, blog, email, bio, public_repos, followers,
        following, created_at)``; every element is ``None`` when GitHub does
        not answer with status 200.
    """
    fields = (
        'name', 'company', 'blog', 'email', 'bio',
        'public_repos', 'followers', 'following', 'created_at',
    )
    missing = (None,) * len(fields)

    # Nothing to look up: skip the request instead of hitting /users/None.
    if not login:
        return missing

    url = f"https://api.github.com/users/{login}"
    headers = {"Authorization": f"token {token}"}
    response = requests.get(url, headers=headers, timeout=timeout)
    if response.status_code != 200:
        return missing

    user_info = response.json()
    return tuple(user_info.get(field) for field in fields)

class DataExtractorIfood:
    """Extractor of a GitHub user's followers and their public profiles.

    The follower list is fetched on the driver; the per-follower profile
    lookup runs inside a Spark UDF that calls the module-level
    ``get_github_user_info``.
    """

    # Seconds to wait on GitHub before giving up; ``requests`` applies no
    # timeout unless one is passed explicitly.
    REQUEST_TIMEOUT = 10
    # Followers requested per page (GitHub documents 100 as the maximum).
    PAGE_SIZE = 100

    def __init__(self, spark: SparkSession, user: str, token: str):
        """Store the Spark session, the target user and the GitHub token."""
        self.spark = spark
        self.user = user
        self.token = token

    def _auth_headers(self):
        """Build the ``Authorization`` header sent with every driver-side call."""
        return {"Authorization": f"token {self.token}"}

    def get_followers(self):
        """Return a single-column (``login``) DataFrame of the user's followers.

        Pages are requested until GitHub answers with an empty list.

        Raises:
            requests.HTTPError: if GitHub answers with an error status. Without
                this check an error payload (a non-empty dict) would never
                satisfy the "empty page" exit condition and the loop would
                not terminate.
        """
        url = f'https://api.github.com/users/{self.user}/followers'
        logins = []
        page = 1
        while True:
            response = requests.get(
                url,
                headers=self._auth_headers(),
                params={"page": page, "per_page": self.PAGE_SIZE},
                timeout=self.REQUEST_TIMEOUT,
            )
            response.raise_for_status()
            batch = response.json()
            if not batch:
                break
            logins.extend((follower['login'],) for follower in batch)
            page += 1

        # An explicit schema keeps this working when the user has no followers
        # (Spark cannot infer a schema from an empty list).
        login_schema = StructType([StructField("login", StringType(), True)])
        return self.spark.createDataFrame(logins, login_schema)

    def enrich_with_github_info(self, df):
        """Return one row of profile fields per ``login`` in ``df``.

        The lookup is a Python UDF returning a struct, which is then expanded
        into top-level columns. The output does not carry the ``login`` column.
        """
        schema = StructType([
            StructField("name", StringType(), True),
            StructField("company", StringType(), True),
            StructField("blog", StringType(), True),
            StructField("email", StringType(), True),
            StructField("bio", StringType(), True),
            StructField("public_repos", IntegerType(), True),
            StructField("followers", IntegerType(), True),
            StructField("following", IntegerType(), True),
            StructField("created_at", StringType(), True)
        ])

        # Copy the token to a local so the UDF closure captures only this
        # string rather than `self` (which also holds the SparkSession).
        token = self.token

        def fetch_profile(login):
            return get_github_user_info(login, token)

        # The UDF performs a network call, so it is declared nondeterministic:
        # Spark may otherwise invoke a UDF more times than it appears in the
        # query, multiplying the number of GitHub requests.
        fetch_profile_udf = udf(fetch_profile, returnType=schema).asNondeterministic()

        with_struct = df.withColumn("github_info", fetch_profile_udf(df["login"]))
        # Expand the struct into its nine fields, in schema order.
        return with_struct.select("github_info.*")

    def execute_extract_api(self):
        """Run the full extraction: list followers, then fetch each profile."""
        followers_df = self.get_followers()
        return self.enrich_with_github_info(df=followers_df)

    def read_csv(self, path):
        """Read a CSV with a header row, letting Spark infer column types."""
        return self.spark.read.csv(path, header=True, inferSchema=True)

    def get_count_followers(self):
        """Return the ``followers`` value GitHub reports for ``self.user``.

        Returns ``None`` when the response body has no ``followers`` key.
        """
        response = requests.get(
            f"https://api.github.com/users/{self.user}",
            headers=self._auth_headers(),
            timeout=self.REQUEST_TIMEOUT,
        )
        return response.json().get('followers')


In [None]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import regexp_replace, to_date, date_format

class DataLoaderIfood:
    """Writes DataFrames as CSV under a fixed output path."""

    def __init__(self, path: str):
        """Remember the output location used by ``save_to_csv``."""
        self.path = path

    def save_to_csv(self, df: DataFrame, mode: str = "append"):
        """Write ``df`` to ``self.path`` as CSV with a header row.

        The DataFrame is coalesced to a single partition before writing.

        Args:
            df: DataFrame to persist.
            mode: Spark save mode. Defaults to ``"append"`` (the previous
                hard-coded behaviour), which adds to whatever is already under
                the path on every run; pass ``"overwrite"`` to replace the
                existing output instead.
        """
        df.coalesce(1).write.csv(self.path, header=True, mode=mode)


In [None]:
from pyspark.sql import DataFrame
from pyspark.sql.functions import regexp_replace, to_date, date_format

class DataTransformerIfood:
    """Column-level clean-up applied to the extracted follower profiles."""

    @staticmethod
    def transform(df: DataFrame) -> DataFrame:
        """Return ``df`` with ``company`` and ``created_at`` rewritten.

        * ``company``: every ``@`` character is removed.
        * ``created_at``: parsed with the pattern ``yyyy-MM-dd'T'HH:mm:ss'Z'``
          and re-rendered as ``dd/MM/yyyy``.
        """
        company_without_at = regexp_replace("company", "@", "")
        created_as_date = to_date("created_at", "yyyy-MM-dd'T'HH:mm:ss'Z'")
        created_formatted = date_format(created_as_date, "dd/MM/yyyy")

        with_company = df.withColumn("company", company_without_at)
        return with_company.withColumn("created_at", created_formatted)


In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import  col, regexp_extract

class DataTestIfood:
    """Prints the outcome of sanity checks run against the saved CSV output."""

    def __init__(self, spark: SparkSession, path, user, token):
        """Keep the session, the CSV location and the GitHub credentials.

        ``headers`` is stored as well, although no method of this class
        reads it.
        """
        self.spark = spark
        self.path = path
        self.user = user
        self.token = token
        self.headers = {"Authorization": f"token {token}"}

    def read_data(self):
        """Load the CSV at ``self.path`` (header row, multi-line values)."""
        reader = self.spark.read.format("csv")
        reader = reader.options(header=True, multiLine=True)
        return reader.load(self.path)

    def clean_company(self, df):
        """Print whether any ``company`` value still contains an ``@``."""
        rows_with_at = df.filter(col("company").like("%@%")).count()
        if rows_with_at > 0:
            message = "Existe pelo menos um '@' na coluna 'company'."
        else:
            message = "Não existe '@' na coluna 'company'."
        print(message)

    def date_format(self, df):
        """Print whether every ``created_at`` value matches ``dd/mm/yyyy``."""
        pattern = r'^\d{2}/\d{2}/\d{4}$'

        # Rows whose created_at matches the pattern, compared with all rows.
        matching_rows = df.filter(regexp_extract(col("created_at"), pattern, 0) != "").count()
        total_rows = df.count()

        if matching_rows != total_rows:
            print("A coluna 'created_at' não está com todas as datas no formato dd/mm/yyyy.")
        else:
            print("A coluna 'created_at' está com as datas no formato dd/mm/yyyy.")

    def count_followers(self, df):
        """Print whether ``df`` has as many rows as GitHub reports followers.

        The reference count comes from ``DataExtractorIfood.get_count_followers``.
        Always returns ``None``.
        """
        extractor = DataExtractorIfood(spark=self.spark, user=self.user, token=self.token)
        followers_count = extractor.get_count_followers()

        if df.count() != followers_count:
            print("A quantidade de seguidores não está igual")
            return None
        print("A quantidade de seguidores em conformidade a quantidade atual de seguidores no GitHub.")
        return None

    def execute_test(self):
        """Read the saved data once and run the three checks in order."""
        print("Resultado das verificações de teste no Dataframe.")
        df = self.read_data()
        for check in (self.clean_company, self.date_format, self.count_followers):
            check(df=df)
        

In [None]:
import os

# GitHub account whose followers are extracted.
user = 'marciocl'
# The token is read from the environment so it never has to be typed into (and
# saved with) the notebook; it falls back to the previous empty value.
token = os.environ.get('GITHUB_TOKEN', '')
# Directory the CSV output is written to and later read back from.
path = '/app/output/'


In [None]:
# Session-level write settings, applied in this order to the ambient `spark`.
# NOTE(review): presumably meant to keep extra marker/metadata files out of the
# output directory — confirm against the target runtime.
for conf_key, conf_value in (
    ("spark.sql.sources.commitProtocolClass", "org.apache.spark.sql.execution.datasources.SQLHadoopMapReduceCommitProtocol"),
    ("parquet.enable.summary-metadata", "false"),
    ("mapreduce.fileoutputcommitter.marksuccessfuljobs", "false"),
):
    spark.conf.set(conf_key, conf_value)

In [None]:
# Build one object per pipeline stage from the values set in the config cell.
github_credentials = {"user": user, "token": token}

extractor = DataExtractorIfood(spark=spark, **github_credentials)
transformer = DataTransformerIfood()
loader = DataLoaderIfood(path=path)
test = DataTestIfood(spark=spark, path=path, **github_credentials)

In [None]:
# Extract. The result is cached because it is lazy and backed by GitHub API
# calls: without caching, every action on it or on anything derived from it
# (the CSV write below and the display cells that follow) would repeat them.
df_with_github_info = extractor.execute_extract_api().cache()

# Transform: strip '@' from company and reformat created_at.
df_transformed = transformer.transform(df_with_github_info)

# Load: write the transformed rows as CSV under `path`.
loader.save_to_csv(df_transformed)

# Validate what was written by reading it back.
test.execute_test()

Resultado das verificações de teste no Dataframe.
Não existe '@' na coluna 'company'.
A coluna 'created_at' está com as datas no formato dd/mm/yyyy.
A quantidade de seguidores não está igual


In [None]:
# Show the enriched follower data as produced by the extractor, i.e. before
# DataTransformerIfood.transform is applied.
# NOTE(review): `.display()` is not defined in this notebook — presumably
# provided by the notebook runtime; confirm.
df_with_github_info.display()

name,company,blog,email,bio,public_repos,followers,following,created_at
Cícero Viana,,linktr.ee/cicerohen,,Front-end engineer and contributor on @reactjs-ceara,29,287,92,2011-05-30T16:32:55Z
Dapieve,,https://www.linkedin.com/in/luiz-henrique-dapieve-4ab91137/,lhzdapieve@gmail.com,Architect Mobile | Remote Developer Fullstack | PHP | JAVA | Android | Angular2,2,0,0,2013-06-07T11:30:35Z
Davi Montenegro,,,davi16sm@gmail.com,,15,2,16,2013-11-14T00:29:19Z
Guto Macedo,,,,,8,7,6,2015-05-12T11:43:43Z
Walison Filipe,@buserbrasil,https://www.linkedin.com/in/walison-filipe,walisonfilipe@hotmail.com,Full stack developer #Python #Django,116,133,220,2015-09-11T22:10:07Z
David Torres,,,,,36,12,12,2015-11-27T10:43:51Z
Digo Gomes,Cheesecake Labs,https://gomes.dev,,A developer reborn from the ashes of a designer.,27,20,28,2017-02-05T00:51:02Z
Eri JS,Unigrande,https://www.linkedin.com/in/erijsfernandes/,,Front-End Developer React | Node | Vue | Nuxt |,44,8,20,2017-10-31T15:30:50Z
Marcos Lisboa,CompreUp,,,ruby on rails developer,22,26,60,2017-11-27T05:07:18Z
Bruno Luiz da Paciência,iFood,https://www.linkedin.com/in/brunopaciencia/,,,1,2,16,2018-02-07T11:20:19Z


In [None]:
# Show the transformed data: company without '@' and created_at rendered as
# dd/MM/yyyy.
df_transformed.display()

name,company,blog,email,bio,public_repos,followers,following,created_at
Cícero Viana,,linktr.ee/cicerohen,,Front-end engineer and contributor on @reactjs-ceara,29,287,92,30/05/2011
Dapieve,,https://www.linkedin.com/in/luiz-henrique-dapieve-4ab91137/,lhzdapieve@gmail.com,Architect Mobile | Remote Developer Fullstack | PHP | JAVA | Android | Angular2,2,0,0,07/06/2013
Davi Montenegro,,,davi16sm@gmail.com,,15,2,16,14/11/2013
Guto Macedo,,,,,8,7,6,12/05/2015
Walison Filipe,buserbrasil,https://www.linkedin.com/in/walison-filipe,walisonfilipe@hotmail.com,Full stack developer #Python #Django,116,133,220,11/09/2015
David Torres,,,,,36,12,12,27/11/2015
Digo Gomes,Cheesecake Labs,https://gomes.dev,,A developer reborn from the ashes of a designer.,27,20,28,05/02/2017
Eri JS,Unigrande,https://www.linkedin.com/in/erijsfernandes/,,Front-End Developer React | Node | Vue | Nuxt |,44,8,20,31/10/2017
Marcos Lisboa,CompreUp,,,ruby on rails developer,22,26,60,27/11/2017
Bruno Luiz da Paciência,iFood,https://www.linkedin.com/in/brunopaciencia/,,,1,2,16,07/02/2018


## Testes

In [None]:
from pyspark.sql.functions import  col, regexp_extract

In [None]:
# Read back everything saved under `path` (header row, multi-line values).
df_test = (
    spark.read
    .format("csv")
    .option("header", True)
    .option("multiLine", True)
    .load(path)
)

In [None]:
# Check: no saved `company` value should still contain an '@'.
company_has_at = df_test.filter(col("company").like("%@%")).count() > 0
print(
    "Existe pelo menos um '@' na coluna 'company'."
    if company_has_at
    else "Não existe '@' na coluna 'company'."
)

In [None]:
# Check: every saved `created_at` value must look like dd/mm/yyyy.
pattern = r'^\d{2}/\d{2}/\d{4}$'
rows_matching = df_test.filter(regexp_extract(col("created_at"), pattern, 0) != "").count()
rows_total = df_test.count()
if rows_matching != rows_total:
    print("A coluna 'created_at' não está com todas as datas no formato dd/mm/yyyy.")
else:
    print("A coluna 'created_at' está com as datas no formato dd/mm/yyyy.")

In [None]:
# Show the rows read back from the CSV output at `path`.
df_test.display()