# Imports

In [1]:
import pandas as pd
import requests

# Loading data

In [2]:
def list_csv_files(url, timeout=30):
    """Return the download URLs of the CSV files listed at a contents URL.

    Parameters
    ----------
    url : str
        Endpoint queried with ``requests.get``. Its JSON body is read as a
        list of entries (or a single entry) exposing ``name`` and
        ``download_url`` keys.
    timeout : float or tuple, optional
        Seconds to wait for the server, forwarded to ``requests.get`` so a
        stalled connection cannot block the notebook forever. Defaults to 30.

    Returns
    -------
    list
        The ``download_url`` of every entry whose ``name`` ends with
        ``.csv``. Empty when the response status is not 200; the status
        code is printed in that case.
    """
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print(f'Error accessing URL.\nStatus code: {response.status_code}')
        # Always hand back a list so callers can iterate without a None check.
        return []

    content = response.json()
    # The GitHub contents API answers with a single object (not a list) when
    # the path is a file; normalise so the filter below handles both shapes.
    if isinstance(content, dict):
        content = [content]

    return [file['download_url'] for file in content if file['name'].endswith('.csv')]

def get_dataframes(user, repo, folder, dtype=None):
    """Load every CSV found in a GitHub repository folder into one DataFrame.

    Parameters
    ----------
    user, repo, folder : str
        Pieces interpolated into the GitHub contents API URL
        ``https://api.github.com/repos/{user}/{repo}/contents/{folder}``.
    dtype : dict or type, optional
        Forwarded unchanged to ``pandas.read_csv`` for each file.

    Returns
    -------
    pandas.DataFrame
        The files concatenated in listing order, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        If the listing comes back missing or empty, i.e. there is nothing
        to load.
    """
    url = f'https://api.github.com/repos/{user}/{repo}/contents/{folder}'
    csv_files = list_csv_files(url)

    # Fail with an explicit message instead of an unrelated TypeError (None
    # listing) or pandas' generic "No objects to concatenate" (empty listing).
    if not csv_files:
        raise ValueError(f'No CSV files found at {url}')

    frames = [pd.read_csv(file_url, dtype=dtype) for file_url in csv_files]

    return pd.concat(frames, ignore_index=True)

# Pre-processing

## Eliminating irrelevant attributes

In [3]:
def eliminate_columns(df, columns=None):
    """Return ``df`` without the given columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to drop columns from; it is not modified.
    columns : label or list of labels, optional
        Columns to remove. When omitted, nothing is dropped.

    Returns
    -------
    pandas.DataFrame
        A new frame produced by ``df.drop(columns=columns)``.
    """
    # None stands in for "nothing to drop" so the signature does not carry a
    # mutable list shared between calls.
    if columns is None:
        columns = []
    return df.drop(columns=columns)

## Handling attributes with missing values

In [4]:
def find_rows_with_null_values(df):
    """Display the rows of ``df`` that contain a null and return their positions.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    tuple of numpy.ndarray
        One-element tuple (as produced by ``ndarray.nonzero``) holding the
        positional indexes of the rows with at least one null value.
    """
    # axis=1 reduces across columns, giving one flag per ROW; the default
    # (axis=0) would flag columns and their positions would then be misused
    # as row positions.
    row_has_null = df.isnull().any(axis=1).to_numpy()
    idxNullRows = row_has_null.nonzero()

    # Show the rows with missing values
    display(df.iloc[idxNullRows[0]])

    return idxNullRows

# Execution

In [5]:
# Location of the CSV files on GitHub
user = 'GabrielNG13'
repo = 'ps-mediamonks-datascience'
folder = 'data/transient'

# Column -> type mapping handed to pandas when each CSV is parsed
dtype = dict(ano=str, uf=str, genero=str, mes=str, numero=int)

# Read every CSV in the folder and stack them into a single DataFrame
df = get_dataframes(user, repo, folder, dtype=dtype)

In [6]:
# Every column was judged relevant, so there is nothing to drop and the call
# below keeps the full set of columns.
# NOTE(review): the output at the end of the notebook lists an 'Unnamed: 0'
# column, which looks like a saved index rather than an attribute — confirm
# whether it should be dropped here.
df = eliminate_columns(df, columns=[])

In [7]:
# Show the records that contain null values and keep their positions
rows = find_rows_with_null_values(df=df)
# Observed result: no record has any null value, so no action is needed

Unnamed: 0,ano,uf,genero,mes,numero
