# Imports

In [1]:
import numpy as np
import pandas as pd
import requests

from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

# Loading data

In [2]:
def list_csv_files(url):
    response = requests.get(url)

    if response.status_code == 200:
        content = response.json()
        csv_files = [file['download_url'] for file in content if file['name'].endswith('.csv')]
        return csv_files
    else:
        print(f'Error accessing URL.\nStatus code: {response.status_code}')

In [3]:
def get_dataframes(user, repo, folder, dtype=None):
    url = f'https://api.github.com/repos/{user}/{repo}/contents/{folder}'
    csv_files = list_csv_files(url)

    df_list = []

    for file_url in csv_files:
        df = pd.read_csv(file_url, dtype=dtype)
        df_list.append(df)

    df_list = pd.concat(df_list, ignore_index=True)

    return df_list

# Pre-processing

## Eliminating irrelevant attributes

In [4]:
def eliminate_columns(df, columns=[]):
    # Drop columns
    df = df.drop(columns=columns)
    return df

## Handling attributes with missing values

In [5]:
def find_rows_with_null_values(df):
    # Indexes of rows with null values
    idxNullRows = pd.isnull(df).any().to_numpy().nonzero()

    # Print lines with missing values
    display(df.iloc[idxNullRows])

    return idxNullRows

# Regression model

## Dataset preparer

In [6]:
def define_dataset(df, categorical_columns, target_column):
    # Define X using df and indicating dummy variables
    X = pd.get_dummies(df, columns=categorical_columns, drop_first=True)

    # Define Y using target variable
    y = X[target_column]

    return X, y

In [7]:
def split_dataset(X, y, test_size=0.3):
    # Split dataset with the selected test_size or 70/30 by default
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=test_size, random_state=13)
    print(f'X_train: {X_train.shape} | y_train: {y_train.shape}')
    print(f'X_test: {X_test.shape} | y_test: {y_test.shape}')

    return X_train, X_test, y_train, y_test

## Define model & predict

In [8]:
def define_model(X_train, y_train):
    # Create LinearRegression Model
    model = LinearRegression()

    # Compile model
    model.fit(X_train, y_train)

    return model

In [9]:
def predict(model, X):
    y_pred = model.predict(X)
    return y_pred

## Results

In [10]:
def show_metrics(y_test, y_pred):
    mae = mean_absolute_error(y_test, y_pred)
    mse = mean_squared_error(y_test, y_pred)
    print(f'Erro Médio Absoluto (MAE): {mae}')
    print('O MAE indica a média da diferença entre o valor real com o predito\n')
    print(f'Erro Quadrático Médio (MSE): {mse}')
    print('O MSE indica a diferença real/previsto, porém acentuando diferenças maiores')

# Execution

In [11]:
# GitHub repository with the data
user, repo, folder = 'GabrielNG13', 'ps-mediamonks-datascience', 'data/transient'

# Data types
dtype = {'ano': str, 'uf': str, 'genero': str, 'mes': str, 'numero': int}

# Loading data
df = get_dataframes(user, repo, folder, dtype)

In [12]:
# I classified all columns as relevant
# No column to be droped
df = eliminate_columns(df)

In [13]:
rows = find_rows_with_null_values(df)
# No record has any null values
# No action necessary

Unnamed: 0,ano,uf,genero,mes,numero


In [14]:
# Set categorical and target columns
categorical_columns = df.select_dtypes(include=['object']).columns.tolist()
target_column = df.select_dtypes(include=['int64']).columns.tolist()

X, y = define_dataset(df, categorical_columns, target_column)

In [15]:
# Proportion test/total
test_size = 0.3
X_train, X_test, y_train, y_test = split_dataset(X, y, test_size)

X_train: (3175, 45) | y_train: (3175, 1)
X_test: (1361, 45) | y_test: (1361, 1)


In [16]:
# Create model
model = define_model(X_train, y_train)

# Results
y_pred = predict(model, X_test)
show_metrics(y_test, y_pred)

Erro Médio Absoluto (MAE): 2.355009038671054e-14
O MAE indica a média da diferença entre o valor real com o predito

Erro Quadrático Médio (MSE): 3.0292910567260295e-27
O MSE indica a diferença real/previsto, porém acentuando diferenças maiores
