# Imports

In [348]:
import numpy as np
import pandas as pd
import requests

from itertools import product
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split

# Constants

In [349]:
# Column name -> dtype mapping handed to pandas when the CSV files are read
const_dtype = {
    'ano': int,
    'uf': str,
    'genero': str,
    'mes': str,
    'numero': int,
}

In [350]:
# Portuguese month name -> month number (1 for 'Janeiro' ... 12 for 'Dezembro')
const_dict_month = {
    month_name: month_number
    for month_number, month_name in enumerate(
        (
            'Janeiro', 'Fevereiro', 'Março', 'Abril',
            'Maio', 'Junho', 'Julho', 'Agosto',
            'Setembro', 'Outubro', 'Novembro', 'Dezembro',
        ),
        start=1,
    )
}

# Loading data

In [351]:
def list_csv_files(url, timeout=30):
    """List the download URLs of the CSV entries returned by ``url``.

    The response body is expected to be a JSON list of entries that each
    carry a ``name`` and a ``download_url`` key.

    Args:
        url: Endpoint that returns the JSON listing.
        timeout: Seconds to wait for the server before the request is
            aborted, so a stalled connection cannot hang the caller forever.

    Returns:
        A list with the ``download_url`` of every entry whose ``name`` ends
        in ``.csv``. When the response status is not 200 the status code is
        printed and an empty list is returned, so the result is always
        iterable.
    """
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print(f'Error accessing URL.\nStatus code: {response.status_code}')
        return []

    return [
        entry['download_url']
        for entry in response.json()
        if entry['name'].endswith('.csv')
    ]

In [352]:
def get_dataframes(user, repo, folder, dtype=None):
    """Load every CSV file of a GitHub repository folder into one DataFrame.

    Args:
        user: Owner of the GitHub repository.
        repo: Repository name.
        folder: Path of the folder inside the repository.
        dtype: Optional dtype specification forwarded to ``pd.read_csv``.

    Returns:
        A single DataFrame with the rows of all CSV files, re-indexed from 0.

    Raises:
        ValueError: If no CSV file could be listed for the folder.
    """
    url = f'https://api.github.com/repos/{user}/{repo}/contents/{folder}'
    csv_files = list_csv_files(url)

    # list_csv_files yields no URLs when the request fails or the folder holds
    # no CSV file; fail with a clear message instead of an opaque error from
    # the loop or from pd.concat.
    if not csv_files:
        raise ValueError(f'No CSV files found at {url}')

    frames = [pd.read_csv(file_url, dtype=dtype) for file_url in csv_files]

    return pd.concat(frames, ignore_index=True)

# Pre-processing

## Eliminating irrelevant attributes

In [353]:
def eliminate_columns(df, columns=None):
    """Return ``df`` without the given columns.

    Args:
        df: Source DataFrame; it is not modified.
        columns: Column labels to drop. ``None`` (the default) drops nothing.

    Returns:
        A new DataFrame without the requested columns.
    """
    # DataFrame.drop rejects columns=None, so map "nothing to drop" to an
    # empty list (this also avoids a shared mutable default argument).
    if columns is None:
        columns = []

    return df.drop(columns=columns)

## Handling attributes with missing values

In [354]:
def find_rows_with_null_values(df):
    """Locate the rows of ``df`` that contain at least one null value.

    Args:
        df: DataFrame to inspect.

    Returns:
        A one-element tuple (as produced by ``numpy.ndarray.nonzero``) whose
        array holds the positional indexes of the rows with a null value.
    """
    # axis=1 reduces across the columns of each row; the default axis would
    # flag columns instead of rows.
    has_null = pd.isnull(df).any(axis=1)

    # Positional indexes of rows with null values
    idxNullRows = has_null.to_numpy().nonzero()

    return idxNullRows

# Regression model

## Dataset preparer

In [367]:
def prepare_dataset(df, start_year=None):
    """Derive the model features from the raw columns.

    Args:
        df: DataFrame with the columns ``genero``, ``mes`` and ``ano``.
        start_year: Year used as the origin of the ``tempo`` column. When
            omitted, the smallest ``ano`` of ``df`` is used. Pass the origin
            of the training data when preparing rows to forecast, so both
            frames share the same timeline.

    Returns:
        A new DataFrame where ``genero`` is one-hot encoded (first category
        dropped) and the columns ``n_mes`` (month number) and ``tempo``
        (months counted from ``start_year``) are added.
    """
    df = pd.get_dummies(df, columns=['genero'], drop_first=True)

    # Month number looked up from the month name
    df['n_mes'] = df['mes'].map(const_dict_month)

    if start_year is None:
        start_year = df['ano'].min()

    # Months elapsed on the timeline that starts in January of start_year
    df['tempo'] = (df['ano'] - start_year) * 12 + df['n_mes']

    return df

In [375]:
def define_dataset(df, resources_columns, target_column=False):
    """Split ``df`` into the feature matrix and the optional target.

    Args:
        df: Prepared DataFrame.
        resources_columns: Labels of the feature columns.
        target_column: Label of the target column; a falsy value (the
            default) means there is no target.

    Returns:
        A ``(X, y)`` pair where ``y`` is ``None`` when no target was given.
    """
    features = df[resources_columns]

    # Without a target (e.g. when forecasting) only the features are returned
    if not target_column:
        return features, None

    return features, df[target_column]

In [357]:
def split_dataset(X, y, test_size=0.3):
    """Split features and target into train and test partitions.

    Args:
        X: Feature matrix.
        y: Target values.
        test_size: Share of the data reserved for testing (0.3 by default).

    Returns:
        The tuple ``(X_train, X_test, y_train, y_test)``.
    """
    # Fixed random_state so the partition is the same on every run
    partitions = train_test_split(X, y, test_size=test_size, random_state=13)
    X_train, X_test, y_train, y_test = partitions

    return X_train, X_test, y_train, y_test

## Define model & predict

In [358]:
def define_model(X_train, y_train):
    """Create a linear regression model and fit it to the training data.

    Args:
        X_train: Training feature matrix.
        y_train: Training target values.

    Returns:
        The fitted ``LinearRegression`` instance.
    """
    # fit() returns the estimator itself, so creation and training chain
    return LinearRegression().fit(X_train, y_train)

In [359]:
def predict(model, X):
    """Return the predictions of ``model`` for the feature matrix ``X``."""
    return model.predict(X)

## Results

In [360]:
def summary_model(uf, X_train, X_test, y_train, y_test, y_pred, test_size=None):
    """Print the data split and the error metrics of one model.

    Args:
        uf: Label of the model, shown in the report header.
        X_train: Training feature matrix.
        X_test: Test feature matrix.
        y_train: Training target values.
        y_test: Test target values.
        y_pred: Predictions for ``X_test``.
        test_size: Test proportion to display. When omitted it is derived
            from the sizes of ``X_train`` and ``X_test`` (rounded to two
            decimals), so the function does not rely on a global variable.
    """
    if test_size is None:
        n_test = len(X_test)
        n_total = len(X_train) + n_test
        test_size = round(n_test / n_total, 2) if n_total else 0.0

    mae = round(mean_absolute_error(y_test, y_pred), 4)
    mse = round(mean_squared_error(y_test, y_pred), 4)

    print(f'========== Model {uf} ==========')
    print(f'\nData Split (proportion {test_size}):')
    print(f'\tX_train: {X_train.shape} | y_train: {y_train.shape}')
    print(f'\tX_test: {X_test.shape} | y_test: {y_test.shape}')
    print('\nModel metrics:')
    print(f'\tErro Médio Absoluto (MAE): {mae}')
    print('\tO MAE indica a média da diferença entre o valor real com o predito\n')
    print(f'\tErro Quadrático Médio (MSE): {mse}')
    print('\tO MSE indica a diferença real/previsto, porém acentuando diferenças maiores\n\n')

## Report utils

In [361]:
def get_previous_year(df):
    """Return the largest value of the ``ano`` column of ``df``."""
    years = df['ano']
    latest_year = years.max()
    return latest_year

In [362]:
def get_largests_ufs_by_year(df, year, n=3):
    """Return the ``n`` UFs with the largest total ``numero`` in ``year``.

    Args:
        df: DataFrame with the columns ``ano``, ``uf`` and ``numero``.
        year: Year whose rows are considered.
        n: How many UFs to return (3 by default).

    Returns:
        A list of UF labels ordered from the largest total to the smallest.
    """
    # Keep only the rows of the requested year
    rows_of_year = df.loc[df['ano'] == year]

    # Total of the target column for each UF
    totals = rows_of_year.groupby('uf')['numero'].sum()

    # The index of the top-N series holds the UF labels, already ranked
    return list(totals.nlargest(n).index)

# Fit model

In [363]:
# Location of the CSV files: GitHub owner, repository and folder
user = 'GabrielNG13'
repo = 'ps-mediamonks-datascience'
folder = 'data/transient'

# Download every CSV file of the folder into a single DataFrame
raw_df = get_dataframes(user, repo, folder, dtype=const_dtype)

In [None]:
# Train one linear regression model per UF and keep them keyed by UF.
dfs_per_uf = raw_df.groupby('uf')
models = {}

# Settings shared by every UF (loop-invariant)
target_column = 'numero'
test_size = 0.3  # Proportion test/total

for uf, df in dfs_per_uf:

    # Drop uf column (constant inside each group)
    df = eliminate_columns(df, columns=['uf'])

    # No record has any null values
    # No action necessary
    rows = find_rows_with_null_values(df)

    dataset = prepare_dataset(df)

    # Feature columns: time attributes plus the one-hot encoded gender columns
    resources_columns = ['ano', 'tempo', 'n_mes'] + [ col for col in dataset.columns if 'genero' in col ]

    X, y = define_dataset(dataset, resources_columns, target_column)

    X_train, X_test, y_train, y_test = split_dataset(X, y, test_size)

    # Create and train the model on the training partition only. The extra
    # model that used to be fitted on the full data here was never used and
    # has been removed.
    model = define_model(X_train, y_train)

    # Results
    y_pred = predict(model, X_test)
    summary_model(uf, X_train, X_test, y_train, y_test, y_pred)

    # Store trained model
    models[uf] = model

# Report

In [370]:
# Most recent year available in the raw data
previous_year = get_previous_year(raw_df)

# The three UFs with the largest totals in that year
top_3_ufs = get_largests_ufs_by_year(raw_df, previous_year, n=3)

In [379]:
# Years to forecast: the two years after the last observed one
# (kept as strings here; the 'ano' column is cast to int further down)
years = [str(int(previous_year) + offset) for offset in (1, 2)]

# Distinct UFs, genders and months observed in the raw data
ufs = list(raw_df['uf'].unique())
genders = list(raw_df['genero'].unique())
months = list(raw_df['mes'].unique())

# Every (year, uf, gender, month) combination becomes one row to forecast
columns = ['ano', 'uf', 'genero', 'mes']
combinations = list(product(years, ufs, genders, months))

# Build the general DataFrame and restore the integer year
df_predict = pd.DataFrame(combinations, columns=columns)
df_predict['ano'] = df_predict['ano'].astype(int)

In [381]:
# Forecast the top UFs with the model trained for each of them.
dfs_to_predict = df_predict.loc[df_predict['uf'].isin(top_3_ufs)].groupby('uf')
predicts = {}

for uf, df in dfs_to_predict:

    # Drop uf column
    df = eliminate_columns(df, columns=['uf'])

    # No record has any null values
    # No action necessary
    rows = find_rows_with_null_values(df)

    dataset = prepare_dataset(df)

    # prepare_dataset counts 'tempo' from the earliest year of the frame it
    # receives. For the forecast rows that is the first forecast year, while
    # the model of this UF was trained with 'tempo' counted from the first
    # observed year of the UF. Recompute it on the training timeline so the
    # feature means the same thing at training and at prediction time.
    start_year = raw_df.loc[raw_df['uf'] == uf, 'ano'].min()
    dataset['tempo'] = (dataset['ano'] - start_year) * 12 + dataset['n_mes']

    # Feature columns: time attributes plus the one-hot encoded gender columns
    resources_columns = ['ano', 'tempo', 'n_mes'] + [ col for col in dataset.columns if 'genero' in col ]

    X_predict, _ = define_dataset(dataset, resources_columns)
    y_pred = predict(models[uf], X_predict)

    predicts[uf] = y_pred