In [None]:
import pandas as pd

In [None]:
# Exploratory load of a single raw file; the multi-file loading cell further
# down rebinds `births` once the 2010 files have been listed.
births = pd.read_csv(
    filepath_or_buffer="~/Downloads/ETLSINASC/ETLSINASC.DNRES_TO_2010_t.csv"
)

In [None]:
import os

# Folder holding the raw CSV extracts; "~" is expanded to the user's home.
directory = os.path.expanduser("~/Downloads/ETLSINASC")

# Names (not full paths) of the entries whose file name contains "2010".
files_2010 = []

if not os.path.exists(directory):
    print("The specified directory does not exist.")
else:
    files = os.listdir(directory)
    for file in files:
        # Skip anything that is not tagged with the year 2010 in its name.
        if "2010" not in file:
            continue
        files_2010.append(file)


In [None]:
# Columns kept from every raw file. The order matters: it becomes the column
# order of the loaded frame.
cols = [
    "def_gravidez",
    "def_gestacao",
    "nasc_LATITUDE",
    "nasc_LONGITUDE",
    "nasc_MUNNOMEX",
    "res_NOME_UF",
]

In [None]:
# Preview the two-character code that the [-13:-11] slice extracts from each
# 2010 file name (e.g. "TO" from "ETLSINASC.DNRES_TO_2010_t.csv").
# Iterating over files_2010 avoids relying on the loop variable `file` leaking
# from the directory-listing cell: that name is undefined when the directory
# is missing or empty, and it may refer to a file that is not a 2010 file.
[name[-13:-11] for name in files_2010]

In [None]:
# Load the selected columns from every 2010 file, tag each row with the
# two-character code taken from the file name, and stack everything into one
# frame. The per-file frames are collected in a list and concatenated once;
# concatenating inside the loop re-copies all previously loaded rows on every
# iteration.
frac = 0.1  # optional down-sampling fraction; chain .sample(frac=frac) below for quick runs

frames = []
for file in files_2010:
    births_file = (
        # usecols avoids parsing unused columns; [cols] keeps the column order of `cols`.
        pd.read_csv(f"~/Downloads/ETLSINASC/{file}", usecols=cols)[cols]
        .assign(estado=file[-13:-11])
    )
    frames.append(births_file)

if not frames:
    # Fail here with a clear message instead of letting later cells break on
    # a missing "estado" column.
    raise FileNotFoundError(
        "No 2010 files were found in ~/Downloads/ETLSINASC; nothing to load."
    )

births = pd.concat(frames, ignore_index=True)

In [None]:
# def_gestacao categories that are counted as a premature birth.
prematuros = [
    "Menos de 22 semanas",
    "32 a 36 semanas",
    "28 a 31 semanas",
    "22 a 27 semanas",
]

# Binary target: 1 when the row's def_gestacao is one of the categories above,
# 0 for every other value (including missing ones).
is_premature = births["def_gestacao"].isin(prematuros)
births["prematuro"] = is_premature.astype("int")

In [None]:
# Number of rows with a non-null def_gravidez in each def_gestacao category.
births.groupby("def_gestacao")["def_gravidez"].count()

In [None]:
# def_gestacao was only needed to derive the `prematuro` target; remove it so
# it cannot be used as a feature.
births = births.drop(columns="def_gestacao")

In [None]:
from sklearn.model_selection import train_test_split

# Separate the target from the remaining columns, then hold out 20% of the
# rows; the fixed random_state keeps the split reproducible.
y = births["prematuro"]
X = births.drop(columns="prematuro")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

In [None]:
# Put the target back next to the features in both splits.
# NOTE(review): the modelling cell further down rebuilds X_train / X_test from
# `births`, so these columns do not reach the fitted pipeline.
X_train["prematuro"], X_test["prematuro"] = y_train, y_test

In [None]:
# Encode pregnancy type as a binary flag: 0 for "Única", 1 for anything else.
# The guard makes the cell safe to re-run: once the column is numeric,
# comparing it with "Única" again would turn every row into 1.
# NOTE(review): missing values also compare unequal to "Única" and are
# therefore encoded as 1 — confirm that is intended.
if not pd.api.types.is_numeric_dtype(births["def_gravidez"]):
    births["def_gravidez"] = (births["def_gravidez"] != "Única").astype("int")


In [None]:
# Pairwise correlation of the numeric columns only. `births` still holds text
# columns at this point (e.g. "estado"), and since pandas 2.0 DataFrame.corr()
# raises on them unless numeric_only=True is passed.
births.corr(numeric_only=True)

In [None]:
import plotly.graph_objs as go

# Group the rows by res_NOME_UF and average the premature flag and the birth
# coordinates within each group. Only the needed numeric columns are
# aggregated: calling .mean() on the whole frame fails on the text columns
# (e.g. "estado") in pandas >= 2.0.
map_columns = ["prematuro", "nasc_LATITUDE", "nasc_LONGITUDE"]
grouped_data = (
    births.groupby("res_NOME_UF")[map_columns]
    .mean()
    .reset_index()[map_columns + ["res_NOME_UF"]]
)
# Cap the colour value at 0.2 — presumably so outliers do not dominate the
# colour scale.
grouped_data["prematuro"] = grouped_data["prematuro"].clip(upper=0.2)


# One marker per res_NOME_UF group, placed at the group's mean coordinates and
# coloured by its (capped) mean premature rate.
data = go.Scattergeo(
    lon = grouped_data['nasc_LONGITUDE'],
    lat = grouped_data['nasc_LATITUDE'],
    mode = 'markers',
    marker = dict(
        size = 8,
        opacity = 0.8,
        color = grouped_data['prematuro'],
        colorscale = 'RdBu_r',  # reversed red-blue scale
        colorbar = dict(title = 'Prematuro')
    )
)

layout = go.Layout(
    # The data is aggregated per res_NOME_UF, not per city, so the title says so.
    title = go.layout.Title(text='Premature births by state in Brazil'),
    geo = go.layout.Geo(
        scope = 'south america',  # restrict the map to South America
        projection_type = 'equirectangular',
        showland = True,
        landcolor = 'rgb(217, 217, 217)',
        countrycolor = 'rgb(204, 204, 204)'
    )
)

fig = go.Figure(data=data, layout=layout)
fig.show()

In [None]:
# Remove the location columns so that only def_gravidez, estado and the target
# remain as model inputs.
births = births.drop(
    columns=["nasc_LATITUDE", "nasc_LONGITUDE", "nasc_MUNNOMEX", "res_NOME_UF"]
)

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import OneHotEncoder

# Hold out 20% of the rows for evaluation (reproducible via random_state).
y = births["prematuro"]
X = births.drop(columns="prematuro")
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# One-hot encode "estado"; every other column is passed through unchanged.
preprocessor = ColumnTransformer(
    transformers=[("estado", OneHotEncoder(), ["estado"])],
    remainder="passthrough",
)

# Preprocessing followed by a logistic-regression classifier.
pipeline = Pipeline(
    steps=[
        ("preprocessor", preprocessor),
        ("classifier", LogisticRegression(max_iter=2000)),
    ]
)
pipeline.fit(X_train, y_train)

# Accuracy on the held-out rows.
score = pipeline.score(X_test, y_test)
print(f'Test accuracy: {score:.4f}')

In [None]:
# Share of non-premature rows in the training target, i.e. the accuracy a
# model would get on the training rows by always predicting 0. Useful as a
# baseline for the test accuracy printed by the modelling cell.
1 - y_train.mean()

In [None]:
# predict_proba returns one column per class; column 1 is kept as the
# predicted probability used by the following cells.
class_probabilities = pipeline.predict_proba(X_test)
probs = class_probabilities[:, 1]

In [None]:
import plotly.graph_objs as go
from plotly.subplots import make_subplots

# Histogram of the predicted probabilities on the test set (at most 20 bins).
fig = go.Figure()
fig.add_trace(go.Histogram(x=probs, nbinsx=20))

# Title and axis labels.
fig.update_layout(
    title='Frequency of Each Value by Bin',
    xaxis_title='Value',
    yaxis_title='Frequency',
)

fig.show()

In [None]:
# Calibration-style view: the test rows are binned by predicted probability
# (at most 20 bins) and each bar shows the average of y_test in that bin
# (histfunc='avg'), i.e. the observed premature rate. The labels describe
# that quantity; the previous "Frequency" wording was copied from the plain
# histogram cell and did not match what is plotted.
fig = go.Figure()

fig.add_trace(go.Histogram(
    x=probs,
    y=y_test,
    histfunc='avg',
    nbinsx=20,
    name='Observed premature rate'
))

fig.update_layout(
    title='Observed premature rate by predicted probability bin',
    xaxis_title='Predicted probability of premature birth',
    yaxis_title='Mean of prematuro',
    bargap=0.2,
    bargroupgap=0.1
)

fig.show()

In [None]:
# When the model says the probability is above 30%, which factors are decisive?

In [None]:
import numpy as np

# Positions (in X_test row order) whose predicted probability exceeds 0.30.
# For a 1-D input np.nonzero returns a one-element tuple, exactly like
# np.where called with only a condition.
indexes_above_30 = np.nonzero(probs > 0.30)

# Pick the matching test rows by position.
risk_moms = X_test.iloc[indexes_above_30[0]]

In [None]:
# Non-null counts per def_gravidez value among the rows flagged above 30%.
risk_moms.groupby("def_gravidez").count()
# Author's notes (translated):
# Whenever the pregnancy is not a single one, the predicted chance of a premature birth is above 30%.
# Whenever the pregnancy is single, the chance is below 30%.
# This is the only feature that drives the model's decision. Regional blood-pressure/tobacco features are good, but not enough.
# We would need those features for the mother herself.
# Which protocol should be followed? What actions should be taken?

In [None]:
# Nosso modelo impacta 2% das gravidez, considerando capitais.
# Share of test-set pregnancies the model flags (predicted probability above 30%).
# Author's note (translated): our model affects 2% of pregnancies, considering capitals.
# The numerator is taken from risk_moms rather than a count copied from an
# earlier output, so it stays correct when the data or the split changes.
len(risk_moms) / len(X_test)

In [None]:
# Fetch the two fitted steps of the pipeline.
preprocessor = pipeline.named_steps['preprocessor']
logistic_regression = pipeline.named_steps['classifier']

# Output feature names of the preprocessing step, in the same order as the
# columns the classifier was trained on.
feature_names = preprocessor.get_feature_names_out()

# Coefficients of the logistic regression (one row for a binary target).
weights = logistic_regression.coef_

# One row per feature, a single "Weight" column.
feature_weights = pd.DataFrame({'Weight': weights.ravel()}, index=feature_names)

print("Features and their corresponding weights:")
print(feature_weights)

In [None]:
# Magnitude of each coefficient, used to rank features regardless of sign.
feature_weights["abs_weight"] = feature_weights["Weight"].abs()

In [None]:
# Features ordered from the largest to the smallest coefficient magnitude.
feature_weights.sort_values("abs_weight", ascending=False)

In [None]:
# Attach the true label to the test features so the following cells can
# cross-tabulate it against def_gravidez.
# NOTE(review): this mutates X_test in place; re-running the predict_proba
# cell afterwards would pass the target column to the pipeline — confirm the
# cells are always run strictly in order.
X_test["prematuro"] = y_test
X_test

In [None]:
# Row counts for every (def_gravidez, prematuro) combination in the test set;
# count() reports the non-null values of the remaining columns.
X_test.groupby(["def_gravidez", "prematuro"]).count()

In [None]:
# Nosso modelo tem 13% de recall...
recall = 5406/(5406 + 35495)
recall

In [None]:
# ... e 46% de precisão
precision = 5406 / (5406 + 6330)
precision

In [None]:
# Estimated number of premature births in the whole dataset: the premature
# rate observed in the test set scaled by the total number of rows.
n_premature_births = int(len(births) * X_test["prematuro"].mean())

In [None]:
# Summary of the expected impact, derived from the estimated number of
# premature births and the recall / precision computed above.
print(f"Número de nascimentos prematuros: {n_premature_births}")

# Premature births the model is expected to flag.
detected = int(n_premature_births * recall)
print(f"Número de nascimentos prematuros corretamente detectados: {detected}")

# Flagged births that are not premature: total flagged (detected / precision)
# minus the correctly flagged ones.
false_alarms = int((detected / precision) - detected)
print(f"Número de nascimentos prematuros incorretamente detectados: {false_alarms}")