In [13]:
import os
import zipfile
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

In [14]:
path_dataset_file = "cvd/data.csv"

df = pd.read_csv(path_dataset_file)

print(df.head())
print(df.info())

   id             location country  gender   age     sym_on   hosp_vis  \
0   1  Shenzhen, Guangdong   China    male  66.0   1/3/2020  1/11/2020   
1   2             Shanghai   China  female  56.0  1/15/2020  1/15/2020   
2   3             Zhejiang   China    male  46.0   1/4/2020  1/17/2020   
3   4              Tianjin   China  female  60.0        NaN        NaN   
4   5              Tianjin   China    male  58.0        NaN        NaN   

   vis_wuhan  from_wuhan  death  recov symptom1 symptom2 symptom3 symptom4  \
0          1         0.0      0      0      NaN      NaN      NaN      NaN   
1          0         1.0      0      0      NaN      NaN      NaN      NaN   
2          0         1.0      0      0      NaN      NaN      NaN      NaN   
3          1         0.0      0      0      NaN      NaN      NaN      NaN   
4          0         0.0      0      0      NaN      NaN      NaN      NaN   

  symptom5 symptom6  
0      NaN      NaN  
1      NaN      NaN  
2      NaN      NaN 

In [15]:
for col in ["sym_on", "hosp_vis"]:
    # Converte para datetime; ignora erros
    df[col] = pd.to_datetime(df[col], errors="coerce")
    # Se houver algum valor válido, calcula dias desde o primeiro; senão define 0
    if df[col].notna().any():
        df[col] = (df[col] - df[col].min()).dt.days
    else:
        df[col] = 0

# Converter gênero para 0/1 (preencher faltantes com 0 ou outro rótulo se preferir)
df["gender"] = df["gender"].map({"male": 0, "female": 1})
df["gender"] = df["gender"].fillna(0).astype(int)

# País pode ser descartado (sempre China) — mantém somente se existir
df = df.drop(columns=[c for c in ["country"] if c in df.columns])

# location → opcional (se quiser usar: one-hot); aqui descartado se existir
df = df.drop(columns=[c for c in ["location"] if c in df.columns])

# Converter sintomas: primeiro tornar numérico (coerce), preencher NaN com 0 e converter para int
symptoms = [f"symptom{i}" for i in range(1, 7)]
for s in symptoms:
    if s in df.columns:
        df[s] = pd.to_numeric(df[s].replace("NA", np.nan), errors='coerce').fillna(0).astype(int)

# Definir variável alvo: morte — garantir numérico sem NaN
y = pd.to_numeric(df.get("death", 0), errors='coerce').fillna(0).astype(int)

# Features: remover colunas de rótulo se existirem
cols_to_drop = [c for c in ["death", "recov"] if c in df.columns]
X = df.drop(columns=cols_to_drop)

# Garantir que todas as features sejam numéricas (coerce) e preencher NaNs com 0 antes de obter numpy array
X = X.apply(pd.to_numeric, errors='coerce').fillna(0)

X = X.to_numpy()
y = y.to_numpy()

print("Formas:", X.shape, y.shape)

Formas: (1085, 13) (1085,)


In [16]:
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [17]:
class Perceptron:
    def __init__(self, learning_rate=0.01, n_iters=1000):
        self.lr = learning_rate
        self.n_iters = n_iters
        self.weights = None
        self.bias = None

    def activation_func(self, x):
        return np.where(x >= 0, 1, 0)

    def fit(self, X, y):
        n_samples, n_features = X.shape
        self.weights = np.zeros(n_features)
        self.bias = 0
        
        for _ in range(self.n_iters):
            for idx, x_i in enumerate(X):
                linear_output = np.dot(x_i, self.weights) + self.bias
                y_predicted = self.activation_func(linear_output)
                
                update = self.lr * (y[idx] - y_predicted)
                self.weights += update * x_i
                self.bias += update

    def predict(self, X):
        linear_output = np.dot(X, self.weights) + self.bias
        return self.activation_func(linear_output)

In [18]:
p = Perceptron(learning_rate=0.01, n_iters=1000)
p.fit(X_train_scaled, y_train)
predictions = p.predict(X_test_scaled)

print("Accuracy:", accuracy_score(y_test, predictions))

Accuracy: 0.9400921658986175
