# Bibliotecas

In [None]:
import math

import pandas as pd
import numpy as np
from numpy.linalg import inv
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline

# Rascunho

In [None]:
# Load the insurance train and test CSVs from the course repository (';'-separated fields).
# bmi and charges are parsed from decimal-comma text in a later cell.
data_train = pd.read_csv('https://raw.githubusercontent.com/Cayan-Portela/ceub/main/dados/insurance_treino.csv', sep = ';')
data_test = pd.read_csv('https://raw.githubusercontent.com/Cayan-Portela/ceub/main/dados/insurance_teste.csv', sep = ';')

In [None]:
# Column names, non-null counts and dtypes of the test frame.
data_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 338 entries, 0 to 337
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   age          338 non-null    int64  
 1   sex          338 non-null    int64  
 2   bmi          338 non-null    float64
 3   children     338 non-null    int64  
 4   smoker       338 non-null    int64  
 5   region       338 non-null    object 
 6   charges      338 non-null    float64
 7   log_charges  338 non-null    float64
dtypes: float64(3), int64(4), object(1)
memory usage: 21.2+ KB


In [None]:
# First rows of the training frame.
data_train.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,34,female,2356,0,no,northeast,49923764
1,45,female,331,0,no,southwest,7345084
2,23,male,327,3,no,southwest,359148
3,38,female,1995,2,no,northeast,71339025
4,32,female,298,2,no,southwest,5152134


In [None]:
# First rows of the test frame.
data_test.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,18,male,3377,1,no,southeast,17255523
1,31,female,2574,0,no,southeast,37566216
2,37,female,2774,3,no,northwest,72815056
3,37,male,2983,2,no,northeast,64064107
4,60,female,2584,0,no,northwest,2892313692


# Base de Teste

In [None]:
# Encode the binary categoricals and parse the decimal-comma numerics.
# Every conversion is guarded by a dtype check so that re-running this cell is a
# no-op: pushing an already-encoded 0/1 column through the dict again would turn
# every value into NaN, and `.str` is not available on an already-numeric column.
if not pd.api.types.is_numeric_dtype(data_test['sex']):
    data_test['sex'] = data_test['sex'].map({'male': 0, 'female': 1})
if not pd.api.types.is_numeric_dtype(data_test['smoker']):
    data_test['smoker'] = data_test['smoker'].map({'no': 0, 'yes': 1})

for numeric_col in ('bmi', 'charges'):
    if not pd.api.types.is_numeric_dtype(data_test[numeric_col]):
        data_test[numeric_col] = pd.to_numeric(
            data_test[numeric_col].str.replace(',', '.'), errors='coerce')

# The regression target is log(1 + charges).
data_test['log_charges'] = np.log1p(data_test['charges'])

features = ['age', 'bmi', 'children', 'sex', 'smoker']
X = data_test[features]
y = data_test['log_charges']

# NOTE(review): the train/validation split is drawn from data_test itself;
# data_train is not used in this cell. Confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the numeric predictors and one-hot encode the binary ones.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['age', 'bmi', 'children']),
        ('cat', OneHotEncoder(), ['sex', 'smoker'])
    ])

model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', LinearRegression())])

model.fit(X_train, y_train)

# Pipeline.score delegates to the regressor, i.e. R^2 on the held-out 20%.
score = model.score(X_test, y_test)
print(f'R-squared score: {score:.4f}')

R-squared score: 0.6738


# Questão 1


## A

Y (n x 1): É o vetor de respostas, onde cada linha representa uma observação e contém o valor da variável que estamos tentando prever.

X (n x p): É a matriz de regressores, onde cada linha representa uma observação e cada coluna corresponde a uma variável independente.

B (p x 1): É o vetor de coeficientes, contendo os pesos que são atribuídos a cada variável independente.

ϵ (n x 1): É o vetor de erro, representando as diferenças entre os valores previstos e os valores reais das variáveis dependentes.

Basicamente, a equação descreve como as variáveis dependentes (Y) são previstas com base nas variáveis independentes (X) usando coeficientes (B), enquanto o vetor de erro (ϵ) captura as discrepâncias entre as previsões e os valores reais.

# Questão 2

## A

O objetivo é encontrar os valores dos coeficientes beta que melhor ajustam a relação entre a variável dependente e as variáveis independentes (no caso de mínimos quadrados, os que minimizam a soma dos quadrados dos erros).

## B



i. Solução Analítica

In [None]:
# Closed-form least squares on the training split, evaluated on the test split.
# A leading column of ones provides the intercept.
X_train_analytical = np.c_[np.ones(X_train.shape[0]), X_train]

X_test_analytical = np.c_[np.ones(X_test.shape[0]), X_test]

# Solve the normal equations (X'X) beta = X'y directly instead of forming the
# explicit inverse of X'X: same solution, better conditioned and easier to read.
beta_analytical = np.linalg.solve(
    X_train_analytical.T @ X_train_analytical,
    X_train_analytical.T @ np.asarray(y_train),
)
predictions_analytical = X_test_analytical @ beta_analytical

print("Previsões usando a solução analítica:")
print(predictions_analytical[:5])

Previsões usando a solução analítica:
[ 9.40884653  8.15913892 10.28223396  9.00830404  7.99279324]


ii. Gradiente Descendente

In [None]:
# Batch gradient descent on the mean squared error of the linear model.
# The previous version of this cell only called model.predict (scikit-learn),
# so no gradient descent was actually performed.
gd_learning_rate = 0.05
gd_iterations = 5000

X_train_values = X_train.to_numpy(dtype=float)
X_test_values = X_test.to_numpy(dtype=float)
y_train_values = y_train.to_numpy(dtype=float)

# Standardise with training statistics so one learning rate suits every column;
# this is a reparametrisation and does not change the fitted predictions.
feature_mean = X_train_values.mean(axis=0)
feature_std = X_train_values.std(axis=0)
feature_std[feature_std == 0] = 1.0  # keep constant columns finite

X_train_gradient = np.c_[np.ones(X_train_values.shape[0]),
                         (X_train_values - feature_mean) / feature_std]
X_test_gradient = np.c_[np.ones(X_test_values.shape[0]),
                        (X_test_values - feature_mean) / feature_std]

beta_gradient = np.zeros(X_train_gradient.shape[1])
for _ in range(gd_iterations):
    residuals = X_train_gradient @ beta_gradient - y_train_values
    # Gradient of mean((X beta - y)^2) with respect to beta.
    gradient = (2.0 / y_train_values.shape[0]) * (X_train_gradient.T @ residuals)
    beta_gradient -= gd_learning_rate * gradient

predictions_gradient = X_test_gradient @ beta_gradient

print("Previsões usando gradiente descendente:")
print(predictions_gradient[:5].round(1))

Previsões usando o modelo LinearRegression do scikit-learn:
[ 9.4  8.2 10.3  9.   8. ]


iii. LinearRegression() do scikit-learn

In [None]:
# Predictions of the fitted scikit-learn pipeline on the held-out rows.
predictions_scikit = model.predict(X_test)

print("Previsões usando LinearRegression() do scikit-learn:")
print(predictions_scikit[:5])

Previsões usando LinearRegression() do scikit-learn:
[ 9.40884653  8.15913892 10.28223396  9.00830404  7.99279324]


## C

In [None]:
# Closed-form least-squares coefficients on the raw (unscaled) training columns;
# the first entry is the intercept because of the leading column of ones.
X_train_analytical = np.c_[np.ones(X_train.shape[0]), X_train]

# Solve (X'X) beta = X'y rather than inverting X'X explicitly.
coefficients_analytical = np.linalg.solve(
    X_train_analytical.T @ X_train_analytical,
    X_train_analytical.T @ np.asarray(y_train),
)
print("Coeficientes usando solução analítica:")
print(coefficients_analytical)

Coeficientes usando solução analítica:
[7.03189891 0.03510812 0.0093223  0.07684105 0.03874494 1.56337708]


In [None]:
# Intercept followed by the slope coefficients of the pipeline's fitted regressor.
# NOTE(review): these values are read from the scikit-learn LinearRegression step,
# not from a gradient-descent run, although the variable name and the printed label
# say "gradiente descendente". They are also on the pipeline's scaled / one-hot
# feature space, so they are not directly comparable with the analytical
# coefficients computed on the raw columns.
coefficients_gradient = np.concatenate(([model.named_steps['regressor'].intercept_], model.named_steps['regressor'].coef_))
print("Coeficientes usando gradiente descendente:")
print(coefficients_gradient)


Coeficientes usando gradiente descendente:
[ 9.57728363  0.48622488  0.05640972  0.09408745 -0.01937247  0.01937247
 -0.78168854  0.78168854]


In [None]:
# Pull the fitted regressor out of the pipeline once, then read both attributes from it.
fitted_regressor = model.named_steps['regressor']
intercept_scikit = fitted_regressor.intercept_
coefficients_scikit = fitted_regressor.coef_

print("Intercepto usando LinearRegression():", intercept_scikit)
print("Coeficientes das variáveis independentes usando LinearRegression():", coefficients_scikit)


Intercepto usando LinearRegression(): 9.577283634597368
Coeficientes das variáveis independentes usando LinearRegression(): [ 0.48622488  0.05640972  0.09408745 -0.01937247  0.01937247 -0.78168854
  0.78168854]


## D

In [None]:
# Same preparation and fit as the first modelling cell, written so it is safe to
# run after it. sex and smoker used to be mapped unconditionally here; because an
# earlier cell has already encoded them as 0/1, no value matched the dict and both
# columns became entirely NaN, discarding the two predictors (consistent with the
# R^2 dropping between the two cells). Every conversion is now dtype-guarded.
if not pd.api.types.is_numeric_dtype(data_test['sex']):
    data_test['sex'] = data_test['sex'].map({'male': 0, 'female': 1})
if not pd.api.types.is_numeric_dtype(data_test['smoker']):
    data_test['smoker'] = data_test['smoker'].map({'no': 0, 'yes': 1})

if not pd.api.types.is_numeric_dtype(data_test['bmi']):
    data_test['bmi'] = pd.to_numeric(data_test['bmi'].str.replace(',', '.'), errors='coerce')

if not pd.api.types.is_numeric_dtype(data_test['charges']):
    data_test['charges'] = pd.to_numeric(data_test['charges'].str.replace(',', '.'), errors='coerce')

# The regression target is log(1 + charges).
data_test['log_charges'] = np.log1p(data_test['charges'])

features = ['age', 'bmi', 'children', 'sex', 'smoker']
X = data_test[features]
y = data_test['log_charges']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardise the numeric predictors and one-hot encode the binary ones.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), ['age', 'bmi', 'children']),
        ('cat', OneHotEncoder(), ['sex', 'smoker'])
    ])

model = Pipeline(steps=[('preprocessor', preprocessor),
                        ('regressor', LinearRegression())])

model.fit(X_train, y_train)

# R^2 on the held-out 20%.
score = model.score(X_test, y_test)
print(f'R-squared score: {score:.4f}')

R-squared score: 0.3643


In [None]:
# Mean squared error of the pipeline on the held-out rows; the target is
# log_charges, so the error is measured on the log scale.
predictions_test = model.predict(X_test)

mse_test = mean_squared_error(y_test, predictions_test)

print(f"Erro Quadrático Médio na base de teste: {mse_test:.4f}")

Erro Quadrático Médio na base de teste: 0.6717


# Questão 3

In [None]:
"""
2x^2+5 = e^x
derivada da função = 4x-e^x

3.5 - ((2))
"""

'\n2x^2+5 = e^x\nderivada da função = 4x-e^x\n\n3.5 - ((2))\n'

In [None]:
def f(x):
    """Return 2x^2 + 5 - e^x, which is zero where 2x^2 + 5 = e^x."""
    polynomial_side = 2*x**2 + 5
    exponential_side = math.exp(x)
    return polynomial_side - exponential_side

def df(x):
    """Return the derivative of f at x: 4x - e^x."""
    linear_term = 4*x
    return linear_term - math.exp(x)

def newton_raphson(initial_guess, tolerance, max_iterations, func=None, dfunc=None):
    """Find a root with the Newton-Raphson iteration x <- x - func(x) / dfunc(x).

    Args:
        initial_guess: starting value of x.
        tolerance: stop once two consecutive iterates differ by less than this
            absolute amount.
        max_iterations: maximum number of updates to attempt.
        func: function whose root is sought; defaults to the module-level ``f``.
        dfunc: derivative of ``func``; defaults to the module-level ``df``.

    Returns:
        The first iterate whose step is smaller than ``tolerance``, or ``None``
        if that never happens within ``max_iterations`` updates or if the
        derivative is exactly zero at an iterate.
    """
    if func is None:
        func = f
    if dfunc is None:
        dfunc = df

    x = initial_guess
    for _ in range(max_iterations):
        slope = dfunc(x)
        if slope == 0:
            # Flat tangent: the Newton step is undefined. Report non-convergence
            # through the existing None contract instead of ZeroDivisionError.
            return None
        x_next = x - func(x) / slope
        if abs(x_next - x) < tolerance:
            return x_next
        x = x_next
    return None

# Run Newton-Raphson on f from x = 3.5 and report the root it reaches.
# NOTE(review): 1e-16 is smaller than the spacing of float64 values near 3.3
# (about 4.4e-16), so the stopping test can only pass when two consecutive
# iterates are bit-identical. Confirm whether a looser tolerance is acceptable.
initial_guess = 3.5
tolerance = 1e-16  # 16-decimal-place precision
max_iterations = 1000

result = newton_raphson(initial_guess, tolerance, max_iterations)

# newton_raphson returns None when no step fell below the tolerance.
if result is not None:
    print(f"A raiz é aproximadamente {result:.16f}")
else:
    print("O método de Newton-Raphson não convergiu.")

A raiz é aproximadamente 3.2756010888473224


# Questão 4

In [None]:
def updatex(x):
    """Return 2 - x^2, the amount subtracted from x at each update step."""
    squared = x**2
    return 2 - squared

In [None]:
x_ = 16
count = 1

# Safety limits. With x_ = 16 the first update gives 16 - (2 - 256) = 270, and
# for any x_ > 3 the update x_ - (2 - x_**2) is larger than x_, so the iterate
# only moves further from [-3, 3]. Without these limits the loop never ends.
MAX_UPDATES = 1000
DIVERGENCE_LIMIT = 1e12

while (x_ < -3 or x_ > 3) and count <= MAX_UPDATES and abs(x_) < DIVERGENCE_LIMIT:
    x_ -= updatex(x_)
    count += 1

# NOTE(review): the update rule or the starting point probably needs revisiting;
# this guard only makes the failure visible instead of hanging the kernel.
if x_ < -3 or x_ > 3:
    print("Aviso: a iteração divergiu ou atingiu o limite de passos antes de entrar em [-3, 3].")

print(round(x_, 8))
print(count)

# Questão 5

In [None]:
def calcula_p(X,B):
    """Logistic probabilities 1 / (1 + exp(-X @ B)) for each row of X.

    Args:
        X: design matrix, one row per observation.
        B: coefficient vector with one entry per column of X.

    Returns:
        NumPy array with the probability for each row.
    """
    z = np.asarray(X @ B, dtype=float)
    # exp(-|z|) lies in (0, 1], so it cannot overflow. The direct form
    # exp(z) / (1 + exp(z)) evaluates to inf / inf = NaN for large positive z.
    decay = np.exp(-np.abs(z))
    return np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

def matriz_x(coluna, dados):
    """Build a design matrix: a column of ones followed by the selected column(s).

    Args:
        coluna: column label, or list of labels, to take from ``dados``.
        dados: table indexable by ``coluna`` (a pandas DataFrame in this notebook).

    Returns:
        2-D NumPy array with one row per row of ``dados``.
    """
    selected = np.asarray(dados[coluna])
    intercept = np.ones(len(dados))
    # column_stack turns 1-D inputs into columns, as np.c_ does.
    return np.column_stack((intercept, selected))

def beta_update(X, W, y, p):
    """Return the increment (X' W X)^-1 X' (y - p) for one coefficient update.

    Args:
        X: design matrix (n rows, k columns).
        W: weights, either an n x n matrix or a 1-D array of its n diagonal
            entries (which avoids allocating the dense n x n matrix).
        y: observed responses, length n.
        p: current fitted probabilities, length n.

    Returns:
        1-D NumPy array of length k to be added to the current coefficients.
    """
    weights = np.asarray(W)
    # With 1-D weights, broadcasting scales column j of X' by weight j,
    # which equals X' @ diag(weights).
    Xt_W = X.T * weights if weights.ndim == 1 else X.T @ weights
    residual = np.asarray(y) - np.asarray(p)
    # Solve the linear system rather than inverting X' W X; this also removes
    # the dependency on a bare `inv` name that the notebook never imported.
    return np.linalg.solve(Xt_W @ X, X.T @ residual)

def beta_inicial(X):
    """Return a zero coefficient vector with one entry per column of X."""
    n_coefficients = X.shape[1]
    return np.zeros(n_coefficients)

In [None]:
# Training data for the churn model, read from the course repository.
dados = pd.read_csv("https://raw.githubusercontent.com/Cayan-Portela/ceub/main/dados/bank_customer_treino.csv")

In [None]:
# Encode gender as 1 for "Male" and 0 otherwise. Guarded so that re-running the
# cell is a no-op: once the column holds 0/1, comparing it with "Male" again is
# False everywhere and would silently reset every row to 0.
if not pd.api.types.is_numeric_dtype(dados['gender']):
    dados['gender'] = np.where(dados.gender == "Male", 1, 0)

In [None]:
# Predictor columns and response column used by the logistic model.
col_x = ['credit_score', 'gender', 'age', 'credit_card']
col_y = 'churn'

In [None]:
# Build the design matrix and response, start from all-zero coefficients, and
# print the probabilities and weight matrix they imply.
X_mat = matriz_x(col_x, dados)
y = dados[col_y]
betas = beta_inicial(X_mat)
p = calcula_p(X_mat, betas)
w = np.diag(p)  # dense square matrix with p on the diagonal, one row per observation
print("X_mat: \n", X_mat)
print("y: \n", y)
print("betas: \n", betas)
print("p: \n", p)
print("w: \n", w)

X_mat: 
 [[  1. 564.   1.  26.   0.]
 [  1. 688.   1.  45.   0.]
 [  1. 784.   0.  42.   1.]
 ...
 [  1. 461.   0.  40.   1.]
 [  1. 765.   1.  36.   1.]
 [  1. 534.   0.  33.   0.]]
y: 
 0       0
1       0
2       0
3       1
4       1
       ..
5995    0
5996    1
5997    0
5998    0
5999    0
Name: churn, Length: 6000, dtype: int64
betas: 
 [0. 0. 0. 0. 0.]
p: 
 [0.5 0.5 0.5 ... 0.5 0.5 0.5]
w: 
 [[0.5 0.  0.  ... 0.  0.  0. ]
 [0.  0.5 0.  ... 0.  0.  0. ]
 [0.  0.  0.5 ... 0.  0.  0. ]
 ...
 [0.  0.  0.  ... 0.5 0.  0. ]
 [0.  0.  0.  ... 0.  0.5 0. ]
 [0.  0.  0.  ... 0.  0.  0.5]]


In [None]:
# Increment produced by a single update starting from the zero coefficients.
beta_update(X_mat, w, y, p)

array([-1.23914770e+00, -1.94814661e-04, -1.47000394e-01,  2.22232554e-02,
       -1.21993081e-02])

In [None]:
# Newton-Raphson (IRLS) fit of the logistic regression, starting from zeros.
N_ITERATIONS = 10
STEP_TOLERANCE = 1e-10

betas = beta_inicial(X_mat)

for i in range(N_ITERATIONS):
    p = calcula_p(X_mat, betas)
    # The Hessian of the logistic log-likelihood is X' diag(p (1 - p)) X, so the
    # Newton weights are p (1 - p) rather than p.
    W = np.diag(p * (1 - p))
    beta_k1 = beta_update(X_mat, W, y, p)
    betas = betas + beta_k1
    # Stop early once the update no longer changes the coefficients.
    if np.max(np.abs(beta_k1)) < STEP_TOLERANCE:
        break

In [None]:
# Reference fit with scikit-learn on the same predictors and response, using the
# estimator's default settings.
sk_logistica = LogisticRegression()
sk_logistica.fit(dados[col_x], y)

In [None]:
# Hand-fitted coefficients (intercept first) next to scikit-learn's intercept_ and coef_.
print(f'Nossos Betas: {np.round(betas,3)}')
print(f'Sklearn Betas: {sk_logistica.intercept_, sk_logistica.coef_}')

Nossos Betas: [-3.256e+00 -1.000e-03 -4.890e-01  6.400e-02 -4.900e-02]
Sklearn Betas: (array([-3.27061466]), array([[-0.00063749, -0.49520859,  0.0641967 , -0.05292901]]))


In [None]:
# Load the test customers and encode gender as 1 for "Male", 0 otherwise.
# NOTE(review): the file name is spelled "bank_custoter_teste.csv" while the
# training file is "bank_customer_treino.csv" — confirm this matches the repository.
dados_teste = pd.read_csv("https://raw.githubusercontent.com/Cayan-Portela/ceub/main/dados/bank_custoter_teste.csv")
dados_teste['gender'] = np.where(dados_teste.gender == "Male", 1, 0)

In [None]:
# Same predictor and response columns as used for training (re-declared here).
col_x = ['credit_score', 'gender', 'age', 'credit_card']
col_y = 'churn'

In [None]:
# Score the test customers with the coefficients fitted above.
# X_mat and y are rebound to the test data from this point on.
X_mat = matriz_x(col_x, dados_teste)
y = dados_teste[col_y]
p_teste = calcula_p(X_mat, betas)
print("betas: \n", betas)
print("Probabilidade de evasão: \n", p_teste)

betas: 
 [-3.25573132e+00 -6.58026852e-04 -4.89083616e-01  6.40302961e-02
 -4.90911654e-02]
Probabilidade de evasão: 
 [0.26446899 0.26297371 0.15507199 ... 0.13097248 0.19510378 0.16622374]


In [None]:
# Ten largest predicted probabilities, in descending order.
np.sort(p_teste)[::-1][:10]

array([0.8646357 , 0.83895565, 0.8365764 , 0.83477935, 0.82261039,
       0.80036349, 0.7937318 , 0.78956555, 0.78290534, 0.77412462])

In [None]:
# Confusion table of predicted class against observed churn.
pd.crosstab(
    np.where(p_teste > 0.5, 1, 0),    # rows: predicted class at a 0.5 threshold
    dados_teste['churn']              # columns: observed churn
)

churn,0,1
row_0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,3089,753
1,116,42


In [None]:
# Compute the metrics from the predictions instead of copying counts by hand from
# the crosstab output, so they stay correct if the data or the model changes.
predicted_class = np.where(p_teste > 0.5, 1, 0)
observed_class = dados_teste['churn'].to_numpy()

true_pos = int(np.sum((predicted_class == 1) & (observed_class == 1)))
true_neg = int(np.sum((predicted_class == 0) & (observed_class == 0)))
false_pos = int(np.sum((predicted_class == 1) & (observed_class == 0)))
false_neg = int(np.sum((predicted_class == 0) & (observed_class == 1)))

# Guard the ratios whose denominator is zero when a class is never observed or predicted.
recall = true_pos / (true_pos + false_neg) if (true_pos + false_neg) else float('nan')
precision = true_pos / (true_pos + false_pos) if (true_pos + false_pos) else float('nan')

print("Acurácia = ", (true_pos + true_neg) / len(observed_class))
print("Recall = ", recall)
print("Precisão = ", precision)

Acurácia =  0.78275
Recall =  0.052830188679245285
Precisão =  0.26582278481012656
