In [10]:
import pandas as pd
df = pd.read_csv('breast_cancer_data.csv')
df['diagnosis_dummy'] = (df['diagnosis'] == 'M').astype(int)

In [14]:
import pandas as pd
import statsmodels.api as sm

# Assuming 'data' is your DataFrame, 'y_col' is the name of your target variable,
# and 'x_cols' are your predictor variables.
# data = pd.read_csv('path_to_your_data.csv')  # Load your data
y_col = 'diagnosis'
x_cols = ['radius_mean']

# Add an intercept term to your model
df['intercept'] = 1
x_cols = ['intercept'] + x_cols

X = df[x_cols]
y = df[y_col]

# Create a GLM model
model = sm.GLM(y, X, family=sm.families.Binomial())

# Fit the model
result = model.fit()

# Print the fitted parameters
print(result.summary())


                 Generalized Linear Model Regression Results                  
Dep. Variable:              diagnosis   No. Observations:                  569
Model:                            GLM   Df Residuals:                      567
Model Family:                Binomial   Df Model:                            1
Link Function:                  Logit   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -165.01
Date:                Thu, 11 Apr 2024   Deviance:                       330.01
Time:                        17:46:57   Pearson chi2:                     489.
No. Iterations:                     7   Pseudo R-squ. (CS):             0.5232
Covariance Type:            nonrobust                                         
                  coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------
intercept     -15.2459      1.325    -11.509      

In [23]:
import numpy as np
import pandas as pd
from scipy.optimize import minimize

def logistic_regression_wls(data, y_col, x_cols):
    # Add intercept term to the dataset
    data['intercept'] = 1
    x_cols = ['intercept'] + x_cols
    
    X = data[x_cols].values
    y = data[y_col].values
    
    # Initial guess for the parameters
    beta_init = np.zeros(X.shape[1])
    
    # Logit link function
    def logit(p):
        return np.log(p / (1 - p))
    
    # Inverse of the logit function
    def logit_inv(x):
        return 1 / (1 + np.exp(-x))
    
    # The likelihood function for the binomial distribution
    def binomial_log_likelihood(beta):
        linear_prediction = np.dot(X, beta)
        p = logit_inv(linear_prediction)
        return -np.sum(y * np.log(p) + (1 - y) * np.log(1 - p))
    
    # Minimize the negative log likelihood
    result = minimize(binomial_log_likelihood, beta_init, method='BFGS')
    
    if result.success:
        fitted_params = result.x
        print("Optimization successful.")
    else:
        raise ValueError("Optimization failed.")
    
    return fitted_params

y_col = 'diagnosis'
x_cols = ['radius_mean']

params = logistic_regression_wls(df, y_col, x_cols)
print(params)


Optimization successful.
[-15.24586346   1.0335883 ]


In [32]:
train_test_split(df, test_size=0.2, random_state=42)[0]['diagnosis'].value_counts()

diagnosis
0    455
Name: count, dtype: int64

In [None]:
df['diagnosis'].value_counts()

diagnosis
0    569
Name: count, dtype: int64

In [27]:
import numpy as np
import pandas as pd
from scipy.optimize import minimize
from sklearn.model_selection import train_test_split

# Assuming 'df' is your DataFrame and you've already encoded the 'diagnosis' column as suggested
df['diagnosis'] = (df['diagnosis'] == 'M').astype(int)

# Splitting the data into train and test sets
df_train, df_test = train_test_split(df, test_size=0.2, random_state=42)

def logistic_regression_wls(data, y_col, x_cols):
    data['intercept'] = 1
    x_cols = ['intercept'] + x_cols
    
    X = data[x_cols].values
    y = data[y_col].values
    
    beta_init = np.zeros(X.shape[1])
    
    def logit_inv(x):
        return 1 / (1 + np.exp(-x))
    
    def binomial_log_likelihood(beta):
        linear_prediction = np.dot(X, beta)
        p = logit_inv(linear_prediction)
        return -np.sum(y * np.log(p) + (1 - y) * np.log(1 - p))
    
    result = minimize(binomial_log_likelihood, beta_init, method='BFGS')
    
    if result.success:
        fitted_params = result.x
        print("Optimization successful.")
    else:
        raise ValueError("Optimization failed.")
    
    return fitted_params

# Train the model
y_col = 'diagnosis'
x_cols = ['radius_mean']
params = logistic_regression_wls(df_train, y_col, x_cols)

# Predict on test set
def predict(data, params):
    data['intercept'] = 1
    x_cols = ['intercept', 'radius_mean']
    X = data[x_cols].values
    linear_prediction = np.dot(X, params)
    pred_prob = 1 / (1 + np.exp(-linear_prediction))
    return pred_prob

df_test['pred_prob'] = predict(df_test, params)

# Calculate accuracy
df_test['predicted_diagnosis'] = (df_test['pred_prob'] > 0.5).astype(int)
accuracy = (df_test['predicted_diagnosis'] == df_test['diagnosis']).mean()
print(f"Accuracy: {accuracy}")


Optimization successful.
Accuracy: 1.0


In [13]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

# Assuming 'df' is your DataFrame and you've already encoded the 'diagnosis' column as suggested
df['diagnosis_dummy'] = (df['diagnosis'] == 'M').astype(int)

# Splitting the data into train and test sets
df_train, df_test = train_test_split(df, test_size=0.2, random_state=0)

def logistic_regression_iwls(data, y_col, x_cols, max_iter=10):
    data = data.copy()  # Make a copy to avoid modifying the original DataFrame
    data['intercept'] = 1
    x_cols = ['intercept'] + x_cols
    
    X = data[x_cols].values
    y = data[y_col].values
    
    # Define the logit link function and its derivative
    def logit(p):
        return np.log(p / (1 - p))
    
    def logit_inv(eta):
        return 1 / (1 + np.exp(-eta))

    def logit_derivative(mu):
        return 1/(mu * (1 - mu))
    
    # Initial estimates for mu and eta using y
    mu = y.copy()
    eta = logit(mu)
    
    # Initialize beta estimates
    beta = np.zeros(X.shape[1])
    print("mu", mu)
    print("LD", logit_derivative(mu))
    print("LD2", logit_derivative(mu)**2)
    for _ in range(max_iter):
        # Update z and weights w
        z = eta + (y - mu) * logit_derivative(mu)
        w = logit_derivative(mu) ** 2 / (mu * (1 - mu))
        
        # Diagonal weight matrix
        W = np.diag(w)
        
        # Update beta using weighted least squares
        XTWX_inv = np.linalg.inv(X.T @ W @ X)
        XTWZ = X.T @ W @ z
        beta = XTWX_inv @ XTWZ
        
        # Update eta and mu
        eta = X @ beta
        mu = logit_inv(eta)
    print(beta)
    return beta

# Train the model using IWLS
x_cols = ['radius_mean']
params = logistic_regression_iwls(df_train, 'diagnosis_dummy', x_cols)

# Define the prediction function
def predict(data, params):
    data = data.copy()  # Make a copy to avoid modifying the original DataFrame
    data['intercept'] = 1
    X = data[['intercept', 'radius_mean']].values
    linear_prediction = np.dot(X, params)
    pred_prob = 1 / (1 + np.exp(-linear_prediction))
    return pred_prob

# Predict on test set
df_test['pred_prob'] = predict(df_test, params)

# Calculate accuracy
df_test['predicted_diagnosis'] = (df_test['pred_prob'] > 0.5).astype(int)
accuracy = (df_test['predicted_diagnosis'] == df_test['diagnosis_dummy']).mean()
print(f"Accuracy: {accuracy}")


mu [0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 0 0 1 0 0 1 0 1 0 0 0
 0 0 0 0 1 0 1 0 1 1 0 0 1 0 1 1 1 0 0 0 0 0 0 1 0 0 0 0 1 1 0 0 1 0 0 0 0
 1 0 0 1 1 0 0 1 1 0 0 1 0 0 1 1 1 0 0 0 1 0 0 0 0 0 1 0 1 0 1 0 1 0 1 0 0
 0 0 1 0 1 0 0 0 1 0 0 0 1 0 0 1 1 0 1 0 1 0 1 1 1 1 0 1 0 1 0 1 0 1 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 1 0 1 0 0 1 0 1 1 0 1 1 0
 0 1 0 0 0 1 0 0 0 0 0 1 1 0 0 0 0 1 1 0 0 0 0 1 0 0 1 0 0 1 0 1 0 0 0 0 1
 1 1 1 0 1 0 1 1 0 0 0 0 0 1 0 0 1 0 0 1 1 0 0 0 1 1 0 0 1 0 0 0 1 0 1 0 1
 1 1 1 0 0 0 0 1 1 0 0 0 0 0 1 0 0 1 0 0 1 1 1 1 0 0 1 0 0 0 1 1 0 0 0 0 0
 1 1 1 0 1 1 0 0 0 0 0 0 1 0 0 0 0 0 0 1 1 0 1 0 1 1 0 0 0 0 0 1 1 1 0 0 1
 1 0 0 1 0 1 1 0 1 0 0 0 0 0 0 0 1 1 0 0 0 1 0 0 0 0 1 0 1 1 1 0 1 0 1 0 1
 1 0 0 0 0 0 0 0 0 1 1 1 1 1 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 1 1 1 0 1 0 0 1
 1 1 1 0 1 1 1 0 1 0 1 0 0 1 1 0 1 0 0 0 0 1 0 0 1 0 0 0 1 1 0 0 0 1 0 0 1
 0 0 0 0 0 1 1 1 0 0 0]
LD [inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf inf i

  return np.log(p / (1 - p))
  return np.log(p / (1 - p))
  return 1/(mu * (1 - mu))
  z = eta + (y - mu) * logit_derivative(mu)


In [9]:
df_test['diagnosis']

512    1
457    0
439    0
298    0
37     0
      ..
213    1
519    0
432    1
516    1
500    0
Name: diagnosis, Length: 114, dtype: int32