# In [None]:
# Q1: Difference between linear regression and logistic regression

# Linear regression:
# - Used for predicting continuous outcomes.
# - Outputs a continuous value.
# - Fits a straight line to minimize the mean squared error.

# Logistic regression:
# - Used for binary or categorical classification problems.
# - Outputs probabilities (between 0 and 1) using a sigmoid function.
# - Example: Predicting if a customer will buy a product (Yes/No).

# Example where logistic regression is more appropriate:
# Predicting whether a patient has a disease (1) or not (0) based on medical test results.

# Q2: Cost function in logistic regression

# - Logistic regression uses a log-loss (cross-entropy) cost function.
# - It is optimized using gradient descent or similar methods.
# Cost function formula:
# J(θ) = -1/m * Σ [y * log(h(x)) + (1 - y) * log(1 - h(x))]

# Python code example for sigmoid function:
import numpy as np

def sigmoid(z):
    """Return the logistic sigmoid 1 / (1 + exp(-z)), evaluated stably.

    The naive formula overflows inside ``np.exp`` when ``z`` is a large
    negative number. This version only ever exponentiates a non-positive
    value, so no overflow can occur.

    Args:
        z: A scalar or array-like of real numbers.

    Returns:
        A NumPy scalar for scalar input, otherwise a NumPy array with the
        same shape as ``z``, holding values in the interval [0, 1].
    """
    z = np.asarray(z)
    if z.dtype.kind in "biu":
        # Promote bool/integer input to float so that negation cannot wrap
        # around (unsigned ints) or be rejected (bools).
        z = z.astype(np.float64)

    # exp(-|z|) lies in (0, 1], so it is always safe to compute.
    e = np.exp(-np.abs(z))

    # z >= 0: 1 / (1 + exp(-z));  z < 0: exp(z) / (1 + exp(z)).
    # Both expressions are the same function, written overflow-free.
    out = np.where(z >= 0, 1 / (1 + e), e / (1 + e))

    # Indexing with an empty tuple turns a 0-d array back into a scalar and
    # leaves arrays of higher rank untouched.
    return out[()]

# Example of cost function implementation:
def compute_cost(y, h, eps=1e-15):
    """Return the mean log-loss (binary cross-entropy) of predictions ``h``.

    Computes ``-1/m * sum(y * log(h) + (1 - y) * log(1 - h))`` where ``m`` is
    ``len(y)``.

    Args:
        y: Array-like of true labels (0 or 1). Must be non-empty.
        h: Array-like of predicted probabilities, same shape as ``y``.
        eps: Predictions are clipped to ``[eps, 1 - eps]`` before taking the
            logarithm so that a prediction of exactly 0 or 1 cannot produce
            ``log(0)`` and turn the cost into ``nan`` or ``inf``.

    Returns:
        The cost as a NumPy float.
    """
    # Work in float64: plain lists become arrays, and 1 - eps stays
    # distinguishable from 1.0 (it would round to 1.0 in float32).
    y = np.asarray(y, dtype=float)
    h = np.clip(np.asarray(h, dtype=float), eps, 1 - eps)

    m = len(y)
    return -1 / m * np.sum(y * np.log(h) + (1 - y) * np.log(1 - h))

# Q3: Regularization in logistic regression

# Regularization prevents overfitting by adding a penalty to the cost function.
# Two types:
# - L1 Regularization (Lasso): Adds the sum of the absolute values of the coefficients (λ * Σ |θ_j|) to the cost function.
# - L2 Regularization (Ridge): Adds the sum of the squared coefficients (λ * Σ θ_j^2) to the cost function.

# Example with L2 Regularization:
# J(θ) = -(1/m) * Σ [y * log(h(x)) + (1 - y) * log(1 - h(x))] + (λ / (2m)) * Σ θ_j^2

# Q4: ROC curve

# - The ROC (Receiver Operating Characteristic) curve plots True Positive Rate (TPR) vs. False Positive Rate (FPR).
# - It evaluates the model's performance at various threshold levels.
# - The Area Under the Curve (AUC) indicates the model's ability to distinguish between classes.

# Example: Plotting the ROC curve
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt

# `y_test` (true labels) and `y_prob` (predicted probabilities) are assumed
# to be defined before this point.
fpr, tpr, _ = roc_curve(y_test, y_prob)  # third value (thresholds) is unused
roc_auc = auc(fpr, tpr)

# Draw the curve with the AUC in the legend, then label the axes and render.
plt.plot(fpr, tpr, label=f"AUC = {roc_auc:.2f}")
plt.title("ROC Curve")
plt.ylabel("True Positive Rate")
plt.xlabel("False Positive Rate")
plt.legend(loc="lower right")
plt.show()

# Q5: Feature selection techniques for logistic regression

# 1. Recursive Feature Elimination (RFE)
# 2. Lasso Regularization (L1 penalty)
# 3. Mutual Information or Chi-square test
# 4. Removing features with low variance

# Example with RFE:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Base estimator whose coefficients RFE uses to rank the features.
model = LogisticRegression()

# Keep 5 features; `fit` returns the selector itself, so it is fitted in place.
selector = RFE(estimator=model, n_features_to_select=5)
selector.fit(X_train, y_train)

# Boolean mask over the input columns: True marks a selected feature.
selected_features = selector.support_

# Q6: Handling imbalanced datasets in logistic regression

# Strategies:
# 1. Resampling: Oversample minority class or undersample majority class.
# 2. Class weighting: Use `class_weight` parameter in scikit-learn's LogisticRegression.
# 3. Synthetic Data Generation: Use SMOTE (Synthetic Minority Oversampling Technique).

# Example with class weighting:
# 'balanced' asks scikit-learn to weight each class when fitting.
model = LogisticRegression(class_weight="balanced")
# `fit` returns the estimator itself, so rebinding keeps the same object.
model = model.fit(X_train, y_train)

# Q7: Common issues and challenges in logistic regression

# - Multicollinearity: When independent variables are highly correlated.
#   Solution: Use Variance Inflation Factor (VIF) to detect multicollinearity and drop one of the correlated features.
# - Outliers: Can affect performance. Use robust scaling or remove outliers.
# - Non-linearity: Logistic regression assumes linear relationships between features and log-odds.
#   Solution: Use feature engineering or switch to non-linear models like decision trees.

# Example of VIF calculation:
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant
import pandas as pd

X = pd.DataFrame(X_train)  # Assuming X_train is a numpy array

# variance_inflation_factor regresses each column on the remaining columns
# exactly as given; it does not add an intercept itself. Without a constant
# column the auxiliary regressions are forced through the origin and the
# resulting VIFs are inflated for features that are not mean-centered.
# Prepend an explicit constant (column 0) and skip it when reporting.
X_design = add_constant(X, has_constant="add").values

vif = pd.DataFrame()
vif["VIF"] = [
    variance_inflation_factor(X_design, i) for i in range(1, X_design.shape[1])
]
vif["Feature"] = X.columns
print(vif)
