In [None]:
# Assuming you have a dataset (X) and corresponding labels (y)
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, roc_curve, roc_auc_score
import matplotlib.pyplot as plt

# Step 1: Hold out 20% of the samples for evaluation (fixed seed so the split repeats)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Step 2: Fit a Logistic Regression model on the training portion only
# (fit() returns the estimator itself, so construction and fitting can be chained)
model = LogisticRegression().fit(X_train, y_train)

# Step 3: Score the held-out samples and keep the second probability column,
# i.e. the probability of the second entry of model.classes_
test_class_probs = model.predict_proba(X_test)
probs = test_class_probs[:, 1]

# Step 4: Try different thresholds and evaluate the impact on metrics
# `probs` holds one probability per test sample; raising the cut-off makes the
# classifier predict the positive label less often.
thresholds = [0.2, 0.4, 0.6, 0.8]

for threshold in thresholds:
    # Hard 0/1 predictions at this cut-off.
    # NOTE(review): assumes y_test is encoded as 0/1 — confirm against the data.
    predictions = (probs >= threshold).astype(int)

    accuracy = accuracy_score(y_test, predictions)
    # zero_division=0: at a high cut-off there may be no positive predictions
    # at all, which leaves precision undefined. Report 0 explicitly instead of
    # relying on the library's warn-and-return-0 fallback.
    precision = precision_score(y_test, predictions, zero_division=0)
    recall = recall_score(y_test, predictions, zero_division=0)

    print(f"Threshold: {threshold}")
    print(f"Accuracy: {accuracy:.4f}, Precision: {precision:.4f}, Recall: {recall:.4f}\n")

# Step 5: Fit a Logistic Regression Model on all features
# NOTE: this model is trained on every row of X, including the rows that ended
# up in X_test. `model` (fit on X_train) already uses every column of X, so the
# two models differ in their training rows, not in their features.
model_all_features = LogisticRegression()
model_all_features.fit(X, y)

# Step 6: Plot ROC Curve for both models
# Held-out curve: `model` scored on samples it never saw during training.
fpr, tpr, thresholds_roc = roc_curve(y_test, probs)
roc_auc = roc_auc_score(y_test, probs)

# In-sample curve: score the full-data model once and reuse the probabilities
# for both the curve and the AUC. Because these are the same rows the model was
# trained on, this AUC is an optimistic estimate and is labelled as such below.
probs_all = model_all_features.predict_proba(X)[:, 1]
fpr_all, tpr_all, thresholds_roc_all = roc_curve(y, probs_all)
roc_auc_all = roc_auc_score(y, probs_all)

plt.figure(figsize=(8, 8))
plt.plot(fpr, tpr, label=f'Logistic Regression, held-out test set (AUC = {roc_auc:.2f})')
plt.plot(fpr_all, tpr_all, label=f'Logistic Regression (All Features), in-sample (AUC = {roc_auc_all:.2f})')
# Diagonal = performance of a classifier that guesses at random
plt.plot([0, 1], [0, 1], linestyle='--', color='gray', label='Random')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('ROC Curve')
plt.legend()
plt.show()


In [None]:
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt

# Assuming you have a dataset (X)

# Step 1: Sweep the number of clusters and report inertia and silhouette for each
k_values = list(range(2, 7))

for k in k_values:
    # Fit k-means with a fixed seed so repeated runs give the same clustering
    kmeans = KMeans(n_clusters=k, random_state=42).fit(X)

    # Inertia: within-cluster sum of squared distances to the nearest centroid
    inertia = kmeans.inertia_

    # Mean silhouette coefficient over all samples for this clustering
    cluster_labels = kmeans.labels_
    silhouette_avg = silhouette_score(X, cluster_labels)

    print(f"Number of Clusters (k): {k}")
    print(f"Inertia: {inertia:.4f}, Silhouette Score: {silhouette_avg:.4f}\n")

# Step 2: Analyzing the impact of not scaling features
# Pin the cluster count explicitly rather than reusing whatever value the
# Step 1 loop variable happened to be left holding.
k_compare = k_values[-1]

# Baseline: cluster the raw (unscaled) features.
kmeans_unscaled = KMeans(n_clusters=k_compare, random_state=42)
kmeans_unscaled.fit(X)

inertia_unscaled = kmeans_unscaled.inertia_
silhouette_unscaled = silhouette_score(X, kmeans_unscaled.labels_)

# Counterpart: same k and seed, but every feature standardized to zero mean and
# unit variance so no single feature dominates the distance computation.
X_scaled = StandardScaler().fit_transform(X)
kmeans_scaled = KMeans(n_clusters=k_compare, random_state=42)
kmeans_scaled.fit(X_scaled)

inertia_scaled = kmeans_scaled.inertia_
silhouette_scaled = silhouette_score(X_scaled, kmeans_scaled.labels_)

# Inertia is measured in squared feature units, so the two inertia values are
# not directly comparable; the silhouette score is bounded in [-1, 1] and is
# the more meaningful number to compare across the two runs.
print("Without Scaling Features:")
print(f"Inertia: {inertia_unscaled:.4f}, Silhouette Score: {silhouette_unscaled:.4f}\n")

print("With Scaling Features:")
print(f"Inertia: {inertia_scaled:.4f}, Silhouette Score: {silhouette_scaled:.4f}\n")

# Step 3: Discussing the concept of the 'right' k
# No single k is "right" for every dataset; the choice depends on the data and
# on the problem. Common aids are the elbow method, silhouette analysis and
# cross-validation.

# Example: Elbow Method
# Record the inertia for k = 1..10 and look for the point where it stops
# dropping sharply.
candidate_ks = range(1, 11)
inertia_values = []
for k in candidate_ks:
    kmeans_elbow = KMeans(n_clusters=k, random_state=42).fit(X)
    inertia_values.append(kmeans_elbow.inertia_)

fig, ax = plt.subplots(figsize=(8, 6))
ax.plot(candidate_ks, inertia_values, marker='o')
ax.set_title('Elbow Method for Optimal k')
ax.set_xlabel('Number of Clusters (k)')
ax.set_ylabel('Inertia')
plt.show()
