<a href="https://colab.research.google.com/github/Aakriti555/Nammi-Assignment2/blob/main/NAAMI_Task_2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score
from sklearn.metrics import (
    accuracy_score, roc_auc_score, recall_score, f1_score, confusion_matrix
)

In [3]:
import gdown
gdown.download(url="https://drive.google.com/file/d/1Zsg7ZiTWcpvm9IZl72z0DnOiNFu4QgGo/view", output="file.zip", fuzzy=True)

Downloading...
From: https://drive.google.com/uc?id=1Zsg7ZiTWcpvm9IZl72z0DnOiNFu4QgGo
To: /content/file.zip
100%|██████████| 8.66M/8.66M [00:00<00:00, 42.8MB/s]


'file.zip'

In [4]:
!unzip "/content/file.zip" -d "/content"

Archive:  /content/file.zip
   creating: /content/TASK_2/
  inflating: /content/TASK_2/blinded_test_set.csv  
  inflating: /content/__MACOSX/TASK_2/._blinded_test_set.csv  
  inflating: /content/TASK_2/train_set.csv  
  inflating: /content/__MACOSX/TASK_2/._train_set.csv  
  inflating: /content/TASK_2/test_set.csv  
  inflating: /content/__MACOSX/TASK_2/._test_set.csv  


In [5]:
# Load data
train_df = pd.read_csv("/content/TASK_2/train_set.csv")
test_df = pd.read_csv("/content/TASK_2/test_set.csv")
blinded_df = pd.read_csv("/content/TASK_2/blinded_test_set.csv")

In [6]:
train_df.drop(columns=["ID"], inplace=True)
test_df.drop(columns=["ID"], inplace=True)
blinded_df.drop(columns=["ID"], inplace=True)

In [7]:
train_df.replace([np.inf, -np.inf], np.nan, inplace=True)
train_df.dropna(inplace=True)
test_df.replace([np.inf, -np.inf], np.nan, inplace=True)
test_df.dropna(inplace=True)
blinded_df.replace([np.inf, -np.inf], np.nan, inplace=True)
blinded_df.dropna(inplace=True)

In [8]:
# Drop Blinded IDs as well for prediction
blinded_X = blinded_df.drop(columns=["ID"], errors='ignore')

In [9]:
X_train = train_df.drop("CLASS", axis=1)
y_train = train_df["CLASS"]

X_test = test_df.drop("CLASS", axis=1)
y_test = test_df["CLASS"]

In [10]:
# Ensure same columns in test and train
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

In [11]:
model = LogisticRegression(max_iter=10000)
model.fit(X_train, y_train)

STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [12]:
y_pred = model.predict(X_test)

print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.5490196078431373
Classification Report:
               precision    recall  f1-score   support

           0       0.56      0.58      0.57        26
           1       0.54      0.52      0.53        25

    accuracy                           0.55        51
   macro avg       0.55      0.55      0.55        51
weighted avg       0.55      0.55      0.55        51



In [13]:
# Make predictions
y_pred = model.predict(X_test)
y_proba = model.predict_proba(X_test)[:, 1]  # For AUROC (use prob of positive class)
accuracy = accuracy_score(y_test, y_pred)
auroc = roc_auc_score(y_test, y_proba)
sensitivity = recall_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)
tn, fp, fn, tp = cm.ravel()
specificity = tn / (tn + fp)
f1 = f1_score(y_test, y_pred)

In [14]:
# Print all metrics
print(f"Accuracy:    {accuracy:.4f}")
print(f"AUROC:       {auroc:.4f}")
print(f"Sensitivity: {sensitivity:.4f}")
print(f"Specificity: {specificity:.4f}")
print(f"F1 Score:    {f1:.4f}")

Accuracy:    0.5490
AUROC:       0.5862
Sensitivity: 0.5200
Specificity: 0.5769
F1 Score:    0.5306


In [15]:
print(y_train.value_counts(normalize=True))
print(y_test.value_counts(normalize=True))


CLASS
0    0.535354
1    0.464646
Name: proportion, dtype: float64
CLASS
0    0.509804
1    0.490196
Name: proportion, dtype: float64


In [16]:
# --- TRAIN SET PROBABILITIES ---
train_proba = model.predict_proba(X_train)
train_pred_df = pd.DataFrame({
    'ID': np.arange(1, len(X_train) + 1),
    'class_0': train_proba[:, 0],
    'class_1': train_proba[:, 1]
})
train_pred_df.to_csv("train_predictions_logreg.csv", index=False)
print("Train predictions saved as 'train_predictions_logreg.csv'")

# --- TEST SET PROBABILITIES ---
test_proba = model.predict_proba(X_test)
test_pred_df = pd.DataFrame({
    'ID': np.arange(1, len(X_test) + 1),
    'class_0': test_proba[:, 0],
    'class_1': test_proba[:, 1]
})
test_pred_df.to_csv("test_predictions_logreg.csv", index=False)
print("Test predictions saved as 'test_predictions_logreg.csv'")


Train predictions saved as 'train_predictions_logreg.csv'
Test predictions saved as 'test_predictions_logreg.csv'


In [17]:
# Align columns with training features
blinded_X = blinded_X.reindex(columns=X_train.columns, fill_value=0)

In [18]:
# Predict probabilities
blinded_proba = model.predict_proba(blinded_X)

In [19]:
# Prepare output DataFrame
blind_ids = blinded_df['ID'] if 'ID' in blinded_df.columns else pd.Series(range(1, len(blinded_df)+1), name='ID')
blinded_pred_df = pd.DataFrame({
    'ID': blind_ids,
    'class_0': blinded_proba[:, 0],
    'class_1': blinded_proba[:, 1]
})

In [20]:
# Export to CSV
blinded_pred_df.to_csv("blinded_predictions_logreg.csv", index=False)
print("Blinded predictions saved as 'blinded_predictions_logreg.csv'")

Blinded predictions saved as 'blinded_predictions_logreg.csv'
