In [119]:
# 1. Import Libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from scipy import sparse

In [120]:
# Load data: read the training and test complaint CSVs into DataFrames
train_csv_path = r'E:/IITK/P1 Data/Consumer_Complaints_train.csv'
test_csv_path = r'E:/IITK/P1 Data/Consumer_Complaints_test_share.csv'
train = pd.read_csv(train_csv_path)
test = pd.read_csv(test_csv_path)

In [121]:
# Report (rows, columns) for both raw frames
for shape_label, raw_frame in (("Train Shape:", train), ("Test Shape:", test)):
    print(shape_label, raw_frame.shape)

Train Shape: (478421, 18)
Test Shape: (119606, 17)


In [122]:
# Remove, in place, the columns the guidelines say must not be used
excluded_columns = ["ZIP code", "Complaint ID"]
for frame in (train, test):
    frame.drop(columns=excluded_columns, inplace=True)


In [123]:
# Tag every row with the frame it came from so the two can be separated again later
for frame, origin in ((train, "train"), (test, "test")):
    frame["source"] = origin

# Stack train on top of test so all preprocessing is applied to both at once
full_data = pd.concat([train, test], axis=0)

In [124]:
# Parse both date columns into datetime values
for date_col in ("Date received", "Date sent to company"):
    full_data[date_col] = pd.to_datetime(full_data[date_col])

In [125]:
# Derive calendar features and the received -> sent delay (in whole days)
received_on = full_data["Date received"]
sent_on = full_data["Date sent to company"]
full_data["Received Month"] = received_on.dt.month
full_data["Received Weekday"] = received_on.dt.weekday
full_data["Gap Days"] = (sent_on - received_on).dt.days

In [126]:
# The raw date columns are no longer needed once the derived features exist
raw_date_columns = ["Date received", "Date sent to company"]
full_data.drop(columns=raw_date_columns, inplace=True)

In [127]:
# Text: summarise each narrative by its word count (instead of TF-IDF)
def _count_words(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    return len(text.split())


# Missing narratives become "" and therefore count as zero words
narrative_text = full_data["Consumer complaint narrative"].fillna("")
full_data["Narrative Length"] = narrative_text.apply(_count_words)
full_data.drop(columns="Consumer complaint narrative", inplace=True)

In [128]:
# Fill missing values in every column EXCEPT the label.
# The label column is matched with the same "Consumer disputed" substring
# that is used later to locate the encoded target. Filling it with "missing"
# would create a third label category, and one-hot encoding would then emit
# an extra label-derived dummy column that ends up inside the feature matrix.
# Rows without a label simply keep NaN there and encode to all-zero dummies.
fill_values = {
    col: "missing"
    for col in full_data.columns
    if "Consumer disputed" not in col
}
# Non in-place on purpose: assigning the result back avoids pandas'
# column-wise in-place write path and keeps this cell's effect explicit.
full_data = full_data.fillna(value=fill_values)

In [129]:
# Encode categoricals
# Take the split marker out first so it is not one-hot encoded itself
source = full_data.pop("source")

# One-hot encode the remaining non-numeric columns into sparse dummy columns,
# dropping the first level of each to avoid a redundant column
full_data = pd.get_dummies(full_data, sparse=True, drop_first=True)

# Re-attach the marker positionally (row order is unchanged by get_dummies)
full_data["source"] = source.values



In [130]:
# Train/Test split: recover the two original frames using the "source" marker
is_train_row = full_data["source"] == "train"
is_test_row = full_data["source"] == "test"
train = full_data.loc[is_train_row].drop(columns="source")
test = full_data.loc[is_test_row].drop(columns="source")

In [131]:
# Give test any column that exists only in train, filled with 0
for absent_col in train.columns.difference(test.columns):
    test[absent_col] = 0

# Then put test's columns in exactly the same order as train's
test = test.loc[:, train.columns]

In [132]:
# Define target and feature matrices.
# One-hot encoding can produce more than one column from the label (for
# example an extra dummy when unlabeled rows were filled with a placeholder
# category). Every label-derived column must be removed from the features,
# not only the one used as the target, otherwise label information stays in X.
target_dummy_cols = [col for col in train.columns if "Consumer disputed" in col]

# The first label-derived dummy is the target, as before.
# NOTE(review): presumably this is the "..._Yes" indicator — confirm against
# the actual column names.
target_col = target_dummy_cols[0]

X = train.drop(columns=target_dummy_cols)
y = train[target_col]

# test carries the same label-derived columns as train after the frames were
# split from one encoded table, so dropping them here keeps X and X_test aligned.
X_test = test.drop(columns=target_dummy_cols)



In [133]:
# Validation split: hold out 20% of the training rows, stratified on the label,
# with a fixed seed so the split is repeatable
holdout_split = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
X_train, X_val, y_train, y_val = holdout_split

In [134]:
# Sparse conversion
def _to_csr(frame):
    """Convert a DataFrame to a float32 SciPy CSR matrix without densifying it.

    Going through ``frame.astype(np.float32).values`` first materialises a
    full dense (rows x columns) array, which defeats the purpose of the sparse
    one-hot encoding. Instead, every column is cast to a sparse float32 dtype
    with fill value 0 (dense columns are converted, already-sparse columns
    keep their sparse layout) and the frame is exported through pandas'
    sparse accessor.

    Parameters
    ----------
    frame : pandas.DataFrame
        Numeric/boolean columns only, dense or sparse.

    Returns
    -------
    scipy.sparse.csr_matrix
        float32 matrix with the same shape and values as ``frame``.
    """
    as_sparse = frame.astype(pd.SparseDtype(np.float32, 0))
    return sparse.csr_matrix(as_sparse.sparse.to_coo())


X_train_sparse = _to_csr(X_train)
X_val_sparse = _to_csr(X_val)
X_test_sparse = _to_csr(X_test)
X_full_sparse = _to_csr(X)


In [135]:
# Model training (validation)
# 'saga' is a stochastic solver: without a fixed random_state every run
# shuffles the data differently and the reported AUC is not reproducible.
# The seed matches the one used for the validation split.
model = LogisticRegression(
    max_iter=1000,
    solver='saga',
    class_weight='balanced',  # reweight classes inversely to their frequency
    random_state=42,
)
model.fit(X_train_sparse, y_train)

# Score the hold-out set using the probability of the positive class
val_preds = model.predict_proba(X_val_sparse)[:, 1]
auc_val = roc_auc_score(y_val, val_preds)
print("Validation AUC:", auc_val)

Validation AUC: 0.627208548755855




In [153]:
# Train on full data
# Same configuration as the validation model but with a larger iteration
# budget. random_state is fixed because 'saga' is stochastic; without it the
# final predictions change from run to run.
final_model = LogisticRegression(
    max_iter=3000,
    solver='saga',
    class_weight='balanced',
    random_state=42,
)
final_model.fit(X_full_sparse, y)



In [155]:
# Predict on test: positive-class probability, then a hard label at 0.5
test_probabilities = final_model.predict_proba(X_test_sparse)
test_preds = test_probabilities[:, 1]
above_threshold = test_preds > 0.5
test_preds_class = above_threshold.astype(int)


In [161]:
# Create final submission
sample = pd.read_csv(r'Downloads/sample_submission.csv')  # for correct format

# Take the IDs from the test file itself. The predictions are in test-file row
# order, so pairing them positionally with IDs from a different file is only
# correct if that file happens to list the complaints in the same order.
test_ids = pd.read_csv(
    r'E:/IITK/P1 Data/Consumer_Complaints_test_share.csv',
    usecols=["Complaint ID"],
)["Complaint ID"]

if len(test_ids) != len(test_preds_class):
    raise ValueError(
        f"Got {len(test_preds_class)} predictions for {len(test_ids)} test rows"
    )

# Surface (rather than silently absorb) any disagreement with the sample file
if not sample["Complaint ID"].equals(test_ids):
    print("Note: sample_submission IDs differ from test-file order; using test-file IDs.")

submission = pd.DataFrame({
    "Complaint ID": test_ids.to_numpy(),
    "Consumer disputed?": test_preds_class
})


In [163]:
# Translate the numeric class into the textual label: 0 -> No, 1 -> Yes
class_to_label = {1: "Yes", 0: "No"}
predicted_class = submission["Consumer disputed?"]
submission["Consumer disputed?"] = predicted_class.map(class_to_label)

In [165]:
# Write the submission to disk without the DataFrame index
submission_path = r'E:/IITK/P1 Data/submission.csv'
submission.to_csv(submission_path, index=False)
print("Final submission.csv created with Yes/No format.")


Final submission.csv created with Yes/No format.


In [None]:
# Distribution of predicted labels. The column is named "Consumer disputed?"
# (with the trailing question mark); without it the lookup raises KeyError.
print(submission["Consumer disputed?"].value_counts())
