# Logistic Regression

### Task Overview

Build a logistic regression model to classify transactions as high-value based on selected features.

In [4]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, roc_auc_score

# Load the dataset
df = pd.read_csv("https://raw.githubusercontent.com/Data-Navigators/Statistical_Concept_Excercise/main/data/Retail_sales_dataset.csv")

In [5]:
df['Date'] = pd.to_datetime(df['Date'])

# Check for missing values
print("\nMissing values:")
print(df.isnull().sum())

# Rename 'Product Category' to 'Product_Category'
df = df.rename(columns={'Product Category': 'Product_Category'})

# Re-apply one-hot encoding, ensuring all categories are represented
df = pd.get_dummies(df, columns=['Gender', 'Product_Category'], drop_first=False, dtype=int)


Missing values:
Transaction ID      0
Date                0
Customer ID         0
Gender              0
Age                 0
Product Category    0
Quantity            0
Price per Unit      0
Total Amount        0
dtype: int64


In [12]:
# Define target variable: HighValue (1 if Total Amount > 80th percentile, else 0)
threshold = df['Total Amount'].quantile(0.80)
df['HighValue'] = (df['Total Amount'] > threshold).astype(int)

# Check available columns
print("Available columns:", df.columns)

# One-hot encode categorical variables (only if they exist)
if 'Gender' in df.columns and 'Product_Category' in df.columns:
    df = pd.get_dummies(df, columns=['Gender', 'Product_Category'], drop_first=False, dtype=int)
else:
    print("Warning: 'Gender' or 'Product_Category' columns not found. Skipping one-hot encoding.")

# Define features and target variable
available_features = ['Age', 'Quantity', 'Price per Unit']
for col in df.columns:
    if col.startswith(('Gender_', 'Product_Category_')):
        available_features.append(col)

X = df[available_features]
y = df['HighValue']

# Split the dataset
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Create and fit the logistic regression model
model = LogisticRegression()
model.fit(X_train, y_train)

# Make predictions
y_pred = model.predict(X_test)
y_pred_proba = model.predict_proba(X_test)[:, 1]

# Evaluate the model
print(classification_report(y_test, y_pred))
print(f"ROC-AUC Score: {roc_auc_score(y_test, y_pred_proba):.4f}")

# Print coefficients
print("\nCoefficients:")
for name, coef in zip(X.columns, model.coef_[0]):
    print(f"{name}: {coef:.4f}")


Available columns: Index(['Transaction ID', 'Date', 'Customer ID', 'Age', 'Quantity',
       'Price per Unit', 'Total Amount', 'Gender_Female', 'Gender_Male',
       'Product_Category_Beauty', 'Product_Category_Clothing',
       'Product_Category_Electronics', 'HighValue'],
      dtype='object')
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       170
           1       1.00      1.00      1.00        30

    accuracy                           1.00       200
   macro avg       1.00      1.00      1.00       200
weighted avg       1.00      1.00      1.00       200

ROC-AUC Score: 1.0000

Coefficients:
Age: -0.0355
Quantity: 4.1173
Price per Unit: 0.0226
Gender_Female: 0.2008
Gender_Male: 0.1171
Product_Category_Beauty: 0.6149
Product_Category_Clothing: -0.4047
Product_Category_Electronics: 0.1077


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
