In [2]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from typing import Dict

In [11]:
# Path of the credit-card dataset, resolved relative to the current working directory.
file_path = 'Credit_card.csv'
# Read the raw CSV into a DataFrame; this is the frame handed to preprocess_data below.
data = pd.read_csv(filepath_or_buffer=file_path)

def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return a model-ready copy of ``df``: numeric gaps filled, categoricals encoded.

    Missing values in numeric columns are replaced by that column's mean,
    computed over all rows of ``df``. Columns of ``object`` dtype are one-hot
    encoded with ``drop_first=True``, so the first level of each encoded
    column gets no indicator column.

    The input frame is not modified; all work happens on a copy so the caller
    keeps its raw data and re-running the call gives the same result.

    Args:
        df: Raw input frame.

    Returns:
        A new DataFrame with imputed numeric columns and indicator columns in
        place of the original ``object`` columns.
    """
    # Work on a copy: assigning into ``df`` itself would overwrite the
    # caller's raw values with the imputed ones.
    prepared = df.copy()

    numeric_cols = prepared.select_dtypes(include=['number']).columns
    prepared[numeric_cols] = prepared[numeric_cols].fillna(prepared[numeric_cols].mean())

    categorical_cols = prepared.select_dtypes(include=['object']).columns
    return pd.get_dummies(prepared, columns=categorical_cols, drop_first=True)

# Impute missing numeric values and one-hot encode the raw frame before modelling.
processed_data = preprocess_data(data)

# Univariate linear regression of 'label' on 'Annual_income'. The fit is skipped
# (with a message) when either column is absent from the processed frame.
if not all(col in processed_data.columns for col in ('Annual_income', 'label')):
    print("Required columns are missing in the processed data.")
else:
    X = processed_data[['Annual_income']]  # single-column feature matrix
    y = processed_data['label']  # target variable

    # Hold out 20% of the rows for evaluation; the fixed seed makes the split repeatable.
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, random_state=42, test_size=0.2
    )

    # Fit on the training rows, then predict the held-out rows.
    lin_reg = LinearRegression().fit(X_train, y_train)
    y_pred = lin_reg.predict(X_test)

    # Report squared error and R² on the held-out rows.
    mse = mean_squared_error(y_test, y_pred)
    r2 = r2_score(y_test, y_pred)
    print(f"Linear Regression MSE: {mse}")
    print(f"Linear Regression R²: {r2}")

def evaluate_models(
    X: pd.DataFrame,
    y: pd.Series,
    n_neighbors: int = 8,
    test_size: float = 0.2,
    random_state: int = 42,
    max_iter: int = 1000,
) -> Dict[str, float]:
    """Score KNN and Logistic Regression classifiers with and without normalization.

    The data is split once into train and test parts. Each classifier is then
    fitted and scored twice: on the raw features and on features standardized
    with a ``StandardScaler`` that is fitted on the training part only.

    Args:
        X: Feature matrix.
        y: Target labels.
        n_neighbors: Number of neighbours used by the KNN classifier.
        test_size: Fraction of rows held out for scoring.
        random_state: Seed passed to ``train_test_split``.
        max_iter: Iteration cap passed to ``LogisticRegression``.

    Returns:
        A dict mapping ``"<model> without Normalization"`` and
        ``"<model> with Normalization"`` to the value returned by the fitted
        estimator's ``score`` method on the held-out rows, for the models
        ``"KNN"`` and ``"Logistic Regression"`` (in that order).
    """
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )

    # Fit the scaler on the training rows only, then apply it to both parts,
    # so no statistics from the held-out rows reach the scaler.
    scaler = StandardScaler()
    X_train_normalized = scaler.fit_transform(X_train)
    X_test_normalized = scaler.transform(X_test)

    models = (
        ('KNN', KNeighborsClassifier(n_neighbors=n_neighbors)),
        ('Logistic Regression', LogisticRegression(max_iter=max_iter)),
    )
    feature_sets = (
        ('without Normalization', X_train, X_test),
        ('with Normalization', X_train_normalized, X_test_normalized),
    )

    performances: Dict[str, float] = {}
    for model_name, model in models:
        for variant, train_features, test_features in feature_sets:
            # Each fit() call refits the estimator from scratch, so reusing
            # one estimator object for both variants is safe.
            model.fit(train_features, y_train)
            performances[f'{model_name} {variant}'] = model.score(test_features, y_test)

    return performances


# Target and feature matrix for the classifiers: the features are every processed
# column except 'label' and 'Annual_income' (absent columns are skipped, not an error).
y = processed_data['label']
X = processed_data.drop(columns=['label', 'Annual_income'], errors='ignore')

# Score each classifier with and without normalization, then print one line per result.
performances = evaluate_models(X, y)
for model, performance in performances.items():
    print(f"{model}: {performance}")

Linear Regression MSE: 0.08779253009842213
Linear Regression R²: -0.004388350292662713
KNN without Normalization: 0.8935483870967742
KNN with Normalization: 0.9032258064516129
Logistic Regression without Normalization: 0.9032258064516129
Logistic Regression with Normalization: 0.896774193548387


In [None]:
# Q2: Univariate Linear Regression Analysis
# The Univariate Linear Regression analysis using 'Annual_income' as a predictor for the target variable 'label' resulted in an MSE
# of 0.08779253009842213 and an R² value of -0.004388350292662713. These metrics indicate that the model is not fitting the data well.
# A negative R² value suggests that the model performs worse than a simple horizontal line representing the mean of the target. 
# This outcome implies that either 'Annual_income' alone does not have a linear relationship with the target or other features are needed 
# to capture the complexity of the dataset. It points towards exploring more complex models or including more predictors for better performance.

In [None]:
# Q3: KNN Performance and Optimal K
# Normalization improved the KNN model's performance: accuracy rose from 0.8935483870967742 without normalization
# to 0.9032258064516129 with normalization. This underscores the significance of feature scaling
# for distance-based algorithms like KNN, where normalization ensures all features contribute equally to the distance computation,
# which improved accuracy here.
# Note that K was fixed at 8 (n_neighbors=8) in this run; other values of K were not compared here, so 8 has not been shown to be optimal.

In [None]:
# Q4: Logistic Regression Performance
# For Logistic Regression, the model showed high accuracy both with (0.896774193548387) and without normalization (0.9032258064516129),
# with a slight decrease in performance when normalization was applied. 
# This slight decrease might be due to the model's inherent ability to handle different feature scales, especially when 
# regularization is applied. However, the high accuracy in both cases suggests that Logistic Regression is well-suited 
# for this binary classification task, capable of capturing the relationship between features and the target effectively.

In [12]:
# Q5: Impact of Normalization
# Normalization had a positive impact on the KNN model's performance but a slightly negative impact on Logistic Regression. 
# This outcome highlights that the benefits of normalization can vary between models based on their underlying assumptions
# and sensitivity to feature scales. While normalization is generally recommended, especially for algorithms like KNN 
# that are sensitive to the scale of input features, its impact should be evaluated on a case-by-case basis.
# For Logistic Regression in this dataset, feature scaling was less critical, possibly due to the model's 
# characteristics or the data distribution.
# Normalization's varying impact on different models emphasizes the importance of understanding model assumptions
# and the characteristics of the dataset when preprocessing data. While normalization is beneficial for models sensitive to feature scale,
# it does not necessarily improve performance for every model or dataset.
