In [10]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from typing import Dict
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline

In [13]:
# Relative path to the raw dataset; read once into a DataFrame.
file_path = 'Credit_card.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

def preprocess_data(df: pd.DataFrame) -> pd.DataFrame:
    """Return an imputed, one-hot encoded copy of ``df``.

    Numeric columns have missing values replaced by that column's mean
    (a column that is entirely missing stays missing, because its mean is
    NaN). Columns of ``object`` dtype are expanded into indicator columns
    with ``pd.get_dummies``; ``drop_first=True`` drops the first level of
    each encoded column.

    Args:
        df: Raw input table. It is not modified.

    Returns:
        A new DataFrame with imputed numeric columns and encoded
        categorical columns.
    """
    # Work on a copy: the column assignment below would otherwise write the
    # imputed values back into the caller's DataFrame.
    result = df.copy()

    numeric_cols = result.select_dtypes(include=['number']).columns
    result[numeric_cols] = result[numeric_cols].fillna(result[numeric_cols].mean())

    categorical_cols = result.select_dtypes(include=['object']).columns
    return pd.get_dummies(result, columns=categorical_cols, drop_first=True)

# Impute and encode the raw table before any modelling.
processed_data = preprocess_data(data)

# The regression experiment needs both the single feature and the target.
if not {'Annual_income', 'label'}.issubset(processed_data.columns):
    print("Required columns are missing in the processed data.")
else:
    X = processed_data[['Annual_income']]  # Feature matrix
    y = processed_data['label']  # Target variable

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    # Baseline: linear regression on the raw feature.
    lin_reg = LinearRegression().fit(X_train, y_train)
    y_pred = lin_reg.predict(X_test)

    # Same regressor placed behind a one-component PCA projection.
    pipe = [('pca', PCA(n_components=1)), ('m', LinearRegression())]
    model2 = Pipeline(steps=pipe).fit(X_train, y_train)
    y_pred2 = model2.predict(X_test)

    # Report the baseline first, then the PCA variant.
    mse, r2 = mean_squared_error(y_test, y_pred), r2_score(y_test, y_pred)
    print(f"Linear Regression MSE: {mse}")
    print(f"Linear Regression R²: {r2}")

    mse2, r2_x = mean_squared_error(y_test, y_pred2), r2_score(y_test, y_pred2)
    print(f"Linear Regression MSE with one dimension: {mse2}")
    print(f"Linear Regression R² with one dimension: {r2_x}")

def evaluate_models(X: pd.DataFrame, y: pd.Series) -> Dict[str, float]:
    """
    Compare KNN and Logistic Regression across preprocessing variants.

    The data is split 80/20 with ``random_state=42``. Each classifier is
    scored on the held-out split in four configurations: raw or standardized
    features, each with and without a one-component PCA step in front of the
    classifier. The scaler is fitted on the training split only.

    Args:
        X: Feature matrix.
        y: Target labels aligned with ``X``.

    Returns:
        Mapping from configuration name to the value returned by the fitted
        estimator's ``score`` on the test split, inserted in evaluation order
        (KNN first, then Logistic Regression).
    """
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    scaler = StandardScaler()
    X_train_normalized = scaler.fit_transform(X_train)
    X_test_normalized = scaler.transform(X_test)

    # Factories instead of instances: Pipeline does not clone its steps, so
    # reusing one estimator object in several pipelines would make a later
    # fit overwrite the state of an earlier, already-scored model.
    classifier_factories = {
        'KNN': lambda: KNeighborsClassifier(n_neighbors=8),
        'Logistic Regression': lambda: LogisticRegression(max_iter=1000),
    }
    feature_variants = (
        ('without Normalization', X_train, X_test),
        ('with Normalization', X_train_normalized, X_test_normalized),
    )

    performances: Dict[str, float] = {}
    for model_name, make_classifier in classifier_factories.items():
        for variant, train_features, test_features in feature_variants:
            label = f'{model_name} {variant}'

            # Classifier on the full feature set.
            plain = make_classifier()
            plain.fit(train_features, y_train)
            performances[label] = plain.score(test_features, y_test)

            # Same classifier after projecting onto one principal component.
            reduced = Pipeline(steps=[('pca', PCA(n_components=1)), ('m', make_classifier())])
            reduced.fit(train_features, y_train)
            performances[f'{label} for one dimension reduction'] = reduced.score(test_features, y_test)

    return performances


# Classification features: every processed column except 'label' and
# 'Annual_income' (either may be absent without raising).
X = processed_data.drop(columns=['label', 'Annual_income'], errors='ignore')
y = processed_data['label']

# Score every model/preprocessing combination and print one line per result.
performances = evaluate_models(X, y)
for model, performance in performances.items():
    print(f"{model}: {performance}")

Linear Regression MSE: 0.08779253009842213
Linear Regression R²: -0.004388350292662713
Linear Regression MSE with one dimension: 0.08779253009842213
Linear Regression R² with one dimension: -0.004388350292662713
KNN without Normalization: 0.8935483870967742
KNN without Normalization for one dimension reduction: 0.896774193548387
KNN with Normalization: 0.9032258064516129
KNN with Normalization for one dimension reduction: 0.9
Logistic Regression without Normalization: 0.9032258064516129
Logistic Regression without Normalization for one dimension reduction: 0.9032258064516129
Logistic Regression with Normalization: 0.896774193548387
Logistic Regression with Normalization for one dimension reduction: 0.9032258064516129


# We can observe from the above output that the metrics for linear regression remain unchanged, so its performance is not affected when the one-dimension reduction technique is applied (the model uses a single feature, so projecting onto one principal component does not change the information available to it). We observe that KNN without normalization improves from 0.8935 to 0.8968 when the one-dimension reduction technique is applied. KNN with normalization decreases from 0.9032 to 0.9. Logistic regression without normalization remains unchanged at 0.9032. Logistic regression with normalization improves from 0.8968 to 0.9032.