In [2]:
import wandb
import pandas as pd
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import MinMaxScaler 
from sklearn.preprocessing import PolynomialFeatures
from sklearn.feature_selection import SelectKBest, chi2
import os
import warnings
warnings.filterwarnings('ignore')
import imblearn

# Downloading the files from W&B

In [80]:
# Open a W&B run in the "risk_credit" project (job type "EDA"); the artifact
# downloads below are recorded against it until wandb.finish() is called.
run = wandb.init(project="risk_credit", job_type="EDA")

In [None]:
# SECURITY: the API key must never be written into the notebook. A key that
# was previously committed here should be treated as leaked and rotated in the
# W&B account settings.
# The key is read from the WANDB_API_KEY environment variable (the variable
# *name* is looked up, not the secret value). When it is unset, key=None lets
# wandb.login fall back to its own credential lookup / interactive prompt.
# NOTE(review): authentication should happen before the first wandb.init call
# on a fresh kernel — confirm the intended cell order.
WANDB_API_KEY = os.environ.get("WANDB_API_KEY")
wandb.login(key=WANDB_API_KEY, relogin=True)

In [83]:
# Declare the four preprocessed splits as inputs of the current run. All
# artifacts are resolved first and downloaded afterwards, in the same order.
artifact_X_train, artifact_X_test, artifact_y_train, artifact_y_test = [
    wandb.use_artifact(artifact_ref, type='Preprocessing')
    for artifact_ref in (
        'risk_credit/X_train.csv:latest',
        'risk_credit/X_test.csv:latest',
        'risk_credit/y_train.csv:latest',
        'risk_credit/y_test.csv:latest',
    )
]

# download() returns the local directory that holds the artifact's files.
X_train_dir, X_test_dir, y_train_dir, y_test_dir = [
    artifact.download()
    for artifact in (
        artifact_X_train,
        artifact_X_test,
        artifact_y_train,
        artifact_y_test,
    )
]

# Close the download run; the upload at the end of the notebook opens its own.
wandb.finish()

[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m:   1 of 1 files downloaded.  
[34m[1mwandb[0m:   1 of 1 files downloaded.  


In [84]:
# Resolve the CSV file inside each downloaded artifact directory.
X_train_path, X_test_path, y_train_path, y_test_path = (
    os.path.join(directory, filename)
    for directory, filename in (
        (X_train_dir, "X_train.csv"),
        (X_test_dir, "X_test.csv"),
        (y_train_dir, "y_train.csv"),
        (y_test_dir, "y_test.csv"),
    )
)

# Load the four splits as DataFrames, in the same order as the paths above.
X_train, X_test, y_train, y_test = map(
    pd.read_csv,
    (X_train_path, X_test_path, y_train_path, y_test_path),
)

# I. Preprocessing

## 1. Resampling

In [85]:
# Remember the feature names now: the scaling step below replaces X_train and
# X_test with the scaler's output, and these labels are used to rebuild the
# DataFrames afterwards.
column_names = X_train.columns

In [86]:
# Oversample the training split only (the test split is left untouched).
# The fixed seed keeps the synthetic samples reproducible across runs.
smote = SMOTE(random_state=42)
X_train, y_train = smote.fit_resample(X=X_train, y=y_train)

## 2. Scaling

In [87]:
# Learn the per-feature min/max on the training split only, then apply that
# same fitted scaler to both splits so no test statistics leak into training.
scaler = MinMaxScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [88]:
# Re-wrap the scaled data as DataFrames labelled with the saved feature names.
X_train, X_test = (
    pd.DataFrame(scaled_values, columns=column_names)
    for scaled_values in (X_train, X_test)
)

## 3. Feature importance

In [89]:
# Score every feature against the target with the chi2 statistic; k="all"
# keeps all columns, so this step only ranks features and filters nothing.
chi2_selector = SelectKBest(score_func=chi2, k="all")
x_kbest = chi2_selector.fit_transform(X_train, y_train)

chi2_scores = chi2_selector.scores_

# One row per feature, highest score first.
chi2_results = pd.DataFrame(
    {"Feature": X_train.columns, "Chi2 Score": chi2_scores}
).sort_values(by="Chi2 Score", ascending=False)

chi2_results

Unnamed: 0,Feature,Chi2 Score
0,HighBP,23351.828784
16,DiffWalk,18332.267182
1,HighChol,14484.951613
6,HeartDiseaseorAttack,10888.203411
13,GenHlth,9644.696355
15,PhysHlth,9519.889518
10,HvyAlcoholConsump,3944.896984
5,Stroke,3244.742754
3,BMI,2751.382395
20,Income,2410.537963


In [90]:
# Keep the 15 features with the highest chi2 score.
# Only the fitted support mask is needed here, so fit() is used instead of
# fit_transform(): the transformed matrix was computed and then discarded.
selector = SelectKBest(score_func=chi2, k=15)
selector.fit(X_train, y_train)

# Names of the retained features, in their original column order.
selected_columns = X_train.columns[selector.get_support()].tolist()

## 4. Polynomial Features

In [91]:
# Expand the chi2-selected features with degree-2 terms (squares and pairwise
# products) and append the new terms to the train and test matrices.
# include_bias=False: with the default setting the expansion also emits a
# constant column named "1", which is not in selected_columns and was therefore
# appended to the data as an uninformative feature.
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
poly.fit(X_train[selected_columns])

poly_feature_names = poly.get_feature_names_out(selected_columns)

X_train_poly = pd.DataFrame(
    poly.transform(X_train[selected_columns]),
    columns=poly_feature_names,
    index=X_train.index,
)
X_test_poly = pd.DataFrame(
    poly.transform(X_test[selected_columns]),
    columns=poly_feature_names,
    index=X_test.index,
)

# The degree-1 terms duplicate columns that already exist, so keep only the
# genuinely new ones.
poly_new_columns = [col for col in poly_feature_names if col not in selected_columns]

X_train_poly = X_train_poly[poly_new_columns]
X_test_poly = X_test_poly[poly_new_columns]

# Drop polynomial columns left over from a previous execution of this cell
# before concatenating, so re-running it does not append duplicate columns.
# On a first run nothing matches and errors="ignore" makes the drop a no-op.
X_train = pd.concat(
    [X_train.drop(columns=poly_new_columns, errors="ignore"), X_train_poly],
    axis=1,
)
X_test = pd.concat(
    [X_test.drop(columns=poly_new_columns, errors="ignore"), X_test_poly],
    axis=1,
)

## Upload files to W&B

In [92]:
# Output paths: each engineered split is written next to the artifact file it
# was derived from.
X_train_new_file, X_test_new_file, y_train_new_file, y_test_new_file = (
    os.path.join(directory, filename)
    for directory, filename in (
        (X_train_dir, "X_train_new.csv"),
        (X_test_dir, "X_test_new.csv"),
        (y_train_dir, "y_train_new.csv"),
        (y_test_dir, "y_test_new.csv"),
    )
)

# index=False keeps the row index out of the CSV files.
X_train.to_csv(path_or_buf=X_train_new_file, index=False)
X_test.to_csv(path_or_buf=X_test_new_file, index=False)
y_train.to_csv(path_or_buf=y_train_new_file, index=False)
y_test.to_csv(path_or_buf=y_test_new_file, index=False)

In [93]:
def upload_artifact(artifact_name, file_path, artifact_type="Feature engineering", artifact_description="Feature engineered data"):
    """Log a single file to the active W&B run as its own artifact.

    Args:
        artifact_name: Name under which the artifact is registered.
        file_path: Local path of the file to attach to the artifact.
        artifact_type: Artifact type label (defaults to "Feature engineering").
        artifact_description: Free-text description stored with the artifact.

    A W&B run must already be active, because the artifact is logged through
    the module-level ``wandb.log_artifact``.
    """
    artifact = wandb.Artifact(
        name=artifact_name,
        type=artifact_type,
        description=artifact_description
    )
    artifact.add_file(file_path)
    wandb.log_artifact(artifact)


# Using the run as a context manager guarantees it is finished even when one
# of the uploads raises; previously a failure skipped wandb.finish() and left
# the run open in the kernel.
with wandb.init(project="risk_credit", job_type="upload-feature-engineered") as run:
    upload_artifact("X_train_new.csv", X_train_new_file)
    upload_artifact("X_test_new.csv", X_test_new_file)
    upload_artifact("y_train_new.csv", y_train_new_file)
    upload_artifact("y_test_new.csv", y_test_new_file)