In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd

def load_data(path="Train.csv", target_col="Target", test_size=0.2, random_state=42):
    """
    Read a labelled CSV and split it into stratified train/test frames.

    path: CSV file to read; defaults to "Train.csv", the value that used to be
        hard-coded here
    target_col: name of the label column used to stratify the split
    test_size: value forwarded to train_test_split's ``test_size``
    random_state: seed forwarded to train_test_split so the split is repeatable

    Returns a ``(train, test)`` tuple of DataFrames; both still contain
    ``target_col``.
    Raises KeyError if ``target_col`` is not a column of the file.
    """
    df = pd.read_csv(path)

    # Fail with a message that names the file instead of a bare KeyError.
    if target_col not in df.columns:
        raise KeyError(f"column {target_col!r} not found in {path!r}")

    train, test = train_test_split(
        df,
        test_size=test_size,
        random_state=random_state,
        stratify=df[target_col],
    )
    return train, test

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.impute import SimpleImputer
from imblearn.pipeline import Pipeline as ImbPipeline  # Use imbalanced-learn's Pipeline
from imblearn.over_sampling import SMOTE

# Preprocess Data
def preprocess_data(train, test, target_col="Target"):
    """
    Build an unfitted ColumnTransformer for the feature columns of ``train``.

    train: DataFrame whose dtypes decide which columns are numeric/categorical
    test: accepted for interface compatibility; not read by this function
    target_col: label column to leave out of the feature lists when it is
        present in ``train``, so the label is never imputed/scaled/encoded as
        a feature. Pass None to keep every column.

    Returns a ColumnTransformer with two branches:
      * "num": mean imputation followed by StandardScaler on numeric columns
      * "cat": most-frequent imputation followed by OneHotEncoder
        (handle_unknown='ignore') on object-dtype columns
    """
    # "number" is pandas' alias for all numeric dtypes. Using it means this
    # function does not depend on `np`, which no cell shown here imports.
    numeric_features = [
        col
        for col in train.select_dtypes(include="number").columns
        if col != target_col
    ]
    categorical_features = [
        col
        for col in train.select_dtypes(include=["object"]).columns
        if col != target_col
    ]

    numeric_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='mean')),
        ('scaler', StandardScaler())
    ])

    categorical_transformer = Pipeline(steps=[
        ('imputer', SimpleImputer(strategy='most_frequent')),
        ('encoder', OneHotEncoder(handle_unknown='ignore'))
    ])

    preprocessor = ColumnTransformer(transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

    return preprocessor


In [None]:
# Read Test.csv into `test`.
# NOTE(review): the next cell performs the same read again, so this cell looks redundant.
test = pd.read_csv("Test.csv")

In [None]:
# ---- Load both splits (original note: all columns are numerical) ----
train = pd.read_csv("Train.csv")
test = pd.read_csv("Test.csv")

# ---- Robust-scaled copy of the training frame ----
# The scaler is fitted on the training data only, on every column but the target.
# NOTE(review): train_scaled is not referenced again in this cell; the model
# below is fitted on the unscaled (imputed) features.
from sklearn.preprocessing import RobustScaler

num_cols = [name for name in train.columns if name != "Target"]
scaler = RobustScaler()

train_scaled = train.copy()
train_scaled[num_cols] = scaler.fit_transform(train[num_cols])

# ---- Fill missing values with the training medians (train first, then test) ----
train = train.fillna(train.median())
test = test.fillna(train.median())

# ---- Remove the features flagged as highly correlated, in both splits ----
train = train.drop(columns=['V15', 'V14'])
test = test.drop(columns=['V15', 'V14'])

# ---- Features / target ----
X_train, y_train = train.drop(columns=["Target"]), train["Target"]
X_test, y_test = test.drop(columns=["Target"]), test["Target"]

from imblearn.under_sampling import RandomUnderSampler
from sklearn.ensemble import RandomForestClassifier

# ---- Random undersampling of the training data ----
# sampling_strategy=1 presumably requests a 1:1 class ratio; confirm against imblearn docs.
rus = RandomUnderSampler(sampling_strategy=1, random_state=1)
X_train_un, y_train_un = rus.fit_resample(X_train, y_train)

# ---- Random forest fitted on the undersampled training set ----
best_rf_model_un = RandomForestClassifier(
    n_estimators=250,
    max_features=0.1,
    max_samples=0.6,
    min_samples_leaf=2,
    class_weight="balanced",
    random_state=1,
)

best_rf_model_un.fit(X_train_un, y_train_un)

from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# defining a function to compute different metrics to check performance of a classification model built using sklearn
def model_performance_classification_sklearn(model, predictors, target):
    """
    Summarise a classifier's performance as a one-row DataFrame.

    model: classifier exposing ``predict``
    predictors: independent variables handed to ``model.predict``
    target: dependent variable (true labels) the predictions are scored against

    Returns a DataFrame with index [0] and the columns
    Accuracy, Recall, Precision and F1.
    """
    predicted = model.predict(predictors)

    # Column name -> sklearn scorer, in the order the columns should appear.
    scorers = {
        "Accuracy": accuracy_score,
        "Recall": recall_score,
        "Precision": precision_score,
        "F1": f1_score,
    }
    scores = {label: scorer(target, predicted) for label, scorer in scorers.items()}

    return pd.DataFrame(scores, index=[0])

# Score the undersampled random forest on the test split and report the metrics
best_rf_performance = model_performance_classification_sklearn(
    model=best_rf_model_un,
    predictors=X_test,
    target=y_test,
)
print("Final Model Performance on Test Data:\n", best_rf_performance)

# Persist the fitted model to disk
import joblib

joblib.dump(best_rf_model_un, "final_rf_model.pkl")

In [2]:
import pandas as pd
import joblib
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import RobustScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from imblearn.pipeline import Pipeline as ImbPipeline  # For imbalanced data handling
from imblearn.under_sampling import RandomUnderSampler
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score

# ---- Data ----
train, test = pd.read_csv("Train.csv"), pd.read_csv("Test.csv")

# Split each frame into its label vector and its feature matrix
y_train, y_test = train["Target"], test["Target"]
X_train = train.drop(columns=["Target"])
X_test = test.drop(columns=["Target"])

# Every non-target column is treated as numerical
num_cols = X_train.columns.tolist()

# ---- Preprocessing: median imputation, then robust scaling ----
preprocessor = ColumnTransformer(
    transformers=[
        (
            "num",
            Pipeline(steps=[
                ("imputer", SimpleImputer(strategy="median")),
                ("scaler", RobustScaler()),
            ]),
            num_cols,
        ),
    ]
)

# ---- Full pipeline: preprocess -> (no-op) -> undersample -> random forest ----
# NOTE(review): "feature_selection" is a passthrough placeholder. Unlike the
# earlier cell, V15/V14 are not dropped here, so this pipeline is fitted on
# every feature column.
final_pipeline = ImbPipeline(
    steps=[
        ("preprocessing", preprocessor),
        ("feature_selection", "passthrough"),
        ("undersampling", RandomUnderSampler(sampling_strategy=1, random_state=1)),
        (
            "classifier",
            RandomForestClassifier(
                n_estimators=250,
                max_features=0.1,
                max_samples=0.6,
                min_samples_leaf=2,
                class_weight="balanced",
                random_state=1,
            ),
        ),
    ]
)

# Fit on the training split, then predict the test split
final_pipeline.fit(X_train, y_train)
y_pred = final_pipeline.predict(X_test)

# Define a function to evaluate model performance
def evaluate_model(model, X, y):
    """
    Score ``model`` on ``(X, y)`` and return the metrics as a one-row DataFrame.

    model: fitted estimator or pipeline exposing ``predict``
    X: features passed to ``model.predict``
    y: true labels the predictions are compared with

    Returns a DataFrame with the columns Accuracy, Recall, Precision and F1.
    """
    y_hat = model.predict(X)

    # (column name, scorer) pairs, in output-column order.
    metric_table = (
        ("Accuracy", accuracy_score),
        ("Recall", recall_score),
        ("Precision", precision_score),
        ("F1", f1_score),
    )
    return pd.DataFrame({name: [score(y, y_hat)] for name, score in metric_table})

# Score the fitted pipeline on the test split and report the metrics
final_performance = evaluate_model(model=final_pipeline, X=X_test, y=y_test)
print("Final Model Performance:\n", final_performance)

# Persist the whole fitted pipeline (preprocessing, sampler and classifier)
joblib.dump(final_pipeline, "final_rf_pipeline.pkl")


Final Model Performance:
    Accuracy    Recall  Precision        F1
0    0.9458  0.875887   0.511387  0.645752


['final_rf_pipeline.pkl']