In [1]:
# Import custom libraries
from tools.DataLoader import DataLoader
from tools.SLModelEvaluator import SLModelEvaluator
from tools.SLModelTrainer import SLModelTrainer
from transformers.DropHighNaNColumnsTransformer import DropHighNaNColumnsTransformer
from tools.EDA import EDA
from transformers.DropNaNTransformer import DropNaNTransformer

# Import necessary libraries
from sklearn.pipeline import Pipeline
import numpy as np
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.model_selection import train_test_split

# Import preprocessing and encoding utilities
from sklearn.discriminant_analysis import StandardScaler
from sklearn.impute import SimpleImputer
from category_encoders import TargetEncoder
from sklearn.preprocessing import PowerTransformer
from category_encoders import OneHotEncoder

from imblearn.pipeline import Pipeline as ImbPipeline
from imblearn.over_sampling import SMOTE
from sklearn.decomposition import PCA

# Import Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression

In [2]:
# Step 1: Read the project dataset from the local CSV file via DataLoader
DATA_PATH = "db/final_proj_data.csv"

data_loader = DataLoader(request_type="local", path=DATA_PATH)
df = data_loader.load_data()

In [3]:
# Step 2: Perform EDA using the EDA class
# The EDA helper is built around the loaded frame; perform_full_eda() emits the
# report shown in this cell's output (dataset info, shape, descriptive
# statistics, missing-value counts and sample rows).
eda = EDA(df)
eda.perform_full_eda()

Dataset Information:

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Columns: 231 entries, Var1 to y
dtypes: float64(191), int64(2), object(38)
memory usage: 17.6+ MB


None

Unnamed: 0,Column,Has_Nulls,Dtype
Var1,Var1,True,float64
Var2,Var2,True,float64
Var3,Var3,True,float64
Var4,Var4,True,float64
Var5,Var5,True,float64
...,...,...,...
Var227,Var227,False,object
Var228,Var228,False,object
Var229,Var229,True,object
Var230,Var230,True,float64



Dataset Shape:

(10000, 231)

Descriptive Statistics:



Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Var1,133.0,14.977444,66.456008,0.0,0.000,0.00,16.000,680.0
Var2,266.0,0.000000,0.000000,0.0,0.000,0.00,0.000,0.0
Var3,266.0,341.052632,2810.606975,0.0,0.000,0.00,0.000,42588.0
Var4,280.0,0.096429,0.928243,0.0,0.000,0.00,0.000,9.0
Var5,241.0,233810.124481,553230.515446,0.0,0.000,0.00,117235.000,3024000.0
...,...,...,...,...,...,...,...,...
Var189,4206.0,272.455064,86.752531,6.0,204.000,270.00,330.000,642.0
Var190,43.0,25725.112326,37487.484852,0.0,1312.875,10853.82,37491.525,191167.2
Var209,0.0,,,,,,,
Var230,0.0,,,,,,,



Missing Values:



Var1       9867
Var2       9734
Var3       9734
Var4       9720
Var5       9759
          ...  
Var227        0
Var228        0
Var229     5561
Var230    10000
y             0
Length: 231, dtype: int64


Sample Data:



Unnamed: 0,Var1,Var2,Var3,Var4,Var5,Var6,Var7,Var8,Var9,Var10,...,Var222,Var223,Var224,Var225,Var226,Var227,Var228,Var229,Var230,y
0,,,,,,812.0,14.0,,,,...,catzS2D,jySVZNlOJy,,xG3x,Aoh3,ZI9m,ib5G6X1eUxUn6,mj86,,0
1,,,,,,2688.0,7.0,,,,...,i06ocsg,LM8l689qOp,,kG3k,WqMG,RAYp,55YFVY9,mj86,,0
2,,,,,,1015.0,14.0,,,,...,P6pu4Vl,LM8l689qOp,,kG3k,Aoh3,ZI9m,R4y5gQQWY8OodqDV,am7c,,0
3,,,,,,168.0,0.0,,,,...,BNrD3Yd,LM8l689qOp,,,FSa2,RAYp,F2FyR07IdsN7I,,,0
4,,,,,,14.0,0.0,,,,...,3B1QowC,LM8l689qOp,,,WqMG,RAYp,F2FyR07IdsN7I,,,0


In [4]:
# # Feature Engineering Pipeline


# fe_pipeline = Pipeline(
#     steps=[
#         # ("drop_high_nan", DropHighNaNColumnsTransformer(threshold=0.8)),
#         ("drop_nan", DropNaNTransformer()),
#     ]
# )

In [5]:
# Categorical branch for the RandomForest pipeline: fill missing entries with
# the most frequent value of each column, then one-hot encode.
cat_transformer_rf = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="most_frequent")),
        # OneHotEncoder is the category_encoders implementation (see the
        # imports cell). Its documented handle_unknown options are "error",
        # "return_nan", "value" and "indicator"; "ignore" is the scikit-learn
        # spelling and is not one of them. "value" is the matching policy:
        # categories not seen during fit are encoded as all zeros.
        ("encoder", OneHotEncoder(handle_unknown="value")),
    ]
)

# Numerical branch for the RandomForest pipeline: replace missing entries with
# the column mean.
num_transformer_rf = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
    ]
)

# Route columns to a branch by dtype: numeric columns go to the numerical
# branch, object/category columns to the categorical branch.
preprocessor_rf = ColumnTransformer(
    transformers=[
        ("num", num_transformer_rf, make_column_selector(dtype_include=np.number)),
        (
            "cat",
            cat_transformer_rf,
            make_column_selector(dtype_include=[object, "category"]),
        ),
    ],
    n_jobs=-1,  # run the branches in parallel on all available cores
    verbose_feature_names_out=False,  # no "num__" / "cat__" prefixes on output names
).set_output(transform="pandas")  # emit a DataFrame instead of a bare array

In [6]:
# # Define categorical transformer for GradientBoosting


# cat_transformer_gb = Pipeline(
#     steps=[
#         ("imputer", SimpleImputer(strategy="most_frequent")),
#         ("encoder", TargetEncoder(handle_unknown="ignore")),
#     ]
# )

# # Define numerical transformer for GradientBoosting
# num_transformer_gb = Pipeline(
#     steps=[
#         ("imputer", SimpleImputer(strategy="mean")),
#         ("scaler", StandardScaler()),
#         ("power", PowerTransformer(method="yeo-johnson")),
#     ]
# )

# # Define preprocessor for GradientBoosting
# preprocessor_gb = ColumnTransformer(
#     transformers=[
#         ("num", num_transformer_gb, make_column_selector(dtype_include=np.number)),
#         (
#             "cat",
#             cat_transformer_gb,
#             make_column_selector(dtype_include=[object, "category"]),
#         ),
#     ],
#     n_jobs=-1,
#     verbose_feature_names_out=False,
# ).set_output(transform="pandas")

# # Full preprocessing pipeline for GradientBoosting
# full_preprocessing_pipeline_gb = Pipeline(
#     steps=[
#         ("feature_engineering", feature_engineering_pipeline),
#         ("preprocessing", preprocessor_gb),
#         # ("smote", SMOTE(random_state=42)),  # Balance the data
#         ("pca", PCA(n_components=0.95)),  # Dimensionality reduction
#     ]
# )

In [7]:
# # Define categorical transformer for LogisticRegression
# cat_transformer = ColumnTransformer(
#     transformers=[
#         (
#             "cat_imputer",
#             SimpleImputer(strategy="most_frequent"),
#             make_column_selector(dtype_include=[object, "category"]),
#         ),
#         (
#             "cat_encoder",
#             OneHotEncoder(handle_unknown="ignore"),
#             make_column_selector(dtype_include=[object, "category"]),
#         ),
#     ]
# )

# # Define numerical transformer for LogisticRegression
# num_transformer = ColumnTransformer(
#     transformers=[
#         (
#             "num_imputer",
#             SimpleImputer(strategy="mean"),
#             make_column_selector(dtype_include=np.number),
#         ),
#         ("num_scaler", StandardScaler(), make_column_selector(dtype_include=np.number)),
#         (
#             "num_power",
#             PowerTransformer(method="yeo-johnson"),
#             make_column_selector(dtype_include=np.number),
#         ),
#     ]
# )

In [8]:
# # Create full preprocessing pipeline with feature engineering and preprocessor
# full_preprocessing_pipeline = Pipeline(
#     steps=[
#         ("feature_engineering", feature_engineering_pipeline),
#         ("preprocessing", preprocessor),
#         ("finalpreprocessing", "passthrough"),
#     ]
# )

In [9]:
# Separate the predictors from the target column "y".
X = df.drop(columns=["y"])
y = df["y"]

# Hold out 20% of the rows for the final evaluation, with a fixed seed so the
# split is reproducible. stratify=y keeps the class proportions of "y" the same
# in the training and test partitions; the modelling pipeline rebalances the
# classes with SMOTE, so the target is presumably imbalanced and an
# unstratified split could give the two partitions different class ratios.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

In [10]:
# full_pipeline_random_forest = ImbPipeline(
#     steps=[
#         ("full_preprocessing", full_preprocessing_pipeline_rf),
#         ("model", RandomForestClassifier(random_state=42)),
#     ]
# )

# # Create full pipeline with preprocessing and model for GradientBoosting
# full_pipeline_gradient_boosting = ImbPipeline(
#     steps=[
#         ("full_preprocessing", full_preprocessing_pipeline_gb),
#         ("model", GradientBoostingClassifier()),
#     ]
# )

# # Create full pipeline with preprocessing and model for LogisticRegression
# full_pipeline_logistic_regression = Pipeline(
#     steps=[
#         ("full_preprocessing", full_preprocessing_pipeline_lr),
#         ("model", LogisticRegression(max_iter=10000)),
#     ]
# )

In [11]:
# # Full preprocessing pipeline for LogisticRegression without nested Pipelines
# full_pipeline_logistic_regression = ImbPipeline(
#     steps=[
#         ("drop_high_nan", DropHighNaNColumnsTransformer(threshold=0.8)),
#         ("cat_transformer", cat_transformer),
#         ("num_transformer", num_transformer),
#         ("smote", SMOTE(random_state=42)),  # Balance the data
#         ("pca", PCA(n_components=0.95)),  # Dimensionality reduction
#         ("model", LogisticRegression(max_iter=1000)),
#     ]
# )

In [12]:
# End-to-end RandomForest pipeline. It is built with the imbalanced-learn
# Pipeline (ImbPipeline) because it contains a resampling step (SMOTE).
full_pipeline_random_forest = ImbPipeline(
    steps=[
        # Presumably drops columns whose NaN share exceeds the threshold —
        # confirm against DropHighNaNColumnsTransformer.
        ("drop_high_nan", DropHighNaNColumnsTransformer(threshold=0.8)),
        # Imputation plus one-hot encoding, routed by column dtype.
        ("preprocessing", preprocessor_rf),
        # SMOTE synthesises samples from nearest neighbours and PCA ranks
        # directions by variance, so both are sensitive to feature scale.
        # Standardise first so that large-magnitude numeric columns do not
        # dominate the neighbour search and the retained components.
        ("scaler", StandardScaler()),
        ("smote", SMOTE(random_state=42)),  # Balance the data
        # Dimensionality reduction: keep enough components to explain 95% of
        # the variance.
        ("pca", PCA(n_components=0.95)),
        ("model", RandomForestClassifier(random_state=42)),
    ]
)

In [13]:
# Hyper-parameter search spaces, one entry per model name.
# The "model__" prefix targets the pipeline step named "model".
random_forest_grid = dict(
    model__n_estimators=[100, 200, 300],
    model__max_depth=[None, 10, 20, 30],
    model__min_samples_split=[2, 5, 10],
    model__min_samples_leaf=[1, 2, 4],
    model__bootstrap=[True, False],
)

param_grids = {
    "random_forest": random_forest_grid,
    # "gradient_boosting": {
    #     "model__n_estimators": [100, 200, 300],
    #     "model__learning_rate": [0.01, 0.1, 0.2],
    #     "model__max_depth": [3, 4, 5],
    #     "model__min_samples_split": [2, 5, 10],
    #     "model__min_samples_leaf": [1, 2, 4],
    #     "model__subsample": [0.8, 0.9, 1.0],
    # },
    # "logistic_regression": {
    #     "model__C": [0.01, 0.1, 1, 10],
    #     "model__solver": ["liblinear", "saga"],
    #     "model__penalty": ["l1", "l2"],
    # },
}

In [14]:
# Registry of candidate pipelines; the keys match the keys of param_grids
model_pipelines = dict(
    random_forest=full_pipeline_random_forest,
    # gradient_boosting=full_pipeline_gradient_boosting,
    # logistic_regression=full_pipeline_logistic_regression,
)

In [15]:
# Instantiate SLModelTrainer and run the search over every registered pipeline
# on the training split, scoring by accuracy with 5-fold cross-validation.
model_trainer = SLModelTrainer()
training_results = model_trainer.train(
    X_train,
    y_train,
    model_pipelines,
    param_grids,
    scoring="accuracy",
    cv=5,
)
best_models, best_params, best_scores, best_model_name = training_results

In [None]:
# Look up the winning model by name among the trained candidates and hand it,
# together with the held-out test split, to the evaluator.
best_model = best_models[best_model_name]
model_evaluator = SLModelEvaluator(best_model, X_test, y_test)

In [None]:
# Visualize the pipeline of the best model.
# NOTE(review): this cell reads best_model_name / best_models from attributes
# on the trainer, while the neighbouring cells use the values returned by
# train() — confirm both refer to the same objects.
model_evaluator.visualize_pipeline(
    model_trainer.best_model_name, model_trainer.best_models
)

In [None]:
# Validate on the test data
# NOTE(review): the evaluator was already constructed with best_model, X_test
# and y_test; they are passed again here — confirm validate_on_test needs them.
model_evaluator.validate_on_test(X_test, y_test, best_model, best_model_name)

In [None]:
# Plot the ROC curve
model_evaluator.plot_roc_curve()

# Plot the confusion matrix
model_evaluator.plot_confusion_matrix()

# Plot feature importance
# NOTE(review): df still contains the target column "y" — confirm that
# feature_importance expects the full frame rather than the feature matrix.
model_evaluator.feature_importance(X_train, y_train, df)