In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import pickle
import datacompy
import os, sys
import numpy as np

# narzedzia
from sklearn.model_selection import (
    train_test_split,
    cross_val_score,
    cross_val_predict,
    learning_curve,
    RepeatedStratifiedKFold,
    GridSearchCV,
)
from collections import Counter
from sklearn.metrics import classification_report, confusion_matrix, silhouette_score, accuracy_score
from sklearn import tree
from sklearn.tree import plot_tree
from sklearn.preprocessing import StandardScaler, OneHotEncoder, LabelEncoder
from scipy import stats
from joblib import dump, load

# modele
from xgboost import XGBClassifier, XGBRFClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# methods
from imblearn.under_sampling import ClusterCentroids, NearMiss
from scipy.optimize import differential_evolution
from sklearn.cluster import KMeans
from sklearn.ensemble import IsolationForest
from sklearn.neighbors import LocalOutlierFactor
from sklearn.inspection import permutation_importance

from imblearn.over_sampling import SMOTE, BorderlineSMOTE
from ctgan import CTGAN

### Read data

In [None]:
#oversampling data
original_data = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\original_data.csv")    
original_data = original_data.drop(columns=["Unnamed: 0"])
smote_data = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\smote_data.csv")
GAN_data = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\GAN_data.csv")
borderline_data = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\borderline_data.csv")
smote2_data = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\smote3_data.csv")
GAN2_data = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\GAN3_data.csv")    
borderline2_data = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\borderline3_data.csv")

# test data
X_test = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\new\\test\\X_test.csv")
y_test = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\new\\test\\y_test.csv")

""" #undersampling prepare data //mixed data (gan, brdsmote, smote)
cc_data = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\clustercentroids_data.csv")
if_data = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\isolationforest_data.csv")
nm_data = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\nearmiss_data.csv")
median_data = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\median_data.csv")
lof_data = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\localoutlierfactor_data.csv") """


' #undersampling prepare data //mixed data (gan, brdsmote, smote)\ncc_data = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\new\\clustercentroids_data.csv")\nif_data = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\new\\isolationforest_data.csv")\nnm_data = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\new\\nearmiss_data.csv")\nmedian_data = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\new\\median_data.csv")\nlof_data = pd.read_csv("D:\\ml\\undersampling_data\\data\\ssh\\new\\localoutlierfactor_data.csv") '

In [27]:
print(original_data["target"].value_counts())
count1=original_data["target"].value_counts().sum()
print(count1)
count2=abs((original_data['target']==0).sum() - (original_data['target']==1).sum())
print(f"Before undersampling: {count2}")
print(f"After number of samples: {count1+count2}")

target
0    151
1     46
Name: count, dtype: int64
197
Before undersampling: 105
After number of samples: 302


### Preprocessing Data

In [3]:
#mixed data
mix_data = pd.concat([GAN_data, smote_data, borderline_data, original_data], axis=0, ignore_index=True)         
mix_data = mix_data.reset_index(drop=True)

#data with one oversampling method and original data e.g.(smote+original)
smote_data = pd.concat([smote_data, smote2_data, original_data], axis=0, ignore_index=True)
smote_data = smote_data.reset_index(drop=True)
borderline_data = pd.concat([borderline_data, borderline2_data, original_data], axis=0, ignore_index=True)
borderline_data = borderline_data.reset_index(drop=True)
GAN_data = pd.concat([GAN_data, GAN2_data, original_data], axis=0, ignore_index=True)
GAN_data = GAN_data.reset_index(drop=True)

#### Split data

In [10]:
X_mix, y_mix = mix_data.drop(columns=["target", "source"]), mix_data["target"]
X_smote, y_smote = smote_data.drop(columns=["target", "source"]), smote_data["target"]
X_GAN, y_GAN = GAN_data.drop(columns=["target", "source"]), GAN_data["target"]
X_borderline, y_borderline = borderline_data.drop(columns=["target", "source"]), borderline_data["target"]

#### Dictionary

In [14]:
data = {}
data["mix"] = (X_mix, y_mix)
data["smote"] = (X_smote, y_smote)
data["GAN"] = (X_GAN, y_GAN)
data["borderline"] = (X_borderline, y_borderline)

compare = {}
compare["data"] = mix_data
compare["smote"] = smote_data
compare["GAN"] = GAN_data
compare["borderline"] = borderline_data

### NearMiss version1

In [None]:
NM = NearMiss(version=1)

for (name, (X_train, y_train)), (_, compare_df) in zip(data.items(), compare.items()):
    
    X_NM, y_NM = NM.fit_resample(X_train, y_train)
    
    #concat resampled data
    nearmiss_data = pd.concat([X_NM, y_NM], axis=1)
    
    NM_data_nosource = compare_df.drop(columns=["source"])
    nearmiss_data_nosource = nearmiss_data

    for index, row in nearmiss_data_nosource.iterrows():
        match = NM_data_nosource.eq(row).all(axis=1)  # Sprawdza, gdzie wiersze są identyczne
        if match.any():  # Jeśli znaleziono dopasowanie
            matched_index = match.idxmax()  # Pobiera pierwszy pasujący indeks
            nearmiss_data.loc[index, "source"] = compare_df.loc[matched_index, "source"]
            
    nearmiss_data.to_csv(f"D:\\ml\\undersampling_data\\data\\ssh\\reduced\\{name}_NM_data.csv")
    
    print(f"Data reduced for {name} data")
    print(nearmiss_data["target"].value_counts())
    

target
0    151
1    151
Name: count, dtype: int64
target
0    151
1    151
Name: count, dtype: int64
target
0    151
1    151
Name: count, dtype: int64
target
0    151
1    151
Name: count, dtype: int64
