### Install and import library

In [1]:
!pip install git+https://github.com/DangLeUyen/OsImp.git

Collecting git+https://github.com/DangLeUyen/OsImp.git
  Cloning https://github.com/DangLeUyen/OsImp.git to /private/var/folders/42/h0csfrkn2fvfq63027xhz7d40000gn/T/pip-req-build-5gi9g7fg
  Running command git clone --filter=blob:none --quiet https://github.com/DangLeUyen/OsImp.git /private/var/folders/42/h0csfrkn2fvfq63027xhz7d40000gn/T/pip-req-build-5gi9g7fg
  Resolved https://github.com/DangLeUyen/OsImp.git to commit 4c0cc62aab4881dc34e570983217e191d26a64e1
  Preparing metadata (setup.py) ... [?25ldone

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.0[0m[39;49m -> [0m[32;49m24.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [2]:
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from imblearn.datasets import fetch_datasets
from OsImp import OsImp

from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings("ignore")

In [3]:
def generate_randomly_missing(X , missing_rate):
    """
    Creates a randomly missing mask for the input data.

    Args:
        data (np.ndarray): The input data.
        missing_rate (float): The ratio of missing values to create.

    Returns:
        np.ndarray: An array with the same shape as `data` where missing values are marked as NaN.
    """
    
    non_missing = [0]
    X_copy=np.copy(X)
    
    X_non_missing_col = X_copy[:, non_missing]
    X1_missing = X_copy[:, [i for i in range(X.shape[1]) if i not in non_missing]]

    X_non_missing_row = X1_missing[non_missing]
    X_missing = X1_missing[len(non_missing):(X.shape[0]+1)]
    XmShape = X_missing.shape
    na_id = np.random.randint(0, X_missing.size, round(missing_rate * X_missing.size))
    X_nan = X_missing.flatten()
    X_nan[na_id] = np.nan
    X_nan = X_nan.reshape(XmShape)

    X1_nan = np.vstack((X_non_missing_row, X_nan))
    X_nan = np.hstack((X_non_missing_col, X1_nan))
    
    return X_nan


### Load Ecoli dataset

In [4]:
data = fetch_datasets()['ecoli']
X, y = data.data, data.target
le = LabelEncoder()
y = le.fit_transform(y)
print(f"The shape of X: {np.shape(X)}")

The shape of X: (336, 7)


##### Split the ecoli dataset to the training data and the test data and introduce missing to the training data

In [5]:
# Split the dataset into 2 subsets 
Xtrain, Xtest, ytrain, ytest = train_test_split(X,y.reshape((-1,1)),test_size=0.4)

G, class_counts = np.unique(ytrain, return_counts=True)  
print(f"The shape of Xtrain: {np.shape(Xtrain)}")
print(f"the number of labels: {len(G)} and the number of each class: {class_counts} " )

# Introduce missingness into original training data with missing rate is 0.5
Xtrain = generate_randomly_missing(Xtrain, 0.5)    

# Standarize Xtrain and Xtest
scaler = StandardScaler()
scaler.fit(Xtrain)
Xtrain = scaler.transform(Xtrain)
Xtest = scaler.transform(Xtest)

The shape of Xtrain: (201, 7)
the number of labels: 2 and the number of each class: [187  14] 


### Apply OsImp to solve imbalanced and incomplete problems

In [6]:
# Here, we choose MICE algorithm to impute missing data
mice_imputer = IterativeImputer().fit_transform

# Initiate OsImp algorithm with ratio is 0.9
osimputer = OsImp(mice_imputer, R=0.9)

# Applying OsImp to the imbalanced missing dataset
Xnew, ynew = osimputer.os_and_impute(Xtrain, ytrain)

G, class_counts = np.unique(ynew, return_counts=True)  
print(f"The shape of Xnew: {np.shape(Xnew)}")
print(f"the number of labels: {len(G)} and the number of each class: {class_counts} " )

The shape of Xnew: (355, 7)
the number of labels: 2 and the number of each class: [187 168] 


### Using SVM to classify

In [7]:
from sklearn.svm import SVC
from imblearn.metrics import sensitivity_score
from sklearn.metrics import f1_score, precision_score

SVMclf = SVC(gamma='auto')
SVMclf.fit(Xnew, ynew)

ypred = SVMclf.predict(Xtest)

f1 = f1_score(ytest, ypred, average = 'weighted').round(3)
sensitivity = sensitivity_score(ytest, ypred, average='micro').round(3)

print(f"F1-Score: {f1}")
print(f"Sensitivity: {sensitivity}")

F1-Score: 0.885
Sensitivity: 0.874
