# Script: GMM GENERATOR
<h4><span style="color:blue">Juan José Hoyos Urcué</span></h4>

### Load Python Libraries and Dataset

In [1]:
import numpy as np
import pandas as pd
import sklearn as sk
import seaborn as sns
import matplotlib.pyplot as plt
import random
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import MinMaxScaler
from sklearn import mixture

file_name = "../../1-without_data_augmentation/preprocesado.xlsx"

# Parse the workbook once. `original` is an independent (deep) copy that is kept
# untouched so synthetic rows can later be compared against the source data.
df = pd.read_excel(file_name)  # dataframe to process
original = df.copy()  # keep original dataset to avoid mixed data

### Initial Target variable Distribution

In [2]:
# Class balance of the target before augmentation (0 = cure, 1 = fail).
target_counts = df["cure_or_fail"].value_counts()
print(target_counts)

0    189
1     58
Name: cure_or_fail, dtype: int64


### GMM Data Augmentation

In [3]:
# Fit a Gaussian mixture on the feature columns and draw synthetic rows from it.
k = 3  # NOTE(review): not referenced by the model below, which uses n_components=2 — confirm which is intended

# Features are every column except the target; the target is kept separately.
X = df.drop(columns="cure_or_fail")
y = df["cure_or_fail"]

model = mixture.GaussianMixture(
    n_components=2,
    covariance_type="full",
    random_state=42,
)
model.fit(X)

# sample() returns a pair: the generated feature rows and, per the scikit-learn
# documentation, the index of the mixture component each row was drawn from.
# NOTE(review): that second array is used downstream as the cure_or_fail label,
# although the mixture was fitted without the target — confirm this is intended.
ll = model.sample(2 * 378)
X_res = ll[0]
y_res = ll[1]

In [4]:
# Data and labels concatenation
#
# X_res holds the sampled feature columns in the same order as the features of
# `df` (every column except the target), and y_res holds the sampled labels.
# Each column is named explicitly instead of assigning `df.columns` positionally
# and swapping two columns afterwards: the positional approach is only correct
# when "cure_or_fail" is the second-to-last column and "ulcer_area" the last.
feature_cols = [col for col in df.columns if col != "cure_or_fail"]

dataAug = pd.DataFrame(X_res, columns=feature_cols)
dataAug["cure_or_fail"] = y_res

# Restore the column order of the original dataset so rows of both frames can
# be compared position by position.
dataAug = dataAug[list(df.columns)]

In [5]:
# Class balance of the sampled rows; left as the last expression so the cell displays it.
dataAug.loc[:, "cure_or_fail"].value_counts()

1    562
0    194
Name: cure_or_fail, dtype: int64

### Get New Instances from the Dataset

In [6]:
# Keep only the synthetic rows that do not exactly duplicate a row of the
# original dataset. The original rows are hashed once as tuples, so each
# membership check is a set lookup instead of a rescan of the whole original
# frame for every synthetic row.
original_rows = {tuple(row2) for _, row2 in original.iterrows()}

new_samples = [
    list(row)
    for _, row in dataAug.iterrows()
    if tuple(row) not in original_rows  # rows are compared positionally, value by value
]

### Build a DataFrame with those new instances

In [7]:
# Assemble the retained rows into a frame that reuses the augmented frame's column labels.
fake = pd.DataFrame(data=new_samples, columns=dataAug.columns)

### Synthetic Target Variable Distribution

In [8]:
# Class balance of the retained synthetic rows (0 = cure, 1 = fail).
fake_target_counts = fake["cure_or_fail"].value_counts()
print(fake_target_counts)

1.0    562
0.0    194
Name: cure_or_fail, dtype: int64


### Save Synthetic Dataset

In [9]:
# Write the synthetic dataset to disk without the row index column.
fake.to_excel(excel_writer="fake_gmm.xlsx", index=False)