In [None]:
# data augmentation - numerical values

from sklearn.datasets import load_iris #importing inbuilt dataset
from sklearn.utils import shuffle # importing shuffle to shuffle the dataset so there exist no bias
import numpy as np  # for array

# Load the built-in Iris dataset and split it into the feature matrix and the
# class-label vector.
iris = load_iris()
X = iris.data
y = iris.target

# Shuffle the rows so their order carries no bias; features and labels are
# permuted together, so every row keeps its own label.
# Any fixed integer random_state makes the shuffle reproducible: the same seed
# always yields the same order (42 is an arbitrary choice, 0 would be just as
# deterministic). Only random_state=None gives a different order on each run.
X, y = shuffle(X, y, random_state=42)

def augment_data(X, y, noise_level=0.1, num_augmented_samples=100, random_state=None):
    """Augment a numeric dataset with Gaussian-noised copies of itself.

    Every "augmented sample" is a full copy of ``X`` with independent
    zero-mean Gaussian noise added to each value. The labels are not
    regenerated: each noisy copy simply reuses ``y``.

    Parameters
    ----------
    X : array-like of shape (n_samples, n_features)
        Numeric feature matrix.
    y : array-like of shape (n_samples,)
        Labels, aligned row-by-row with ``X``.
    noise_level : float, default=0.1
        Standard deviation of the added noise. This is an absolute value in
        the units of the features, not a percentage of each feature's scale.
    num_augmented_samples : int, default=100
        Number of noisy copies of the whole dataset to generate. Zero or a
        negative value generates no copies.
    random_state : None, int, numpy.random.Generator or numpy.random.RandomState
        Source of randomness. ``None`` (default) draws from NumPy's global
        ``np.random`` state, exactly as before, so ``np.random.seed`` still
        controls the result. An int seeds a fresh ``np.random.default_rng``;
        an existing generator instance is used as-is.

    Returns
    -------
    augmented_X : ndarray of shape (n_samples * (n_copies + 1), n_features)
        The original rows followed by the noisy copies, copy after copy.
    augmented_y : ndarray of shape (n_samples * (n_copies + 1),)
        The original labels followed by one repetition of ``y`` per copy.

    Raises
    ------
    ValueError
        If ``X`` is not 2-D, or ``y`` is not 1-D with one entry per row of ``X``.
    """
    X = np.asarray(X)
    y = np.asarray(y)
    if X.ndim != 2:
        raise ValueError(f"X must be 2-D (n_samples, n_features), got shape {X.shape}")
    if y.ndim != 1 or y.shape[0] != X.shape[0]:
        raise ValueError(
            f"y must be 1-D with one label per row of X, got X{X.shape} and y{y.shape}"
        )

    # range() of a negative count was simply empty before; keep that behaviour.
    n_copies = max(num_augmented_samples, 0)

    if random_state is None:
        rng = np.random  # module-level global state (honours np.random.seed)
    elif isinstance(random_state, (np.random.Generator, np.random.RandomState)):
        rng = random_state
    else:
        rng = np.random.default_rng(random_state)

    # One draw of shape (n_copies, n_samples, n_features) replaces the Python
    # loop; loc is the mean and scale the standard deviation of the noise.
    noise = rng.normal(loc=0.0, scale=noise_level, size=(n_copies,) + X.shape)

    # Broadcast X against every noise slab, then flatten the copy axis so the
    # result is still a 2-D (rows, n_features) matrix.
    noisy_X = (X[np.newaxis, :, :] + noise).reshape(n_copies * X.shape[0], X.shape[1])

    # np.tile keeps y's dtype even when n_copies == 0, so integer labels are
    # not silently converted to float.
    repeated_y = np.tile(y, n_copies)

    augmented_X = np.vstack((X, noisy_X))       # original rows first, then the noisy copies
    augmented_y = np.hstack((y, repeated_y))    # labels in the matching order
    return augmented_X, augmented_y


# Build the augmented dataset: the original rows plus 100 noisy copies of the
# whole dataset (noise standard deviation 0.1).
augmented_X, augmented_y = augment_data(
    X,
    y,
    noise_level=0.1,
    num_augmented_samples=100,
)

# Compare the sizes before and after augmentation.
print("Original dataset shape:", X.shape, y.shape)
print("Augmented dataset shape:", augmented_X.shape, augmented_y.shape)


Original dataset shape: (150, 4) (150,)
Augmented dataset shape: (15150, 4) (15150,)


In [None]:
# Preview the augmented feature matrix. NumPy abbreviates the middle rows of a
# large array with "..." when printing, so only the first and last rows appear.
print(augmented_X)

[[6.1        2.8        4.7        1.2       ]
 [5.7        3.8        1.7        0.3       ]
 [7.7        2.6        6.9        2.3       ]
 ...
 [5.81559078 3.94052537 1.06547414 0.22574002]
 [5.7795945  2.503208   4.20609371 1.32015378]
 [7.18410471 3.07714388 5.90262915 2.04645939]]


In [None]:
import pandas as pd

# Wrap the augmented features in a DataFrame labelled with the Iris feature
# names, and attach the labels as an extra 'target' column.
augmented_df = (
    pd.DataFrame(data=augmented_X, columns=iris.feature_names)
    .assign(target=augmented_y)
)

# Show the first five rows as a quick sanity check.
print(augmented_df.head(5))


   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                6.1               2.8                4.7               1.2   
1                5.7               3.8                1.7               0.3   
2                7.7               2.6                6.9               2.3   
3                6.0               2.9                4.5               1.5   
4                6.8               2.8                4.8               1.4   

   target  
0       1  
1       0  
2       2  
3       1  
4       1  
