**Data Augmentation**

**WHAT**- 1. Process of artificially increasing the size and density of dataset

2. By Creating new data points from existing data w/o changing their meanings or labels

**WHY**-

1. Overcome limited dataset size
2. Reduce Overfitting.
3. Handles class imbalance
4. Improves Model Generation









Image data methods: Rotation,Flipping,Cropping and Adding Noise

In [6]:
import pandas as pd
import numpy as np
np.random.seed(41)
n=500

data = ({
    "CGPA": np.round(np.random.uniform(6.0,9.5,n),2),
    "Age": np.random.randint(18,24,n),
    "Technical_Score": np.random.randint(40,100,n),
    "Attendance_Percentage": np.random.randint(60,100,n),
    "Department" :np.random.choice(["CSE","ECE","A&R","ME","AIML"],size=n)
})
df=pd.DataFrame(data)
df

Unnamed: 0,CGPA,Age,Technical_Score,Attendance_Percentage,Department
0,6.88,18,76,60,ECE
1,6.16,22,79,75,CSE
2,8.37,20,76,85,AIML
3,6.15,23,91,81,AIML
4,6.41,23,46,86,A&R
...,...,...,...,...,...
495,7.28,19,66,95,ECE
496,6.06,21,70,72,A&R
497,6.52,21,61,69,A&R
498,8.68,23,63,87,A&R


In [7]:
#Target Variable (imbalanced)
df["placed"] = np.where(
  (df["CGPA"]>7.5) & (df["Technical_Score"]>70),"Yes","No"
)
df

Unnamed: 0,CGPA,Age,Technical_Score,Attendance_Percentage,Department,placed
0,6.88,18,76,60,ECE,No
1,6.16,22,79,75,CSE,No
2,8.37,20,76,85,AIML,Yes
3,6.15,23,91,81,AIML,No
4,6.41,23,46,86,A&R,No
...,...,...,...,...,...,...
495,7.28,19,66,95,ECE,No
496,6.06,21,70,72,A&R,No
497,6.52,21,61,69,A&R,No
498,8.68,23,63,87,A&R,No


In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 500 entries, 0 to 499
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   CGPA                   500 non-null    float64
 1   Age                    500 non-null    int64  
 2   Technical_Score        500 non-null    int64  
 3   Attendance_Percentage  500 non-null    int64  
 4   Department             500 non-null    object 
 5   placed                 500 non-null    object 
dtypes: float64(1), int64(3), object(2)
memory usage: 23.6+ KB


In [9]:
df.shape

(500, 6)

In [10]:
#Bootstrap Resampling [+200]
df.sample()   #gives few rows randomly

Unnamed: 0,CGPA,Age,Technical_Score,Attendance_Percentage,Department,placed
323,6.27,19,81,87,ECE,No


In [11]:
bootstrap= df.sample(n=200, replace=True,random_state=41)
df_aug= pd.concat([df,bootstrap],ignore_index=True)
df_aug


Unnamed: 0,CGPA,Age,Technical_Score,Attendance_Percentage,Department,placed
0,6.88,18,76,60,ECE,No
1,6.16,22,79,75,CSE,No
2,8.37,20,76,85,AIML,Yes
3,6.15,23,91,81,AIML,No
4,6.41,23,46,86,A&R,No
...,...,...,...,...,...,...
695,7.74,23,64,61,CSE,No
696,6.25,20,56,79,A&R,No
697,7.41,23,98,73,CSE,No
698,8.42,21,43,72,ME,No


In [12]:
#Adding Noise Entries
noise=df_aug.sample(n=150,random_state=41).copy()
noise["CGPA"]+=np.random.normal(0,0.1,150)
noise ["Technical_Score"] +=np.random.normal(-3,4,150)
noise["Attendance_Percentage"] += np.random.normal(0,2,150)
df_aug = pd.concat([df_aug,noise],ignore_index=True)
df_aug


Unnamed: 0,CGPA,Age,Technical_Score,Attendance_Percentage,Department,placed
0,6.880000,18,76.000000,60.000000,ECE,No
1,6.160000,22,79.000000,75.000000,CSE,No
2,8.370000,20,76.000000,85.000000,AIML,Yes
3,6.150000,23,91.000000,81.000000,AIML,No
4,6.410000,23,46.000000,86.000000,A&R,No
...,...,...,...,...,...,...
845,9.439085,21,72.157989,82.669363,ME,Yes
846,8.829188,20,31.875898,81.614278,ME,No
847,8.355753,20,38.697815,58.755672,AIML,No
848,8.226253,22,62.474056,80.256341,ECE,No


In [13]:
#Adding Noise /Logical Constraints Entries
noise["CGPA"] = noise["CGPA"].clip(6.0,9.5)
noise["Technical_Score"] = noise["Technical_Score"].clip(50,100)
noise["Attendance_Percentage"] = noise["Attendance_Percentage"].clip(60,100)
df_aug= pd.concat([df_aug,noise],ignore_index=True)
df_aug

Unnamed: 0,CGPA,Age,Technical_Score,Attendance_Percentage,Department,placed
0,6.880000,18,76.000000,60.000000,ECE,No
1,6.160000,22,79.000000,75.000000,CSE,No
2,8.370000,20,76.000000,85.000000,AIML,Yes
3,6.150000,23,91.000000,81.000000,AIML,No
4,6.410000,23,46.000000,86.000000,A&R,No
...,...,...,...,...,...,...
995,9.439085,21,72.157989,82.669363,ME,Yes
996,8.829188,20,50.000000,81.614278,ME,No
997,8.355753,20,50.000000,60.000000,AIML,No
998,8.226253,22,62.474056,80.256341,ECE,No


In [14]:
#SMOTE Method
from imblearn.over_sampling import SMOTE

df_aug

Unnamed: 0,CGPA,Age,Technical_Score,Attendance_Percentage,Department,placed
0,6.880000,18,76.000000,60.000000,ECE,No
1,6.160000,22,79.000000,75.000000,CSE,No
2,8.370000,20,76.000000,85.000000,AIML,Yes
3,6.150000,23,91.000000,81.000000,AIML,No
4,6.410000,23,46.000000,86.000000,A&R,No
...,...,...,...,...,...,...
995,9.439085,21,72.157989,82.669363,ME,Yes
996,8.829188,20,50.000000,81.614278,ME,No
997,8.355753,20,50.000000,60.000000,AIML,No
998,8.226253,22,62.474056,80.256341,ECE,No


In [16]:
x = df_aug.drop["Placed", axis =1]
y = df_aug ["Placed"]
x
df_aug

SyntaxError: invalid syntax. Maybe you meant '==' or ':=' instead of '='? (ipython-input-700042825.py, line 1)

In [None]:
SMOTE(sampling_strategy=0.8,random_state=41)