#DATA AUGMENTATION

In [6]:

import numpy as np
import pandas as pd

np.random.seed(42)

n=500

df = pd.DataFrame({
    "Age": np.random.randint(18, 60, size=n),
    "CGPA": np.round(np.random.uniform(5.0, 10.0, size=n), 2),
    "Interenship_Counts": np.random.randint(1, 12, size=n),
    "Technical_Score": np.random.randint(1, 11, size=n),
    "Attendance_Percentage": np.random.randint(50, 101, size=n),
})

#Target variable (imbalanced)
df['Placed'] = np.where(
    (df["CGPA"] >= 7.5) & (df["Technical_Score"] >= 70),"Yes", 'No'
    )

In [8]:
df

Unnamed: 0,Age,CGPA,Interenship_Counts,Technical_Score,Attendance_Percentage,Placed
0,56,9.29,7,6,65,No
1,46,7.14,2,5,61,No
2,32,8.75,7,5,87,No
3,25,8.77,10,2,83,No
4,38,5.52,7,8,63,No
...,...,...,...,...,...,...
495,56,9.28,4,2,96,No
496,49,9.15,9,8,63,No
497,41,6.99,6,7,60,No
498,40,8.34,9,1,88,No


In [9]:
df.shape

(500, 6)

In [10]:
#Bootstrap Resampling[+200

bootstrap= df.sample(n=200, replace=True, random_state=42)
df_aug=pd.concat([df, bootstrap], ignore_index=True)

df_aug

Unnamed: 0,Age,CGPA,Interenship_Counts,Technical_Score,Attendance_Percentage,Placed
0,56,9.29,7,6,65,No
1,46,7.14,2,5,61,No
2,32,8.75,7,5,87,No
3,25,8.77,10,2,83,No
4,38,5.52,7,8,63,No
...,...,...,...,...,...,...
695,59,8.10,3,3,69,No
696,36,6.73,4,10,50,No
697,49,6.38,2,4,94,No
698,42,6.16,7,6,94,No


In [11]:
noise=df.sample(n=150, random_state=42).copy()

noise['CGPA'] = noise['CGPA'] + np.random.normal(0,0.1,150)
noise['Technical_Score'] += np.random.randint(-3,4,150) 
noise['Attendance_Percentage'] += np.random.randint(0,2,150)

# Logical constraints
noise['CGPA'] = noise['CGPA'].clip(6.0,9.5)
noise['Technical_Score'] = noise['Technical_Score'].clip(50,100)
noise['Attendance_Percentage'] = noise['Attendance_Percentage'].clip(60,100)

df_aug = pd.concat([df_aug, noise], ignore_index=True)

In [12]:
df

Unnamed: 0,Age,CGPA,Interenship_Counts,Technical_Score,Attendance_Percentage,Placed
0,56,9.29,7,6,65,No
1,46,7.14,2,5,61,No
2,32,8.75,7,5,87,No
3,25,8.77,10,2,83,No
4,38,5.52,7,8,63,No
...,...,...,...,...,...,...
495,56,9.28,4,2,96,No
496,49,9.15,9,8,63,No
497,41,6.99,6,7,60,No
498,40,8.34,9,1,88,No


In [19]:
#SMOTE
from imblearn.over_sampling import SMOTE

ImportError: cannot import name '_MissingValues' from 'sklearn.utils._param_validation' (c:\Users\DELL\anaconda3\Lib\site-packages\sklearn\utils\_param_validation.py)

In [16]:
x= df_aug.drop('Placed', axis=1)
y= df_aug['Placed']

x

Unnamed: 0,Age,CGPA,Interenship_Counts,Technical_Score,Attendance_Percentage
0,56,9.290000,7,6,65
1,46,7.140000,2,5,61
2,32,8.750000,7,5,87
3,25,8.770000,10,2,83
4,38,5.520000,7,8,63
...,...,...,...,...,...
845,42,8.046156,11,50,77
846,29,8.700163,5,50,60
847,24,9.194105,1,50,69
848,57,6.844920,4,50,60
