Data Augmentation

In [2]:
import numpy as np
import pandas as pd
np.random.seed(42)
n = 500
df = pd.DataFrame({
    "Age": np.random.randint(20, 26, n),
    "CGPA": np.random.uniform(6.0, 9.5, n),
    "Internship_Count": np.random.randint(0, 4, n),
    "technical_Score": np.random.uniform(50, 100, n),
    "Attendance_Percentage": np.random.randint(60, 100, n)
})

In [3]:
df["Placed"] = np.where(
    (df["CGPA"] >= 7.5) & (df["technical_Score"] >= 70), "Yes", "No"
)



In [4]:
df.shape

(500, 6)

In [5]:
bootstrap = df.sample(n=200, replace=True, random_state=42)

In [6]:
df_aug = pd.concat([df, bootstrap], ignore_index=True)
df_aug.shape

(700, 6)

In [7]:
noise = df.sample(n=150, random_state=42).copy()
noise['CGPA'] += np.random.normal(0, 0.1, 150)
noise['technical_Score'] += np.random.randint(-3, 4, 150)
noise["Attendance_Percentage"] += np.random.randint(0, 2, 150)

#Logical Comstraints
noise["CFPA"] = noise["CGPA"].clip(6.0, 9.5)
noise["technical_Score"] = noise["technical_Score"].clip(50, 100)
noise["Attendance_Percentage"] = noise["Attendance_Percentage"].clip(60, 100)
df_aug = pd.concat([df_aug, noise], ignore_index=True)
df_aug

Unnamed: 0,Age,CGPA,Internship_Count,technical_Score,Attendance_Percentage,Placed,CFPA
0,23,6.606207,2,80.386840,78,No,
1,24,7.518481,0,75.634426,63,Yes,
2,22,7.394767,3,61.533491,85,No,
3,24,8.155475,0,58.826402,97,No,
4,24,8.222828,0,61.024310,91,No,
...,...,...,...,...,...,...,...
845,20,7.794396,3,60.841390,61,No,7.794396
846,21,7.233455,1,77.004310,80,No,7.233455
847,20,7.345824,0,69.615914,80,No,7.345824
848,21,6.263997,3,87.262883,82,No,6.263997


In [8]:
df_aug

Unnamed: 0,Age,CGPA,Internship_Count,technical_Score,Attendance_Percentage,Placed,CFPA
0,23,6.606207,2,80.386840,78,No,
1,24,7.518481,0,75.634426,63,Yes,
2,22,7.394767,3,61.533491,85,No,
3,24,8.155475,0,58.826402,97,No,
4,24,8.222828,0,61.024310,91,No,
...,...,...,...,...,...,...,...
845,20,7.794396,3,60.841390,61,No,7.794396
846,21,7.233455,1,77.004310,80,No,7.233455
847,20,7.345824,0,69.615914,80,No,7.345824
848,21,6.263997,3,87.262883,82,No,6.263997


In [9]:
pip install imbalanced-learn

Note: you may need to restart the kernel to use updated packages.


In [11]:
from imblearn.over_sampling import SMOTE
x = df_aug.drop('Placed', axis=1)   
y = df_aug['Placed']

smote = SMOTE(sampling_strategy=0.8, random_state=42)
smote

SMOTE(random_state=42, sampling_strategy=0.8)