In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

In [2]:
# Read the feature-engineered dataset (path is relative to the notebook's folder)
df = pd.read_csv(filepath_or_buffer="../data/feature_engineered_data.csv")

In [3]:

# Drop irrelevant columns.
# `df` is reassigned in place, so without errors="ignore" re-running this cell
# on a live kernel raises KeyError (the columns were removed on the first run).
df = df.drop(
    columns=['user_id', 'device_id', 'ip_address', 'signup_time', 'purchase_time'],
    errors="ignore",
)

# Separate features and target: `class` is the label, everything else is a feature.
X = df.drop(columns=['class'])
y = df['class']

# Encode categoricals temporarily (will be redone in 6_3 later).
# drop_first=True drops the first level of each encoded column.
X = pd.get_dummies(X, drop_first=True)

# Stratified split: 30% held out for testing, stratified on `y`,
# with a fixed seed so the split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42, test_size=0.3)

# Class proportions in the training split before any resampling
print("Before SMOTE:")
print(y_train.value_counts(normalize=True))

Before SMOTE:
class
0    0.906351
1    0.093649
Name: proportion, dtype: float64


In [4]:
# Resample the training split only with a seeded SMOTE sampler;
# the test split is not passed to the sampler.
smote = SMOTE(random_state=42)
X_train_smote, y_train_smote = smote.fit_resample(X_train, y_train)

# Report class proportions after resampling (header and table in one call)
print("\nAfter SMOTE:", y_train_smote.value_counts(normalize=True), sep="\n")




After SMOTE:
class
0    0.5
1    0.5
Name: proportion, dtype: float64


In [5]:
# Persist the resampled training set and the untouched test set as CSV files.
# The mapping keeps insertion order, so files are written in the order listed.
export_plan = {
    "../data/X_train_smote.csv": pd.DataFrame(X_train_smote, columns=X_train.columns),
    "../data/y_train_smote.csv": pd.DataFrame(y_train_smote, columns=["class"]),
    "../data/X_test_original.csv": X_test,
    "../data/y_test_original.csv": y_test,
}
for csv_path, payload in export_plan.items():
    # index=False: the row index is not written to the file
    payload.to_csv(csv_path, index=False)