## **📌 Task: What Is Feature Engineering?**
#### **Feature engineering is the process of transforming raw data into meaningful features that help machine learning models make better predictions.**


In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

In [2]:


# Load the dataset (path is relative to the notebook's working directory)
df = pd.read_csv("sample_feature_engineering.csv")

# Display first few rows
print(df.head())

# Check for missing values (count of nulls per column)
print(df.isnull().sum())

# Check data types and non-null counts.
# DataFrame.info() writes its report straight to stdout and returns None,
# so it is called directly: wrapping it in print() would emit a stray "None".
df.info()


   Customer_ID  Age  Income  Spending_Score  Family_Members  \
0            1   56   72733              93               1   
1            2   46   85318              46               2   
2            3   32  129953               6               5   
3            4   60  109474              99               2   
4            5   25   43664              37               2   

   Last_Transaction_Days  
0                    341  
1                    282  
2                     63  
3                    217  
4                    342  
Customer_ID              0
Age                      0
Income                   0
Spending_Score           0
Family_Members           0
Last_Transaction_Days    0
dtype: int64
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200 entries, 0 to 199
Data columns (total 6 columns):
 #   Column                 Non-Null Count  Dtype
---  ------                 --------------  -----
 0   Customer_ID            200 non-null    int64
 1   Age                    20

In [6]:
# Create new ratio features.
# NOTE(review): a zero in Family_Members or Age makes these divisions yield
# inf (or NaN for 0/0) — confirm the data never contains zeros, or guard here.
df["Income_Per_Member"] = df["Income"] / df["Family_Members"]
df["Spending_Efficiency"] = df["Spending_Score"] / df["Age"]

# Categorizing transaction recency from Last_Transaction_Days.
# pd.cut intervals are open on the left by default, so without
# include_lowest=True a value of exactly 0 would become NaN instead of
# "Recent". The top edge is left open-ended so that values above 365 fall
# into "Very Old" rather than silently turning into NaN.
df["Transaction_Category"] = pd.cut(
    df["Last_Transaction_Days"],
    bins=[0, 90, 180, float("inf")],
    labels=["Recent", "Old", "Very Old"],
    include_lowest=True,
)

# Display updated dataset
print(df.head())

   Customer_ID  Age  Income  Spending_Score  Family_Members  \
0            1   56   72733              93               1   
1            2   46   85318              46               2   
2            3   32  129953               6               5   
3            4   60  109474              99               2   
4            5   25   43664              37               2   

   Last_Transaction_Days  Income_Per_Member  Spending_Efficiency  \
0                    341            72733.0             1.660714   
1                    282            42659.0             1.000000   
2                     63            25990.6             0.187500   
3                    217            54737.0             1.650000   
4                    342            21832.0             1.480000   

  Transaction_Category  
0             Very Old  
1             Very Old  
2               Recent  
3             Very Old  
4             Very Old  


In [4]:
# Fill missing numerical values with each column's median
df = df.fillna({
    "Income": df["Income"].median(),
    "Spending_Score": df["Spending_Score"].median(),
})

# Fill missing categorical values.
# "Transaction_Category" is stored as a categorical variable with fixed
# categories, so filling with a value that is not yet a category raises an
# error. Register "Unknown" as a category first — but only when it is not
# already present: add_categories() raises ValueError for an existing
# category, which would make this cell fail when it is re-run.
if df["Transaction_Category"].dtype.name == "category":
    if "Unknown" not in df["Transaction_Category"].cat.categories:
        df["Transaction_Category"] = df["Transaction_Category"].cat.add_categories("Unknown")

# Now fill missing values with "Unknown"
df["Transaction_Category"] = df["Transaction_Category"].fillna("Unknown")

# Check that missing values are handled
print(df.isnull().sum())

Customer_ID              0
Age                      0
Income                   0
Spending_Score           0
Family_Members           0
Last_Transaction_Days    0
Income_Per_Member        0
Spending_Efficiency      0
Transaction_Category     0
dtype: int64


In [5]:

# Define a function to remove outliers using IQR
def remove_outliers(df, column, factor=1.5):
    """Return the rows of ``df`` whose ``column`` value lies within the IQR fences.

    The fences are ``Q1 - factor * IQR`` and ``Q3 + factor * IQR`` (both
    inclusive), where Q1/Q3 are the 25th/75th percentiles of ``df[column]``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    column : str
        Name of the numeric column to test.
    factor : float, default 1.5
        Multiplier applied to the IQR when computing the fences.

    Returns
    -------
    pandas.DataFrame
        A copy of the retained rows with their original index labels. Rows
        whose ``column`` value is NaN fail both comparisons and are dropped.
    """
    values = df[column]
    q1 = values.quantile(0.25)
    q3 = values.quantile(0.75)
    iqr = q3 - q1
    lower_bound = q1 - factor * iqr
    upper_bound = q3 + factor * iqr
    keep = (values >= lower_bound) & (values <= upper_bound)
    # Return an independent copy so later column assignments on the result
    # do not operate on a view of the caller's frame.
    return df.loc[keep].copy()

# Apply the IQR filter to Income first, then to Spending_Score
for outlier_column in ("Income", "Spending_Score"):
    df = remove_outliers(df, outlier_column)

# Report the resulting shape and preview the remaining rows
print(f"Dataset after removing outliers: {df.shape}")
print(df.head())

Dataset after removing outliers: (200, 9)
   Customer_ID  Age  Income  Spending_Score  Family_Members  \
0            1   56   72733              93               1   
1            2   46   85318              46               2   
2            3   32  129953               6               5   
3            4   60  109474              99               2   
4            5   25   43664              37               2   

   Last_Transaction_Days  Income_Per_Member  Spending_Efficiency  \
0                    341            72733.0             1.660714   
1                    282            42659.0             1.000000   
2                     63            25990.6             0.187500   
3                    217            54737.0             1.650000   
4                    342            21832.0             1.480000   

  Transaction_Category  
0             Very Old  
1             Very Old  
2               Recent  
3             Very Old  
4             Very Old  
