# Task: Encoding Categorical Data:

In [1]:
import pandas as pd
import numpy as np

In [4]:
# Load the raw customer dataset used by every encoding example below.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file exists -- consider a project-relative path.
DATA_PATH = "/Users/radhikaasmar/demo2/SafeBite_AI-Powered Allergen Detection in Food/categorical_dataset_for_encodings.csv"
df = pd.read_csv(DATA_PATH)

# Preview the first rows to sanity-check the columns that were read.
df.head()

Unnamed: 0,CustomerID,Gender,Education,City,MaritalStatus,AgeGroup,PurchaseIntent,House_Price
0,1,Male,Bachelors,Chicago,Married,Senior,Low,184811
1,2,Female,Masters,Houston,Married,Senior,Medium,290121
2,3,Male,High School,Los Angeles,Single,Teen,Medium,332892
3,4,Male,High School,Houston,Single,Adult,High,480172
4,5,Male,High School,Houston,Married,Teen,Low,139178


In [5]:
# Dimensions of the loaded frame as a (rows, columns) tuple.
df.shape

(1000, 8)

In [6]:
# Per-column dtypes; the `object` columns are the categorical features to encode.
df.dtypes

CustomerID         int64
Gender            object
Education         object
City              object
MaritalStatus     object
AgeGroup          object
PurchaseIntent    object
House_Price        int64
dtype: object

## 1. Identify Categorical Data:

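### Identify ordinal and nominal categorical columns in the dataset.

- **Nominal** (no inherent order): `Gender`, `MaritalStatus`, `City`
- **Ordinal** (naturally ordered): `Education`, `AgeGroup`, `PurchaseIntent`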

## 2. One-Hot Encoding:

### Using Pandas' get_dummies()

In [13]:
# One-hot encode Gender and MaritalStatus with pandas. drop_first=True drops
# the first level of each feature so the remaining indicator columns are not
# perfectly collinear. The result is a separate frame; `df` is left untouched.
# NOTE: this reads df['Gender'] and df['MaritalStatus'], so it must run before
# the sklearn cell below, which removes those two columns from `df`.
df_onehot_pandas = pd.get_dummies(
    data=df,
    columns=['Gender', 'MaritalStatus'],
    drop_first=True,
)
df_onehot_pandas

Unnamed: 0,CustomerID,Education,City,AgeGroup,PurchaseIntent,House_Price,Gender_Male,MaritalStatus_Married,MaritalStatus_Single
0,1,Bachelors,Chicago,Senior,Low,184811,True,True,False
1,2,Masters,Houston,Senior,Medium,290121,False,True,False
2,3,High School,Los Angeles,Teen,Medium,332892,True,False,True
3,4,High School,Houston,Adult,High,480172,True,False,True
4,5,High School,Houston,Teen,Low,139178,True,True,False
...,...,...,...,...,...,...,...,...,...
995,996,High School,New York,Teen,Low,368508,True,False,True
996,997,Bachelors,Chicago,Young Adult,Low,491011,True,False,True
997,998,High School,New York,Senior,Medium,231763,False,False,True
998,999,Masters,Los Angeles,Young Adult,Medium,357971,False,True,False


### Using Sklearn's OneHotEncoder

In [18]:
from sklearn.preprocessing import OneHotEncoder

# One-hot encoding using Sklearn
onehot_source_cols = ['Gender', 'MaritalStatus']

# This cell replaces the source columns inside `df`. Without the guard, running
# it a second time (or after the columns were already encoded) fails with a
# KeyError because 'Gender' / 'MaritalStatus' no longer exist in `df`.
if set(onehot_source_cols).issubset(df.columns):
    # drop='first' removes the first category of each feature (the baseline).
    encoder = OneHotEncoder(drop='first')
    encoded_columns = encoder.fit_transform(df[onehot_source_cols])

    # Convert the (sparse) result to a DataFrame. Reusing df.index matters:
    # DataFrame.join aligns on the index, so a fresh 0..n-1 index would
    # misalign rows (and produce NaN) if `df` were ever filtered or re-indexed.
    df_encoded = pd.DataFrame(
        encoded_columns.toarray(),
        columns=encoder.get_feature_names_out(),
        index=df.index,
    )

    # Join the indicator columns back and drop the original categorical columns.
    df = df.join(df_encoded).drop(onehot_source_cols, axis=1)
df

Unnamed: 0,CustomerID,Education,City,AgeGroup,PurchaseIntent,House_Price,Education_ordinal,AgeGroup_ordinal,Gender_Male,MaritalStatus_Married,MaritalStatus_Single
0,1,Bachelors,Chicago,Senior,Low,184811,2.0,3.0,1.0,1.0,0.0
1,2,Masters,Houston,Senior,Medium,290121,3.0,3.0,0.0,1.0,0.0
2,3,High School,Los Angeles,Teen,Medium,332892,1.0,1.0,1.0,0.0,1.0
3,4,High School,Houston,Adult,High,480172,1.0,2.0,1.0,0.0,1.0
4,5,High School,Houston,Teen,Low,139178,1.0,1.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...
995,996,High School,New York,Teen,Low,368508,1.0,1.0,1.0,0.0,1.0
996,997,Bachelors,Chicago,Young Adult,Low,491011,2.0,,1.0,0.0,1.0
997,998,High School,New York,Senior,Medium,231763,1.0,3.0,0.0,0.0,1.0
998,999,Masters,Los Angeles,Young Adult,Medium,357971,3.0,,0.0,1.0,0.0


## 3. Ordinal Encoding:
### Perform ordinal encoding on the Education column and the AgeGroup column based on their natural order.

In [25]:
import warnings

# Ordinal encoding for Education and AgeGroup.
# Each dict ranks the categories from lowest (1) to highest.
education_order = {'High School': 1, 'Bachelors': 2, 'Masters': 3}
agegroup_order = {'Teen': 1, 'Young Adult': 2, 'Adult': 3, 'Senior': 4}

df['Education_ordinal'] = df['Education'].map(education_order)
df['AgeGroup_ordinal'] = df['AgeGroup'].map(agegroup_order)

# Series.map() silently yields NaN for any value that is not a key of the
# dict, so a category missing from a mapping would otherwise go unnoticed.
# Report such categories explicitly instead of letting NaN leak through.
# NOTE(review): Education_ordinal is displayed as float (2.0, 3.0, ...) while
# AgeGroup_ordinal is displayed as int, which suggests Education contains
# missing or unmapped values -- confirm and extend education_order if needed.
for source_col, order in (('Education', education_order), ('AgeGroup', agegroup_order)):
    # dropna(): genuinely missing source values are not "unmapped categories".
    unmapped = sorted(set(df[source_col].dropna()) - set(order), key=str)
    if unmapped:
        warnings.warn(
            f"{source_col}: no ordinal rank defined for {unmapped}; "
            "these rows are encoded as NaN"
        )
df

Unnamed: 0,CustomerID,Education,City,AgeGroup,PurchaseIntent,House_Price,Education_ordinal,AgeGroup_ordinal,Gender_Male,MaritalStatus_Married,MaritalStatus_Single,City_frequency,PurchaseIntent_encoded
0,1,Bachelors,Chicago,Senior,Low,184811,2.0,4,1.0,1.0,0.0,0.240,0
1,2,Masters,Houston,Senior,Medium,290121,3.0,4,0.0,1.0,0.0,0.241,1
2,3,High School,Los Angeles,Teen,Medium,332892,1.0,1,1.0,0.0,1.0,0.258,1
3,4,High School,Houston,Adult,High,480172,1.0,3,1.0,0.0,1.0,0.241,2
4,5,High School,Houston,Teen,Low,139178,1.0,1,1.0,1.0,0.0,0.241,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,High School,New York,Teen,Low,368508,1.0,1,1.0,0.0,1.0,0.261,0
996,997,Bachelors,Chicago,Young Adult,Low,491011,2.0,2,1.0,0.0,1.0,0.240,0
997,998,High School,New York,Senior,Medium,231763,1.0,4,0.0,0.0,1.0,0.261,1
998,999,Masters,Los Angeles,Young Adult,Medium,357971,3.0,2,0.0,1.0,0.0,0.258,1




## 4. Frequency Encoding:
### Perform frequency encoding on the City column.



In [28]:
# Frequency encoding for City.
# Step 1: relative frequency of every city (share of rows, index = city name).
city_frequency = df.loc[:, 'City'].value_counts(normalize=True)

# Step 2: look up each row's city in that table to get its share.
df['City_frequency'] = df.loc[:, 'City'].map(city_frequency)
df

Unnamed: 0,CustomerID,Education,City,AgeGroup,PurchaseIntent,House_Price,Education_ordinal,AgeGroup_ordinal,Gender_Male,MaritalStatus_Married,MaritalStatus_Single,City_frequency,PurchaseIntent_encoded
0,1,Bachelors,Chicago,Senior,Low,184811,2.0,4,1.0,1.0,0.0,0.240,0
1,2,Masters,Houston,Senior,Medium,290121,3.0,4,0.0,1.0,0.0,0.241,1
2,3,High School,Los Angeles,Teen,Medium,332892,1.0,1,1.0,0.0,1.0,0.258,1
3,4,High School,Houston,Adult,High,480172,1.0,3,1.0,0.0,1.0,0.241,2
4,5,High School,Houston,Teen,Low,139178,1.0,1,1.0,1.0,0.0,0.241,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,High School,New York,Teen,Low,368508,1.0,1,1.0,0.0,1.0,0.261,0
996,997,Bachelors,Chicago,Young Adult,Low,491011,2.0,2,1.0,0.0,1.0,0.240,0
997,998,High School,New York,Senior,Medium,231763,1.0,4,0.0,0.0,1.0,0.261,1
998,999,Masters,Los Angeles,Young Adult,Medium,357971,3.0,2,0.0,1.0,0.0,0.258,1


## 5. Label Encoding:
### Apply label encoding on the PurchaseIntent column.

In [27]:
from sklearn.preprocessing import LabelEncoder

# Label encoding for PurchaseIntent.
# The codes are assigned by hand, in order of intent strength, rather than by
# LabelEncoder: position in the list below is the integer code (Low=0, ...).
purchase_intent_order = {
    level: code for code, level in enumerate(['Low', 'Medium', 'High'])
}
df['PurchaseIntent_encoded'] = df['PurchaseIntent'].map(purchase_intent_order)

df

Unnamed: 0,CustomerID,Education,City,AgeGroup,PurchaseIntent,House_Price,Education_ordinal,AgeGroup_ordinal,Gender_Male,MaritalStatus_Married,MaritalStatus_Single,City_frequency,PurchaseIntent_encoded
0,1,Bachelors,Chicago,Senior,Low,184811,2.0,4,1.0,1.0,0.0,0.240,0
1,2,Masters,Houston,Senior,Medium,290121,3.0,4,0.0,1.0,0.0,0.241,1
2,3,High School,Los Angeles,Teen,Medium,332892,1.0,1,1.0,0.0,1.0,0.258,1
3,4,High School,Houston,Adult,High,480172,1.0,3,1.0,0.0,1.0,0.241,2
4,5,High School,Houston,Teen,Low,139178,1.0,1,1.0,1.0,0.0,0.241,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,High School,New York,Teen,Low,368508,1.0,1,1.0,0.0,1.0,0.261,0
996,997,Bachelors,Chicago,Young Adult,Low,491011,2.0,2,1.0,0.0,1.0,0.240,0
997,998,High School,New York,Senior,Medium,231763,1.0,4,0.0,0.0,1.0,0.261,1
998,999,Masters,Los Angeles,Young Adult,Medium,357971,3.0,2,0.0,1.0,0.0,0.258,1


## 6. Target Encoding:
### Perform target encoding on the City column based on the average House_Price.

In [29]:
# Target encoding for City: each city is replaced by the mean House_Price of
# all rows belonging to that city.
# NOTE(review): the mean is computed over every row, including the row being
# encoded; if City_target feeds a model that predicts House_Price this leaks
# the target -- consider fitting the means on training data only.
city_target = df['House_Price'].groupby(df['City']).mean()
df['City_target'] = df['City'].map(city_target)
df

Unnamed: 0,CustomerID,Education,City,AgeGroup,PurchaseIntent,House_Price,Education_ordinal,AgeGroup_ordinal,Gender_Male,MaritalStatus_Married,MaritalStatus_Single,City_frequency,PurchaseIntent_encoded,City_target
0,1,Bachelors,Chicago,Senior,Low,184811,2.0,4,1.0,1.0,0.0,0.240,0,299091.125000
1,2,Masters,Houston,Senior,Medium,290121,3.0,4,0.0,1.0,0.0,0.241,1,307577.493776
2,3,High School,Los Angeles,Teen,Medium,332892,1.0,1,1.0,0.0,1.0,0.258,1,303874.895349
3,4,High School,Houston,Adult,High,480172,1.0,3,1.0,0.0,1.0,0.241,2,307577.493776
4,5,High School,Houston,Teen,Low,139178,1.0,1,1.0,1.0,0.0,0.241,0,307577.493776
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,High School,New York,Teen,Low,368508,1.0,1,1.0,0.0,1.0,0.261,0,318877.980843
996,997,Bachelors,Chicago,Young Adult,Low,491011,2.0,2,1.0,0.0,1.0,0.240,0,299091.125000
997,998,High School,New York,Senior,Medium,231763,1.0,4,0.0,0.0,1.0,0.261,1,318877.980843
998,999,Masters,Los Angeles,Young Adult,Medium,357971,3.0,2,0.0,1.0,0.0,0.258,1,303874.895349


### 1. `Education_ordinal`
- **Description**: This column represents the ordinal encoding of the `Education` column. The values reflect the order of education levels based on hierarchy.
- **Encoding**:
  - `High School`: 1
  - `Bachelors`: 2
  - `Masters`: 3

### 2. `AgeGroup_ordinal`
- **Description**: This column represents the ordinal encoding of the `AgeGroup` column. The values reflect the progression of age from younger to older.
- **Encoding**:
  - `Teen`: 1
  - `Young Adult`: 2
  - `Adult`: 3
  - `Senior`: 4

### 3. `Gender_Male`
- **Description**: This column is part of the one-hot encoding applied to the `Gender` column. Since one-hot encoding creates binary columns, this column represents whether the customer is male (`1`) or not (`0`). The female category is encoded implicitly by the absence of `1` in this column (i.e., when `Gender_Male = 0`).
- **Values**:
  - `1`: Male
  - `0`: Female

### 4. `MaritalStatus_Married`
- **Description**: This column is part of the one-hot encoding applied to the `MaritalStatus` column. It indicates whether the customer is married.
- **Values**:
  - `1`: Married
  - `0`: Not married

### 5. `MaritalStatus_Single`
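- **Description**: This column is also a part of the one-hot encoding for `MaritalStatus`. It indicates whether the customer is single. Because the encoder drops the first category (`drop='first'`), a row where both `MaritalStatus_Married` and `MaritalStatus_Single` are `0` belongs to that dropped baseline category.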
- **Values**:
  - `1`: Single
  - `0`: Not single

### 6. `City_frequency`
- **Description**: This column contains the frequency encoding of the `City` column. The values represent the relative frequency of each city in the dataset. It is a proportion between `0` and `1`, indicating how often each city appears in the data.
- **Example**:
  - `0.240`: The city appears in 24% of the dataset.
  - `0.241`: The city appears in 24.1% of the dataset.

### 7. `PurchaseIntent_encoded`
- **Description**: This column represents the ordinal encoding of the `PurchaseIntent` column, where the categories have been mapped based on intent strength (Low < Medium < High).
- **Encoding**:
  - `Low`: 0
  - `Medium`: 1
  - `High`: 2

### 8. `City_target`
- **Description**: This column represents the target encoding of the `City` column. The values correspond to the **average House_Price** for each city, calculated by taking the mean `House_Price` of customers residing in that city.
- **Values**:
  - Example: In Chicago, the average house price is `299,091.13`, so all rows with `Chicago` will have this value in the `City_target` column.

---


In [30]:
# Final frame: the original columns plus every encoded column added above.
df

Unnamed: 0,CustomerID,Education,City,AgeGroup,PurchaseIntent,House_Price,Education_ordinal,AgeGroup_ordinal,Gender_Male,MaritalStatus_Married,MaritalStatus_Single,City_frequency,PurchaseIntent_encoded,City_target
0,1,Bachelors,Chicago,Senior,Low,184811,2.0,4,1.0,1.0,0.0,0.240,0,299091.125000
1,2,Masters,Houston,Senior,Medium,290121,3.0,4,0.0,1.0,0.0,0.241,1,307577.493776
2,3,High School,Los Angeles,Teen,Medium,332892,1.0,1,1.0,0.0,1.0,0.258,1,303874.895349
3,4,High School,Houston,Adult,High,480172,1.0,3,1.0,0.0,1.0,0.241,2,307577.493776
4,5,High School,Houston,Teen,Low,139178,1.0,1,1.0,1.0,0.0,0.241,0,307577.493776
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,996,High School,New York,Teen,Low,368508,1.0,1,1.0,0.0,1.0,0.261,0,318877.980843
996,997,Bachelors,Chicago,Young Adult,Low,491011,2.0,2,1.0,0.0,1.0,0.240,0,299091.125000
997,998,High School,New York,Senior,Medium,231763,1.0,4,0.0,0.0,1.0,0.261,1,318877.980843
998,999,Masters,Los Angeles,Young Adult,Medium,357971,3.0,2,0.0,1.0,0.0,0.258,1,303874.895349
