In [1]:
## 7. Transfer Categorical Columns to Numeric

# pd.get_dummies()

# https://stats.stackexchange.com/questions/411336/how-many-dummy-variables-should-we-include-in-our-multiple-linear-regression-ana

# Rule of Thumb: We should have at least 15 subjects per parameter.
# If using dummies we should detect whether the number of dummies would exceed
# a reasonable number and choose some other method.
# 
# Following code to propose a method of recoding categorical values to numeric 
# values with pd.factorize()

In [8]:
## Initialize Libraries
import pandas as pd
import numpy as np

In [52]:
## Read the Sample Data
df = pd.read_csv ("https://raw.githubusercontent.com/db-bangs/BDAI-Automation/main/CustSeg.Train.csv")
df.head()

Unnamed: 0,ID,Gender,Ever_Married,Age,Graduated,Profession,Work_Experience,Spending_Score,Family_Size,Var_1,Segmentation
0,462809,Male,No,22,No,Healthcare,1.0,Low,4.0,Cat_4,D
1,462643,Female,Yes,38,Yes,Engineer,,Average,3.0,Cat_4,A
2,466315,Female,Yes,67,Yes,Engineer,1.0,Low,1.0,Cat_6,B
3,461735,Male,Yes,67,Yes,Lawyer,0.0,High,2.0,Cat_6,B
4,462669,Female,Yes,40,Yes,Entertainment,,High,6.0,Cat_6,A


In [53]:
# Give information about the dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8068 entries, 0 to 8067
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ID               8068 non-null   int64  
 1   Gender           8068 non-null   object 
 2   Ever_Married     7928 non-null   object 
 3   Age              8068 non-null   int64  
 4   Graduated        7990 non-null   object 
 5   Profession       7944 non-null   object 
 6   Work_Experience  7239 non-null   float64
 7   Spending_Score   8068 non-null   object 
 8   Family_Size      7733 non-null   float64
 9   Var_1            7992 non-null   object 
 10  Segmentation     8068 non-null   object 
dtypes: float64(2), int64(2), object(7)
memory usage: 693.5+ KB


In [90]:
## List of Categorical Columns
catcols = []
for i in df.columns:
  if df[i].dtype == "object":
      catcols.append(i)
catcols 

['Gender',
 'Ever_Married',
 'Graduated',
 'Profession',
 'Spending_Score',
 'Var_1',
 'Segmentation']

In [91]:
## Subset Just Categorical Columns - DataFrame
categoricals = df[catcols]
categoricals.head()



Unnamed: 0,Gender,Ever_Married,Graduated,Profession,Spending_Score,Var_1,Segmentation
0,Male,No,No,Healthcare,Low,Cat_4,D
1,Female,Yes,Yes,Engineer,Average,Cat_4,A
2,Female,Yes,Yes,Engineer,Low,Cat_6,B
3,Male,Yes,Yes,Lawyer,High,Cat_6,B
4,Female,Yes,Yes,Entertainment,High,Cat_6,A


In [93]:
## Using pd.factorize reassign all categorical values to a numeric
## Keep the key-value pairs in a dictionary

# Instantiate a dictionary to record the recoded values
category_dict = {}

# Loop through columns and apply pd.factorize to each
for column in categoricals:
    category_dict[column] = dict(enumerate(categoricals[column].unique()))
    categoricals[column] = pd.factorize(categoricals[column])[0]

## Find a way to assign missing values to '0' ?
## then map all categorical to numeric not '0' ?

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  categoricals[column] = pd.factorize(categoricals[column])[0]


In [88]:
# Review the recoded dataframe
categoricals.head()

Unnamed: 0,Gender,Ever_Married,Graduated,Profession,Spending_Score,Var_1,Segmentation
0,0,0,0,0,0,0,0
1,1,1,1,1,1,0,1
2,1,1,1,1,0,1,2
3,0,1,1,2,2,1,2
4,1,1,1,3,2,1,1


In [89]:
# Review the dictionary of categories
category_dict

{'Gender': {0: 'Male', 1: 'Female'},
 'Ever_Married': {0: 'No', 1: 'Yes', 2: nan},
 'Graduated': {0: 'No', 1: 'Yes', 2: nan},
 'Profession': {0: 'Healthcare',
  1: 'Engineer',
  2: 'Lawyer',
  3: 'Entertainment',
  4: 'Artist',
  5: 'Executive',
  6: 'Doctor',
  7: 'Homemaker',
  8: 'Marketing',
  9: nan},
 'Spending_Score': {0: 'Low', 1: 'Average', 2: 'High'},
 'Var_1': {0: 'Cat_4',
  1: 'Cat_6',
  2: 'Cat_7',
  3: 'Cat_3',
  4: 'Cat_1',
  5: 'Cat_2',
  6: nan,
  7: 'Cat_5'},
 'Segmentation': {0: 'D', 1: 'A', 2: 'B', 3: 'C'}}

In [None]:
## Next Step: How to look up dictionary of recoded values, after analysis ?
## Is it possible to add 1 to all factors AND the dictionary, to avoid 0s ?
##      - would adding +1 be necessary or relevant?