In [None]:
import numpy as np
import pandas as pd

In [None]:
# Seed NumPy's legacy global RNG so the simulated table is reproducible.
np.random.seed(42)

num_examples = 1000

# Allowed values for every simulated column.
# The insertion order matters: columns are sampled in this order, so
# reordering the entries would change the random draws.
column_choices = {
    'Gender': ['Male', 'Female'],
    'Country': ['Egypt', 'USA', 'Canada', 'UK', 'Germany', 'India'],
    'SubscriptionType': ['Free', 'Basic', 'Premium'],
    'Churn': [0, 1],
}

# Draw `num_examples` random values per column and assemble the frame.
data = pd.DataFrame({
    column: np.random.choice(choices, size=num_examples)
    for column, choices in column_choices.items()
})
data

Unnamed: 0,Gender,Country,SubscriptionType,Churn
0,Male,India,Premium,1
1,Female,Egypt,Free,0
2,Male,Egypt,Premium,1
3,Male,Egypt,Free,1
4,Male,Germany,Basic,0
...,...,...,...,...
995,Male,USA,Basic,0
996,Male,UK,Premium,1
997,Female,Canada,Basic,1
998,Female,India,Premium,0


In [None]:
# Inspect the Python type of `data`; as the last expression of the cell it is
# shown as the cell output. Expected to be a pandas DataFrame when the cell
# that builds `data` with pd.DataFrame(...) has been run beforehand.
type(data)

In [None]:
# Write the current `data` frame to CSV without the index column.
# NOTE(review): the path looks like a Google Drive mount inside Colab —
# presumably Drive must be mounted and the folder must exist; confirm.
csv_path = '/content/drive/MyDrive/DataForModels/Simulated Customer Data.csv'
data.to_csv(csv_path, index=False)

In [None]:
# Toy dataset: two categorical features and a binary label, one tuple per row.
toy_rows = [
    ('dog', 'red', 1),
    ('cat', 'blue', 0),
    ('elephant', 'green', 1),
    ('dog', 'red', 1),
    ('cat', 'blue', 0),
    ('monkey', 'green', 0),
]
data = pd.DataFrame(toy_rows, columns=['feature1', 'feature2', 'label'])

# Join both categorical columns into one "<feature1>_<feature2>" string key.
data['combined'] = data['feature1'].str.cat(data['feature2'], sep="_")

data


Unnamed: 0,feature1,feature2,label,combined
0,dog,red,1,dog_red
1,cat,blue,0,cat_blue
2,elephant,green,1,elephant_green
3,dog,red,1,dog_red
4,cat,blue,0,cat_blue
5,monkey,green,0,monkey_green


In [None]:
# Build the toy dataset under its own name so this cell is self-contained.
data1 = pd.DataFrame({
    'feature1': ['dog', 'cat', 'elephant', 'dog', 'cat', 'monkey'],
    'feature2': ['red', 'blue', 'green', 'red', 'blue', 'green'],
    'label': [1, 0, 1, 1, 0, 0]
})

# Derive the joined key from data1's OWN columns. Reading the columns from a
# different frame (`data`) makes the cell depend on hidden kernel state: it
# raises NameError on a fresh kernel and KeyError / wrong values when `data`
# currently holds another table.
data1['combined'] = data1['feature1'] + "_" + data1['feature2']

print(data1.head(5))

# Alternative representation of the same pair: one (feature1, feature2) tuple
# per row, returned as a Series of tuples.
# The spelling `compined_features` is kept as-is because the hashing cell in
# this notebook uses the same variable name.
compined_features = data1[['feature1','feature2']].apply(tuple ,axis =1)

print(compined_features)

   feature1 feature2  label        combined
0       dog      red      1         dog_red
1       cat     blue      0        cat_blue
2  elephant    green      1  elephant_green
3       dog      red      1         dog_red
4       cat     blue      0        cat_blue
0           (dog, red)
1          (cat, blue)
2    (elephant, green)
3           (dog, red)
4          (cat, blue)
5      (monkey, green)
dtype: object


In [5]:
import numpy as np
import pandas as pd

from sklearn.feature_extraction import FeatureHasher
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


# Step 1: Create sample data (two categorical columns, one numerical column, binary label)
data = pd.DataFrame({
    'feature1': ['dog', 'cat', 'elephant', 'dog', 'cat', 'monkey'],
    'feature2': ['red', 'blue', 'green', 'red', 'blue', 'green'],
    'num_feature': [10, 20, 30, 40, 50, 60],  # Numerical column
    'label': [1, 0, 1, 1, 0, 0]
})

# A single joined string column (data['feature1'] + "_" + data['feature2'], a Series of
# plain strings) is deliberately not used: per the original author's note it errors,
# because FeatureHasher with input_type='string' needs an iterable of iterables
# (e.g. tuples or lists of strings), one inner iterable per sample.

# Combine the categorical features into one tuple of strings per row.
# DataFrame.apply(tuple, axis=1) yields a Series of tuples.
compined_features = data[['feature1', 'feature2']].apply(tuple, axis=1)

# Inputs that do not depend on the number of hash buckets are prepared once:
# the numerical column as a 2-D NumPy array, and the target labels.
numerical_features = data[['num_feature']].values
y = data['label']

# NOTE: with 6 rows and test_size=0.2 the held-out set has only 2 rows, so the
# reported accuracy can only be 0%, 50% or 100%.
for n_features in (8, 16, 32, 64, 128):
    # Step 2: hash the categorical tuples into `n_features` buckets and densify
    # the sparse result so it can be stacked with the numerical column.
    hasher = FeatureHasher(n_features=n_features, input_type='string')
    hashed_features = hasher.transform(compined_features).toarray()

    # Feature matrix = hashed categorical columns followed by the numerical column.
    X = np.hstack((hashed_features, numerical_features))

    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, random_state=42
    )

    rand_forest_model = RandomForestClassifier(random_state=42)
    rand_forest_model.fit(X_train, y_train)

    # Evaluate the model via the estimator's own score() ...
    accuracy = rand_forest_model.score(X_test, y_test)
    print(f"\nN_features = {n_features} ,Model accuracy: {accuracy * 100:.2f}%")

    # ... and again via accuracy_score on explicit predictions (same metric).
    y_pred = rand_forest_model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    print(f"N_features = {n_features}, Accuracy: {acc:.4f}")


N_features = 8 ,Model accuracy: 50.00%
N_features = 8, Accuracy: 0.5000

N_features = 16 ,Model accuracy: 50.00%
N_features = 16, Accuracy: 0.5000

N_features = 32 ,Model accuracy: 100.00%
N_features = 32, Accuracy: 1.0000

N_features = 64 ,Model accuracy: 100.00%
N_features = 64, Accuracy: 1.0000

N_features = 128 ,Model accuracy: 100.00%
N_features = 128, Accuracy: 1.0000
