##### **Encoding categorical variables:**

In [1]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Load the OkCupid profiles table from a parquet file in the working directory.
df=pd.read_parquet("okcupid_profiles.parquet")

In [3]:
# Show the column labels of the loaded profiles table.
df.columns

Index(['age', 'status', 'sex', 'orientation', 'body_type', 'diet', 'drinks',
       'drugs', 'education', 'ethnicity', 'height', 'income', 'job',
       'last_online', 'location', 'offspring', 'pets', 'religion', 'sign',
       'smokes', 'speaks'],
      dtype='object')

In [4]:
# Count the distinct body_type values. unique() also reports the missing
# value (None) as its own entry, so it is included in this count.
df["body_type"].unique().shape

(13,)

In [5]:
# One-hot encoder that returns a dense array (sparse_output=False) and leaves
# out the first category of each feature (drop="first"), so k categories
# produce k-1 indicator columns.
ohe = OneHotEncoder(drop="first", sparse_output=False)

In [6]:
# Fit the encoder on body_type and encode it in one step. The double brackets
# select a one-column DataFrame (2-D input) rather than a Series.
entcoded=ohe.fit_transform(df[["body_type"]])

In [7]:
# entcoded

In [8]:
# Wrap the encoded array in a DataFrame and label its columns with the
# feature names generated by the fitted encoder.
new_df=pd.DataFrame(entcoded,columns=ohe.get_feature_names_out())

In [9]:
# There are 12 columns for 13 distinct values because drop="first" removed the
# first category ('a little extra', which has no column of its own).
# A row that is all zeros therefore stands for that dropped category.
new_df

Unnamed: 0,body_type_athletic,body_type_average,body_type_curvy,body_type_fit,body_type_full figured,body_type_jacked,body_type_overweight,body_type_rather not say,body_type_skinny,body_type_thin,body_type_used up,body_type_None
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...
59941,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
59942,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
59943,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
59944,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#### **get_dummies:**

In [10]:
# pandas alternative to OneHotEncoder: build one boolean indicator column per
# body_type category, dropping the first category (drop_first=True).
get=pd.get_dummies(df.body_type,drop_first=True)
get

Unnamed: 0,athletic,average,curvy,fit,full figured,jacked,overweight,rather not say,skinny,thin,used up
0,False,False,False,False,False,False,False,False,False,False,False
1,False,True,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,True,False
3,False,False,False,False,False,False,False,False,False,True,False
4,True,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...
59941,False,False,False,False,False,False,False,False,False,False,False
59942,False,False,False,True,False,False,False,False,False,False,False
59943,False,True,False,False,False,False,False,False,False,False,False
59944,True,False,False,False,False,False,False,False,False,False,False


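We get only 11 columns here because `body_type` contains missing values (`None`), which `get_dummies` ignores: of the 13 distinct values, `None` gets no column and one more category is removed by `drop_first=True`, leaving 11. One solution is to replace `None` with an explicit value such as "Missing".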

In [11]:
# List the distinct body_type values to check for a missing-value entry.
df['body_type'].unique()

array(['a little extra', 'average', 'thin', 'athletic', 'fit', None,
       'skinny', 'curvy', 'full figured', 'jacked', 'rather not say',
       'used up', 'overweight'], dtype=object)

In [12]:
# Replace missing body_type entries with the explicit label "Missing" so they
# count as a regular category.
# NOTE: this overwrites the column in df, so re-running earlier cells from
# here on would see "Missing" instead of None.
df["body_type"]=df["body_type"].fillna("Missing")

In [13]:
# Confirm the replacement by listing the distinct values again.
df["body_type"].unique()

array(['a little extra', 'average', 'thin', 'athletic', 'fit', 'Missing',
       'skinny', 'curvy', 'full figured', 'jacked', 'rather not say',
       'used up', 'overweight'], dtype=object)

In [14]:
# Rebuild the dummies now that missing values carry the label "Missing".
# NOTE(review): the result has no "Missing" column, so "Missing" appears to be
# the category removed by drop_first=True (presumably because the uppercase
# "M" sorts before the lowercase labels); an all-False row would then mean
# "Missing" - confirm.
get=pd.get_dummies(df.body_type,drop_first=True)
get

Unnamed: 0,a little extra,athletic,average,curvy,fit,full figured,jacked,overweight,rather not say,skinny,thin,used up
0,True,False,False,False,False,False,False,False,False,False,False,False
1,False,False,True,False,False,False,False,False,False,False,False,False
2,False,False,False,False,False,False,False,False,False,False,True,False
3,False,False,False,False,False,False,False,False,False,False,True,False
4,False,True,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...
59941,False,False,False,False,False,False,False,False,False,False,False,False
59942,False,False,False,False,True,False,False,False,False,False,False,False
59943,False,False,True,False,False,False,False,False,False,False,False,False
59944,False,True,False,False,False,False,False,False,False,False,False,False


##### **Too many categories (high cardinality):**

In [15]:
from sklearn.preprocessing import OneHotEncoder

In [16]:
# The location column has 199 distinct values (categories).
# The count is computed two ways: len(unique()) includes a missing value if
# one exists, while nunique() ignores missing values by default; both give
# the same number here.
print(len(df.location.unique()))
df["location"].nunique()

199


199

In [17]:
# One-hot encode the high-cardinality location column: dense output, first
# category dropped. The encoder is fitted on the complete column.
ohn = OneHotEncoder(drop="first", sparse_output=False)
ndf = ohn.fit_transform(df[["location"]])

In [18]:
# Put the encoded location array into a DataFrame whose columns carry the
# feature names generated by the fitted encoder.
my_df=pd.DataFrame(ndf,columns=ohn.get_feature_names_out())

In [19]:
# Display the one-hot encoded location frame (one indicator column per location).
my_df

Unnamed: 0,"location_albany, california","location_amsterdam, netherlands","location_arcadia, california","location_asheville, north carolina","location_ashland, california","location_astoria, new york","location_atherton, california","location_atlanta, georgia","location_austin, texas","location_bayshore, california",...,"location_vallejo, california","location_vancouver, british columbia, canada","location_walnut creek, california","location_washington, district of columbia","location_waterford, california","location_west oakland, california","location_westlake, california","location_woodacre, california","location_woodbridge, virginia","location_woodside, california"
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
59941,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
59942,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
59943,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
59944,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [20]:
from sklearn.model_selection import train_test_split

In [22]:
# Split the encoded frame into a training and a test part using the default
# proportions. random_state is fixed so that the split, and therefore the
# list of categories missing from the training data computed afterwards, is
# reproducible on every Restart & Run All.
X_train, X_test = train_test_split(my_df, random_state=42)

In [30]:
# Sum every indicator column over the training rows; a sum of 0 means that
# category does not occur in X_train at all.
Count=X_train.sum(axis=0)
Count[Count==0].index
# The index shown below lists all indicator variables (categories) that did not make it into the training data.

Index(['location_amsterdam, netherlands', 'location_asheville, north carolina',
       'location_brea, california', 'location_cambridge, massachusetts',
       'location_campbell, california', 'location_cincinnati, ohio',
       'location_costa mesa, california', 'location_fort lauderdale, florida',
       'location_grand rapids, michigan', 'location_guadalajara, mexico',
       'location_honolulu, hawaii', 'location_kansas city, missouri',
       'location_kassel, germany', 'location_kula, hawaii',
       'location_las vegas, nevada', 'location_marin city, california',
       'location_nevada city, california', 'location_ozone park, new york',
       'location_pacheco, california', 'location_philadelphia, pennsylvania',
       'location_phoenix, arizona', 'location_san luis obispo, california',
       'location_san quentin, california', 'location_santa ana, california',
       'location_santa clara, california', 'location_stratford, connecticut'],
      dtype='object')

##### **Hash Function:**

In [35]:
import pandas as pd
from sklearn.feature_extraction import FeatureHasher

In [36]:
# Reload the profiles table from disk; this rebinds df and discards the
# earlier in-memory change to body_type.
df=pd.read_parquet("okcupid_profiles.parquet")

In [40]:
# List the distinct location strings that will be hashed below.
df["location"].unique()

array(['south san francisco, california', 'oakland, california',
       'san francisco, california', 'berkeley, california',
       'belvedere tiburon, california', 'san mateo, california',
       'daly city, california', 'san leandro, california',
       'atherton, california', 'san rafael, california',
       'walnut creek, california', 'menlo park, california',
       'belmont, california', 'san jose, california',
       'palo alto, california', 'emeryville, california',
       'el granada, california', 'castro valley, california',
       'fairfax, california', 'mountain view, california',
       'burlingame, california', 'martinez, california',
       'pleasant hill, california', 'hayward, california',
       'alameda, california', 'vallejo, california',
       'benicia, california', 'el cerrito, california',
       'mill valley, california', 'richmond, california',
       'redwood city, california', 'el sobrante, california',
       'stanford, california', 'san pablo, california',

In [43]:
# Each distinct location is wrapped in its own one-element list because the
# hasher's transform() takes a sequence of samples, and every sample must
# itself be a sequence of strings ([[...], [...], ...]).
values = [[location] for location in df["location"].unique()]
values

[['south san francisco, california'],
 ['oakland, california'],
 ['san francisco, california'],
 ['berkeley, california'],
 ['belvedere tiburon, california'],
 ['san mateo, california'],
 ['daly city, california'],
 ['san leandro, california'],
 ['atherton, california'],
 ['san rafael, california'],
 ['walnut creek, california'],
 ['menlo park, california'],
 ['belmont, california'],
 ['san jose, california'],
 ['palo alto, california'],
 ['emeryville, california'],
 ['el granada, california'],
 ['castro valley, california'],
 ['fairfax, california'],
 ['mountain view, california'],
 ['burlingame, california'],
 ['martinez, california'],
 ['pleasant hill, california'],
 ['hayward, california'],
 ['alameda, california'],
 ['vallejo, california'],
 ['benicia, california'],
 ['el cerrito, california'],
 ['mill valley, california'],
 ['richmond, california'],
 ['redwood city, california'],
 ['el sobrante, california'],
 ['stanford, california'],
 ['san pablo, california'],
 ['novato, calif

In [44]:
# Feature hasher that takes string features as input and maps them onto a
# fixed number of 15 output columns.
h = FeatureHasher(input_type="string", n_features=15)

In [45]:
# Hash every one-element location sample; the result is converted with
# .toarray() in the following cells.
f = h.transform(values)

In [50]:
# Convert the hashed result to a dense array and check its shape:
# one row per distinct location, 15 hashed columns.
f.toarray().shape

(199, 15)

In [51]:
# Keep the dense version of the hashed features and display it.
X_array=f.toarray()
X_array

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

#### **Bin Counting:**

In [69]:
import pandas as pd

In [70]:
# Load the click data set used for bin counting; this rebinds df, replacing
# the profiles table used in the sections above.
df=pd.read_parquet("bin_counting.parquet")

In [71]:
# Check the size of the click table (rows, columns).
df.shape

(40428967, 2)

In [55]:
# Preview the first rows of the click table.
df.head()

Unnamed: 0,device_id,click
0,a99f214a,0
1,a99f214a,0
2,a99f214a,0
3,a99f214a,0
4,a99f214a,0


In [60]:
# Print the column labels of the click table.
print(df.columns)

Index(['device_id', 'click'], dtype='object')


We want to apply bin counting to the column `device_id`:

In [63]:
df["device_id"].nunique()
# There are more than two million distinct categories here.
# One-hot encoding them would create one column per category => curse of dimensionality.

2686408

In [73]:
# Aggregate per device_id: "count" is the number of rows for the device and
# "sum" is the total of its click values (presumably click is a 0/1 flag, so
# the sum is the number of clicks - confirm). The result has two-level column
# labels such as ("click", "count").
df_bin_counts=df.groupby("device_id").agg(["count","sum"])

In [76]:
# Rows without a click per device: total rows minus the click total.
df_bin_counts["No_Click"] = (
    df_bin_counts[("click", "count")] - df_bin_counts[("click", "sum")]
)

In [77]:
# Display the per-device statistics (count, sum, No_Click).
df_bin_counts

Unnamed: 0_level_0,click,click,No_Click
Unnamed: 0_level_1,count,sum,Unnamed: 3_level_1
device_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
00000414,1,0,1
00000715,1,0,1
00000919,1,0,1
00000b7c,1,0,1
00001237,6,0,6
...,...,...,...
ffffd970,1,0,1
ffffd98b,8,1,7
ffffde2c,2,1,1
ffffe321,4,0,4


In [78]:
# Show the devices ordered by their click total, largest first. The sorted
# frame is only displayed, not assigned, so df_bin_counts keeps its order.
df_bin_counts.sort_values(by=[("click","sum")],ascending=False)

Unnamed: 0_level_0,click,click,No_Click
Unnamed: 0_level_1,count,sum,Unnamed: 3_level_1
device_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2
a99f214a,33358308,5809415,27548893
0f7c61dc,21356,16190,5166
c357dbff,19667,12469,7198
3cdb4052,3069,2421,648
afeffc18,9654,2211,7443
...,...,...,...
611ee133,1,0,1
611edaa9,1,0,1
611ed07b,2,0,2
611ece3f,1,0,1
