# Aggregating over users 

In [16]:
import constants
import numpy as np
import pandas as pd

In [17]:
# Read the cleaned sample CSV (several rows per parkinguser_id) from the working directory.
data_path = "assignment-sample-data-cleaned.csv"
df = pd.read_csv(filepath_or_buffer=data_path)

In [18]:
from IPython.display import display
# Let wide frames print on a single line: allow up to 1000 characters per output row.
pd.options.display.width = 1000

In [19]:
# Preview the first five rows of the raw frame, before any encoding.
df.head(n=5)

Unnamed: 0,parkinguser_id,area_type,parking_fee_sek,parking_duration,weekday,registered_cars,n_parkings,parking_activity,account_type
0,fake_c6a7eb1142,OnStreet,49.5,2.79,1,2,339,0.3,corporate
1,fake_6e03a1bfc8,OnStreet,61.3,2.05,1,3,80,0.04,private
2,fake_f6adf105e7,OnStreet,20.58,19.13,1,1,159,0.25,corporate
3,fake_d317e5c7a5,SurfaceLot,2.0,0.24,1,6,1965,1.23,corporate
4,fake_bdbd8ad11c,OnStreet,60.0,20.34,0,9,2061,0.84,corporate


In [20]:
# Encode the two categorical columns with the lookup tables from `constants`.
# Both encodings are computed into temporaries and validated before `df` is modified,
# so a failed run (for example re-executing this cell on an already-encoded frame)
# cannot leave `df` half-converted.

# Map account_type to 0,1
account_type_mapping = constants.ACCOUNT_TYPE_MAPPING
account_type_codes = df["account_type"].map(account_type_mapping)

# Map area_type to the index of its one-hot position
area_mapping = constants.AREA_TYPE_MAPPING
area_type_codes = df["area_type"].map(area_mapping)

# Series.map yields NaN for values absent from the mapping; fail loudly instead of
# carrying NaN forward (account_type) or hitting an obscure float-index error (area_type).
for column, codes in (("account_type", account_type_codes), ("area_type", area_type_codes)):
    unmapped = df.loc[codes.isna(), column].unique()
    if len(unmapped) > 0:
        raise ValueError(
            f"Column {column!r} has values missing from its mapping "
            f"(was this cell already run?): {unmapped.tolist()}"
        )

# Build the identity matrix once and look rows up in it, rather than allocating a new
# 7x7 matrix per row. `.copy()` gives every row its own independent 7-element vector.
n_area_types = 7
one_hot_rows = np.eye(n_area_types, dtype=int)

df["account_type"] = account_type_codes
df["area_type"] = area_type_codes.apply(lambda code: one_hot_rows[code].copy())

In [21]:
# Show (rows, columns) of the encoded frame.
n_rows, n_cols = df.shape
print((n_rows, n_cols))

(81428, 9)


In [22]:
# Preview the first five rows again, now with area_type and account_type encoded.
df.head(n=5)

Unnamed: 0,parkinguser_id,area_type,parking_fee_sek,parking_duration,weekday,registered_cars,n_parkings,parking_activity,account_type
0,fake_c6a7eb1142,"[0, 0, 0, 1, 0, 0, 0]",49.5,2.79,1,2,339,0.3,1
1,fake_6e03a1bfc8,"[0, 0, 0, 1, 0, 0, 0]",61.3,2.05,1,3,80,0.04,0
2,fake_f6adf105e7,"[0, 0, 0, 1, 0, 0, 0]",20.58,19.13,1,1,159,0.25,1
3,fake_d317e5c7a5,"[0, 1, 0, 0, 0, 0, 0]",2.0,0.24,1,6,1965,1.23,1
4,fake_bdbd8ad11c,"[0, 0, 0, 1, 0, 0, 0]",60.0,20.34,0,9,2061,0.84,1


## Aggregate over user

In [23]:
### Aggregate over parkinguser_id
def _sum_one_hot(vectors):
    """Stack a group's area_type vectors row-wise and add them up element-wise."""
    return np.vstack(vectors).sum(axis=0)


def _scale_to_unit_norm(vector):
    """Divide a vector by its Euclidean norm; an all-zero vector is returned unchanged."""
    length = np.linalg.norm(vector)
    return vector / length if length > 0 else vector


# Every int64/float64 column is averaged per user; the id column is the grouping key
# and is therefore excluded.
mean_columns = [
    name for name, dtype in df.dtypes.items()
    if name != 'parkinguser_id' and dtype in ('int64', 'float64')
]
agg_dict = dict.fromkeys(mean_columns, 'mean')
agg_dict['area_type'] = _sum_one_hot  # per-user counts for each area type

# Group, aggregate, rescale each count vector to unit length, then turn the
# parkinguser_id index back into an ordinary column.
df_aggregated = (
    df.groupby('parkinguser_id')
    .agg(agg_dict)
    .assign(area_type=lambda frame: frame['area_type'].apply(_scale_to_unit_norm))
    .reset_index()
)

In [24]:
# Show two decimals in rendered frames.
pd.options.display.precision = 2

# Move account_type to the last position; all other columns keep their relative order.
leading_columns = df_aggregated.columns.drop('account_type').tolist()
df_aggregated = df_aggregated[leading_columns + ['account_type']]
df_aggregated.head()

Unnamed: 0,parkinguser_id,parking_fee_sek,parking_duration,weekday,registered_cars,n_parkings,parking_activity,area_type,account_type
0,fake_00f8011540,34.74,1.48,0.5,5.0,50.0,0.05,"[0.0, 0.9642088512100443, 0.10713431680111603,...",0.0
1,fake_06c1cbc936,11.56,3.3,0.65,1.0,17.0,0.04,"[0.0, 1.0, 0.0, 0.0, 0.0, 0.0, 0.0]",0.0
2,fake_0703d99383,10.32,1.25,0.86,8.0,381.0,0.2,"[0.005728308005379121, 0.0801963120753077, 0.0...",0.0
3,fake_07ec98fe5f,11.52,1.34,0.61,4.0,31.0,0.02,"[0.035646150289975954, 0.9980922081193268, 0.0...",0.0
4,fake_0886c9a74d,66.73,5.87,0.93,2.0,154.0,0.17,"[0.00720899783080567, 0.10092596963127938, 0.0...",0.0


In [30]:
# Sanity check: every area_type entry should be a vector of length 7.
# Lengths are computed column-wise instead of walking all rows with iterrows();
# only offending rows (if any) are printed.
area_type_lengths = df['area_type'].map(len)
for _, row in df[area_type_lengths != 7].iterrows():
    print(row)

# Echo the vector length of the last row, as before. Skipped for an empty frame,
# where the original loop variable would have been undefined (NameError).
if len(df) > 0:
    print(len(df['area_type'].iloc[-1]))


7


In [32]:
# Spread each 7-element area_type vector over seven scalar columns, aligned on df's index.
area_type_columns = [
    'area_type_0', 'area_type_1', 'area_type_2', 'area_type_3',
    'area_type_4', 'area_type_5', 'area_type_6',
]
area_type_matrix = pd.DataFrame(df['area_type'].tolist(), index=df.index)
df[area_type_columns] = area_type_matrix
df.head()


Unnamed: 0,parkinguser_id,area_type,parking_fee_sek,parking_duration,weekday,registered_cars,n_parkings,parking_activity,account_type,area_type_0,area_type_1,area_type_2,area_type_3,area_type_4,area_type_5,area_type_6
0,fake_c6a7eb1142,"[0, 0, 0, 1, 0, 0, 0]",49.5,2.79,1,2,339,0.3,1,0,0,0,1,0,0,0
1,fake_6e03a1bfc8,"[0, 0, 0, 1, 0, 0, 0]",61.3,2.05,1,3,80,0.04,0,0,0,0,1,0,0,0
2,fake_f6adf105e7,"[0, 0, 0, 1, 0, 0, 0]",20.58,19.13,1,1,159,0.25,1,0,0,0,1,0,0,0
3,fake_d317e5c7a5,"[0, 1, 0, 0, 0, 0, 0]",2.0,0.24,1,6,1965,1.23,1,0,1,0,0,0,0,0
4,fake_bdbd8ad11c,"[0, 0, 0, 1, 0, 0, 0]",60.0,20.34,0,9,2061,0.84,1,0,0,0,1,0,0,0


In [26]:
# Report the size of the per-user frame and how many users fall under each account_type value.
n_users, n_columns = df_aggregated.shape
print((n_users, n_columns))
account_type_counts = df_aggregated["account_type"].value_counts()
print(account_type_counts)

(300, 9)
account_type
0.0    250
1.0     50
Name: count, dtype: int64


In [27]:
# Inspect the container type of the area_type column and of its first entry (label 0).
area_type_column = df_aggregated['area_type']
print(type(area_type_column))
print(type(area_type_column.loc[0]))

<class 'pandas.core.series.Series'>
<class 'numpy.ndarray'>


## Save processed data

In [28]:
# Saving is opt-in: set save_df to True to write the per-user table to a CSV file
# (without the index) in the working directory.
# NOTE(review): area_type holds NumPy arrays, which to_csv presumably writes in their
# string form; confirm the file can be parsed back as needed before relying on it.
save_df = False
if save_df:
    path = "aggregated-user-data.csv"
    df_aggregated.to_csv(path_or_buf=path, index=False)

## Future work

- We can shorten the `area_type` vector by clustering similar areas and dropping the least common ones. Reducing its dimensionality can improve the performance of models that rely on distance metrics, such as KNN.