# Aggregating over users 

In [1]:
import constants
import numpy as np
import pandas as pd

In [2]:
# Location of the cleaned sample data, relative to the notebook's working directory
data_path = "assignment-sample-data-cleaned.csv"

# Read the CSV into the DataFrame that the following cells transform
df = pd.read_csv(filepath_or_buffer=data_path)

In [3]:
# NOTE(review): `display` is not referenced in the cells visible here; the import
# would ideally live with the other imports at the top of the notebook.
from IPython.display import display
# Let pandas use up to 1000 characters per line so wide frames are not wrapped
pd.set_option('display.width', 1000)

In [4]:
# Preview the first rows of the freshly loaded data
df.head()

Unnamed: 0,parkinguser_id,area_type,parking_fee_sek,parking_duration,weekday,registered_cars,n_parkings,parking_activity,account_type
0,fake_e764113cde,SurfaceLot,11.5,1.906667,1,9,893,0.786092,private
1,fake_87f457ddef,SurfaceLot,13.0566,0.879167,1,20,429,0.161036,private
2,fake_61d32bf6c5,OnStreet,173.33,8.662778,1,2,121,0.2,corporate
3,fake_dcec7e9cf0,SurfaceLot,20.02,1.880278,1,7,321,0.370242,private
4,fake_256473c6ae,OnStreet,12.0,5.990278,1,1,584,0.804408,private


## Map "account_type" and "area_type" to numeric values

In [5]:
# Map account_type to 0 (private), 1 (corporate)
account_type_mapping = constants.ACCOUNT_TYPE_MAPPING
account_type_codes = df["account_type"].map(account_type_mapping)
# Series.map yields NaN for any label missing from the mapping. That also happens
# when this cell is re-run on an already encoded column, so fail loudly instead of
# letting NaN labels flow into the aggregation.
if account_type_codes.isna().any():
    raise ValueError(
        "account_type contains values that are not in constants.ACCOUNT_TYPE_MAPPING "
        "(was this cell already run on this DataFrame?)"
    )
df["account_type"] = account_type_codes
print("Count of values in account_type:", df["account_type"].value_counts())

# Map area_type to one-hot encoding
area_mapping = constants.AREA_TYPE_MAPPING
area_type_codes = df["area_type"].map(area_mapping)
if area_type_codes.isna().any():
    raise ValueError(
        "area_type contains values that are not in constants.AREA_TYPE_MAPPING"
    )
# Build the 7x7 identity matrix once; row i is the one-hot vector for area code i.
# (Previously the matrix was rebuilt for every row of the DataFrame.)
one_hot_rows = np.eye(7, dtype=int)
df["area_type"] = area_type_codes.apply(lambda code: one_hot_rows[code])

Count of values in account_type: account_type
0    52882
1    28415
Name: count, dtype: int64


In [6]:
# Print the frame's dimensions, then preview the rows after the encoding step
print(df.shape)
df.head()

(81297, 9)


Unnamed: 0,parkinguser_id,area_type,parking_fee_sek,parking_duration,weekday,registered_cars,n_parkings,parking_activity,account_type
0,fake_e764113cde,"[0, 1, 0, 0, 0, 0, 0]",11.5,1.906667,1,9,893,0.786092,0
1,fake_87f457ddef,"[0, 1, 0, 0, 0, 0, 0]",13.0566,0.879167,1,20,429,0.161036,0
2,fake_61d32bf6c5,"[0, 0, 0, 1, 0, 0, 0]",173.33,8.662778,1,2,121,0.2,1
3,fake_dcec7e9cf0,"[0, 1, 0, 0, 0, 0, 0]",20.02,1.880278,1,7,321,0.370242,0
4,fake_256473c6ae,"[0, 0, 0, 1, 0, 0, 0]",12.0,5.990278,1,1,584,0.804408,0


## Aggregate over user

In [7]:
## Aggregate features over parkinguser_id

def _sum_one_hot(vectors):
    """Stack a group's one-hot vectors and add them up element-wise."""
    return np.vstack(vectors).sum(axis=0)


def _unit_length(vector):
    """Scale a vector to Euclidean norm 1; an all-zero vector is returned unchanged."""
    norm = np.linalg.norm(vector)
    return vector / norm if norm > 0 else vector


# Numeric columns are averaged per user
numeric_cols = [
    name for name in df.columns
    if name != 'parkinguser_id' and df[name].dtype in ['int64', 'float64']
]
agg_dict = dict.fromkeys(numeric_cols, 'mean')
# account_type is a label, so take the user's first value rather than a mean
agg_dict['account_type'] = 'first'
# area_type holds one-hot vectors; summing them counts parkings per area type
agg_dict['area_type'] = _sum_one_hot

# Group, aggregate, normalize the summed area_type vectors, and restore
# parkinguser_id as a regular column
df_aggregated = (
    df.groupby('parkinguser_id')
    .agg(agg_dict)
    .assign(area_type=lambda frame: frame['area_type'].apply(_unit_length))
    .reset_index()
)

In [8]:
# Check that all area_type vectors have length 7
expected_len = 7
vector_lengths = df_aggregated['area_type'].map(len)
malformed = df_aggregated[vector_lengths != expected_len]
if not malformed.empty:
    # Show the offending users and stop. Previously the success message below was
    # printed even when malformed vectors had been found.
    print(malformed)
    raise ValueError(
        f"{len(malformed)} area_type vectors do not have length {expected_len}"
    )
print("All area_type vectors have length 7")

All area_type vectors have length 7


In [9]:
# Create new columns from the numpy arrays: one column per vector component.
# The index is passed explicitly because pd.concat(axis=1) aligns rows on index
# labels; without it the new frame gets a fresh 0..n-1 index and only lines up
# when df_aggregated happens to carry the same default index.
area_type_expanded = pd.DataFrame(
    df_aggregated['area_type'].tolist(),
    index=df_aggregated.index,
    columns=[f'area_type_{i}' for i in range(7)],
)

# Append the new columns to the original DataFrame
df_aggregated = pd.concat([df_aggregated, area_type_expanded], axis=1)


In [10]:
# Render floats with three decimals in displayed frames
pd.set_option('display.float_format', lambda value: '%.3f' % value)

# Move the account_type label to the last column
label_col = 'account_type'
feature_cols = [name for name in df_aggregated.columns if name != label_col]
df_aggregated = df_aggregated[feature_cols + [label_col]]

# The raw vector column is no longer needed once it has been expanded
df_aggregated = df_aggregated.drop(columns=['area_type'])
df_aggregated.head()

Unnamed: 0,parkinguser_id,parking_fee_sek,parking_duration,weekday,registered_cars,n_parkings,parking_activity,area_type_0,area_type_1,area_type_2,area_type_3,area_type_4,area_type_5,area_type_6,account_type
0,fake_00f8011540,34.742,1.478,0.5,5.0,50.0,0.047,0.0,0.964,0.107,0.241,0.027,0.0,0.0,0
1,fake_06c1cbc936,11.556,3.299,0.647,1.0,17.0,0.037,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0
2,fake_0703d99383,10.32,1.248,0.864,8.0,381.0,0.196,0.006,0.08,0.009,0.997,0.0,0.0,0.0,0
3,fake_07ec98fe5f,11.516,1.34,0.613,4.0,31.0,0.021,0.036,0.998,0.036,0.036,0.0,0.0,0.0,0
4,fake_0886c9a74d,66.734,5.867,0.929,2.0,154.0,0.166,0.007,0.101,0.007,0.995,0.0,0.0,0.0,0


In [11]:
# Dimensions of the per-user table, then the number of users in each account_type class
print(df_aggregated.shape)
print(df_aggregated["account_type"].value_counts())

(300, 15)
account_type
0    250
1     50
Name: count, dtype: int64


## Save processed data

In [None]:
# Writing the CSV is opt-in: set save_df to True to export the aggregated table
save_df = False
if save_df:
    path = "aggregated-user-data.csv"
    # index=False leaves the row index out of the written file
    df_aggregated.to_csv(path, index=False)

## Future work

- The area_type vector could be shortened by clustering similar area types and dropping the least common ones. A lower-dimensional vector tends to improve models that rely on distance metrics, such as KNN.