In [1]:
import os
import pandas as pd
import numpy as np

In [2]:
folder_path = 'C:/Users/Enrico/Desktop/Datasets/Activity Logs'

dfs = []

for file_name in os.listdir(folder_path):
    if file_name.endswith(".csv"):
        file_path = os.path.join(folder_path, file_name)
        
        # Loading the CSV file into a DataFrame
        df = pd.read_csv(file_path)
        
        # Adding the DataFrame to the list
        dfs.append(df)

In [3]:
merged_df = pd.concat(dfs, ignore_index=True)
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113923735 entries, 0 to 113923734
Data columns (total 12 columns):
 #   Column             Dtype  
---  ------             -----  
 0   timestamp          object 
 1   currentLocation    object 
 2   participantId      float64
 3   currentMode        object 
 4   hungerStatus       object 
 5   sleepStatus        object 
 6   apartmentId        float64
 7   availableBalance   float64
 8   jobId              float64
 9   financialStatus    object 
 10  dailyFoodBudget    float64
 11  weeklyExtraBudget  float64
dtypes: float64(6), object(6)
memory usage: 10.2+ GB


In [4]:
activity_logs_df = merged_df
activity_logs_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 113923735 entries, 0 to 113923734
Data columns (total 12 columns):
 #   Column             Dtype  
---  ------             -----  
 0   timestamp          object 
 1   currentLocation    object 
 2   participantId      float64
 3   currentMode        object 
 4   hungerStatus       object 
 5   sleepStatus        object 
 6   apartmentId        float64
 7   availableBalance   float64
 8   jobId              float64
 9   financialStatus    object 
 10  dailyFoodBudget    float64
 11  weeklyExtraBudget  float64
dtypes: float64(6), object(6)
memory usage: 10.2+ GB


In [8]:
activity_logs_df["financialStatus"].unique()

array(['Stable', 'Unstable', 'Unknown', nan], dtype=object)

In [5]:
activity_logs_df["currentMode"].unique()

array(['AtHome', 'Transport', 'AtRecreation', 'AtRestaurant', 'AtWork',
       nan], dtype=object)

In [6]:
activity_logs_df["currentMode"].value_counts()

AtHome          69236289
AtWork          27239602
Transport       10246066
AtRecreation     5436347
AtRestaurant     1765430
Name: currentMode, dtype: int64

In [7]:
# Pivot della tabella per ottenere le frequenze delle attività per ogni participantId
df_pivot = activity_logs_df.pivot_table(index='participantId', columns='currentMode', aggfunc='size', fill_value=0)

In [8]:
df_pivot

currentMode,AtHome,AtRecreation,AtRestaurant,AtWork,Transport
participantId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0.0,70724,5665,1888,30816,20185
1.0,75175,8458,1324,30816,13505
2.0,65051,5091,1716,30816,26604
3.0,74014,7478,1812,30816,15158
4.0,77428,9350,1732,31138,9630
...,...,...,...,...,...
1006.0,79789,6102,2768,30816,9802
1007.0,78303,11575,2652,30816,5931
1008.0,80468,2798,1768,31137,13106
1009.0,78806,6904,2723,31137,9707


In [10]:
import numpy as np
import pandas as pd
from scipy.spatial.distance import pdist, squareform

def calculate_dissimilarities(data, metric):
    """
    Calcola una matrice di dissimilarità per un dato metrica.
    """
    # Calcola la matrice di dissimilarità e convertila in forma quadrata
    return squareform(pdist(data, metric=metric))

def top_n_dissimilar_pairs(data, n=10):
    """
    Trova le top N coppie più dissimili nel dataset usando diverse metriche di dissimilarità.
    """
    metrics = ['euclidean', 'cityblock', 'cosine']
    results = {}
    
    for metric in metrics:
        # Calcola la dissimilarità
        dissimilarity_matrix = calculate_dissimilarities(data, metric)
        dissimilarity_df = pd.DataFrame(dissimilarity_matrix)
        
        # Rimuovi le coppie duplicate e auto-comparazioni
        dissimilarity_df = dissimilarity_df.where(np.triu(np.ones(dissimilarity_df.shape), k=1).astype(bool))
        
        # Trasforma in un elenco di coppie dissimili
        dissimilar_pairs = dissimilarity_df.stack().reset_index()
        dissimilar_pairs.columns = ['ParticipantId_1', 'ParticipantId_2', 'Dissimilarity']
        
        # dissimilarità del coseno = 1 - similarità
        if metric == 'cosine':
            dissimilar_pairs['Dissimilarity'] = 1 - dissimilar_pairs['Dissimilarity']
        
        # Ordina le coppie per dissimilarità decrescente e seleziona le top N
        results[metric] = dissimilar_pairs.sort_values(by='Dissimilarity', ascending=False).head(n)
    
    return results


# top 10 coppie più dissimili per ogni metrica
results = top_n_dissimilar_pairs(df_pivot, n=10)

for metric, result in results.items():
    print(f"Top 10 dissimilar pairs using {metric}:")
    print(result, "\n")


Top 10 dissimilar pairs using euclidean:
        ParticipantId_1  ParticipantId_2  Dissimilarity
210435              235              816  100001.283197
210474              235              855  100000.424419
210222              235              603   99990.608369
133501              142              235   99990.322352
210196              235              577   99988.591959
210001              235              382   99983.422856
210387              235              768   99980.355140
210282              235              663   99978.102583
210221              235              602   99977.387063
210472              235              853   99974.711813 

Top 10 dissimilar pairs using cityblock:
        ParticipantId_1  ParticipantId_2  Dissimilarity
292949              350              875       128127.0
287307              342              541       128127.0
287368              342              602       128127.0
306943              372              602       128127.0
306944              

In [11]:
from IPython.display import display
from ipywidgets import Output
import pandas as pd


In [12]:
# `results` è il dizionario contenente i DataFrame delle top 10 coppie più dissimili per ogni metrica
# results = top_n_dissimilar_pairs(data, n=10)

for metric, result in results.items():
    
    out = Output()
    out.layout.height = '250px'    
    
    with out:
        print(f"Top 10 dissimilar pairs using {metric}:")
        display(result)
    
    display(out)


Output(layout=Layout(height='250px'))

Output(layout=Layout(height='250px'))

Output(layout=Layout(height='250px'))

In [33]:
participant_235_816 = activity_logs_df[(activity_logs_df["participantId"]==235) | (activity_logs_df["participantId"]==816) ]

In [37]:
participant_235_816.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 130501 entries, 235 to 113923664
Data columns (total 12 columns):
 #   Column             Non-Null Count   Dtype  
---  ------             --------------   -----  
 0   timestamp          130501 non-null  object 
 1   currentLocation    130501 non-null  object 
 2   participantId      130501 non-null  float64
 3   currentMode        130501 non-null  object 
 4   hungerStatus       130501 non-null  object 
 5   sleepStatus        130501 non-null  object 
 6   apartmentId        129278 non-null  float64
 7   availableBalance   130501 non-null  float64
 8   jobId              130501 non-null  float64
 9   financialStatus    130501 non-null  object 
 10  dailyFoodBudget    130501 non-null  float64
 11  weeklyExtraBudget  130501 non-null  float64
dtypes: float64(6), object(6)
memory usage: 12.9+ MB


In [38]:
participant_235_816['timestamp'] = pd.to_datetime(participant_235_816["timestamp"])

# Estrazione della parte della data (anno-mese-giorno)
participant_235_816['date'] = participant_235_816['timestamp'].dt.strftime('%Y-%m-%d')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  participant_235_816['timestamp'] = pd.to_datetime(participant_235_816["timestamp"])
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  participant_235_816['date'] = participant_235_816['timestamp'].dt.strftime('%Y-%m-%d')


In [39]:
participant_235_816.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 130501 entries, 235 to 113923664
Data columns (total 13 columns):
 #   Column             Non-Null Count   Dtype              
---  ------             --------------   -----              
 0   timestamp          130501 non-null  datetime64[ns, UTC]
 1   currentLocation    130501 non-null  object             
 2   participantId      130501 non-null  float64            
 3   currentMode        130501 non-null  object             
 4   hungerStatus       130501 non-null  object             
 5   sleepStatus        130501 non-null  object             
 6   apartmentId        129278 non-null  float64            
 7   availableBalance   130501 non-null  float64            
 8   jobId              130501 non-null  float64            
 9   financialStatus    130501 non-null  object             
 10  dailyFoodBudget    130501 non-null  float64            
 11  weeklyExtraBudget  130501 non-null  float64            
 12  date               130501

In [40]:
participant_235_816 = participant_235_816[['participantId', 'timestamp', 'currentMode', 'currentLocation', 'date', 'hungerStatus', 'sleepStatus']]

In [41]:
participant_235_816.to_csv("participant_235_816.csv", index=False)