In [None]:
import pandas as pd
from scipy.spatial.distance import hamming

# Read the whole workbook into memory.
file_path = '/content/ALLUSERS_32_1HR_WITHOUT_THRESHOLD.xlsx'  # Update with your file path
df = pd.read_excel(file_path)

# Restrict the data to the first 10 distinct users (in order of first appearance).
selected_users = df['user'].unique()[:10]
df = df[df['user'].isin(selected_users)]

# Row masks for the two time windows.
# Train: all of 2018 plus January-September 2019. Test: January-February 2020.
in_train_2018 = (df['Year'] == 2018) & df['Month'].isin(list(range(1, 13)))
in_train_2019 = (df['Year'] == 2019) & df['Month'].isin(list(range(1, 10)))
in_test_2020 = (df['Year'] == 2020) & df['Month'].isin([1, 2])

# The calendar columns are only needed for the split, so they are removed afterwards.
# NOTE(review): the 15-minute cell further down also drops 'DayOfWeek'; if this
# workbook carries that column it is kept here and treated as part of the
# sequence - confirm against the file.
calendar_columns = ['Month', 'Year', 'date']
train_sequences = df[in_train_2019 | in_train_2018].drop(columns=calendar_columns)
test_sequences = df[in_test_2020].drop(columns=calendar_columns)

# Every remaining column except the user id is one position of the sequence.
sequence_columns = [name for name in train_sequences.columns if name != 'user']

# For each test user, find the train user whose sequences are closest on average.
#
# The Hamming distance between two rows is the fraction of sequence columns in
# which they differ. Averaging it over every (test row, train row) pair of two
# users equals the mean of one broadcast "not equal" comparison between the two
# users' sequence matrices, so the row-by-row `iterrows()` loops are replaced by
# a single NumPy comparison per user pair.

# Extract each train user's sequence matrix once instead of once per test user.
train_matrices = [
    (
        train_user,
        train_sequences.loc[train_sequences['user'] == train_user, sequence_columns].to_numpy(),
    )
    for train_user in train_sequences['user'].unique()
]

results = []
for test_user in test_sequences['user'].unique():  # Iterate over test users
    if not train_matrices:
        # Fail with an explicit message instead of min()'s "empty sequence" error.
        raise ValueError('train_sequences contains no users; cannot select a closest train user')

    test_matrix = test_sequences.loc[test_sequences['user'] == test_user, sequence_columns].to_numpy()

    user_distances = []
    for train_user, train_matrix in train_matrices:  # Iterate over train users
        if len(test_matrix) and len(train_matrix):
            # mismatches[i, j, k] is True when test row i and train row j differ
            # in sequence column k. The temporary array holds
            # n_test_rows * n_train_rows * n_columns booleans.
            mismatches = test_matrix[:, None, :] != train_matrix[None, :, :]
            avg_distance = mismatches.mean()
        else:
            # No row pairs to compare: rank this train user last.
            avg_distance = float('inf')
        user_distances.append({'TrainUser': train_user, 'AvgHammingDistance': avg_distance})

    # Pick the train user with the minimum average distance (first one wins on ties).
    closest_user = min(user_distances, key=lambda entry: entry['AvgHammingDistance'])
    results.append({
        'TestUser': test_user,
        'ClosestTrainUser': closest_user['TrainUser'],
        'MinAvgHammingDistance': closest_user['AvgHammingDistance']
    })

# Convert results to a DataFrame for better readability
results_df = pd.DataFrame(results)

# Display the results
print(results_df)

# Save the results to a file if needed
results_df.to_csv('/content/hamming_distance_results.csv', index=False)


   TestUser  ClosestTrainUser  MinAvgHammingDistance
0         0                 0               0.336799
1         1                 1               0.313300
2         2                 2               0.345145
3         3                 3               0.336910
4         4                 9               0.332911
5         5                 5               0.301490
6         6                 6               0.324759
7         7                 6               0.345333
8         8                 8               0.310081
9         9                 9               0.321395


In [None]:
import pandas as pd
from scipy.spatial.distance import hamming

# Read the whole workbook into memory.
file_path = '/content/ALLUSERS_32_1HR_WITHOUT_THRESHOLD.xlsx'  # Update with your file path
df = pd.read_excel(file_path)

# Restrict the data to the first 32 distinct users (in order of first appearance).
selected_users = df['user'].unique()[:32]
df = df[df['user'].isin(selected_users)]

# Row masks for the two time windows.
# Train: all of 2018 plus January-September 2019. Test: January-February 2020.
in_train_2018 = (df['Year'] == 2018) & df['Month'].isin(list(range(1, 13)))
in_train_2019 = (df['Year'] == 2019) & df['Month'].isin(list(range(1, 10)))
in_test_2020 = (df['Year'] == 2020) & df['Month'].isin([1, 2])

# The calendar columns are only needed for the split, so they are removed afterwards.
# NOTE(review): the 15-minute cell further down also drops 'DayOfWeek'; if this
# workbook carries that column it is kept here and treated as part of the
# sequence - confirm against the file.
calendar_columns = ['Month', 'Year', 'date']
train_sequences = df[in_train_2019 | in_train_2018].drop(columns=calendar_columns)
test_sequences = df[in_test_2020].drop(columns=calendar_columns)

# Every remaining column except the user id is one position of the sequence.
sequence_columns = [name for name in train_sequences.columns if name != 'user']

# For each test user, find the train user whose sequences are closest on average.
#
# The Hamming distance between two rows is the fraction of sequence columns in
# which they differ. Averaging it over every (test row, train row) pair of two
# users equals the mean of one broadcast "not equal" comparison between the two
# users' sequence matrices, so the row-by-row `iterrows()` loops are replaced by
# a single NumPy comparison per user pair.

# Extract each train user's sequence matrix once instead of once per test user.
train_matrices = [
    (
        train_user,
        train_sequences.loc[train_sequences['user'] == train_user, sequence_columns].to_numpy(),
    )
    for train_user in train_sequences['user'].unique()
]

results = []
for test_user in test_sequences['user'].unique():  # Iterate over test users
    if not train_matrices:
        # Fail with an explicit message instead of min()'s "empty sequence" error.
        raise ValueError('train_sequences contains no users; cannot select a closest train user')

    test_matrix = test_sequences.loc[test_sequences['user'] == test_user, sequence_columns].to_numpy()

    user_distances = []
    for train_user, train_matrix in train_matrices:  # Iterate over train users
        if len(test_matrix) and len(train_matrix):
            # mismatches[i, j, k] is True when test row i and train row j differ
            # in sequence column k. The temporary array holds
            # n_test_rows * n_train_rows * n_columns booleans.
            mismatches = test_matrix[:, None, :] != train_matrix[None, :, :]
            avg_distance = mismatches.mean()
        else:
            # No row pairs to compare: rank this train user last.
            avg_distance = float('inf')
        user_distances.append({'TrainUser': train_user, 'AvgHammingDistance': avg_distance})

    # Pick the train user with the minimum average distance (first one wins on ties).
    closest_user = min(user_distances, key=lambda entry: entry['AvgHammingDistance'])
    results.append({
        'TestUser': test_user,
        'ClosestTrainUser': closest_user['TrainUser'],
        'MinAvgHammingDistance': closest_user['AvgHammingDistance']
    })

# Convert results to a DataFrame for better readability
results_df = pd.DataFrame(results)

# Display the results
print(results_df)

# Save the results to a file if needed
results_df.to_csv('/content/hamming_distance_results_32.csv', index=False)


    TestUser  ClosestTrainUser  MinAvgHammingDistance
0          0                18               0.330322
1          1                15               0.299651
2          2                20               0.343505
3          3                29               0.333274
4          4                20               0.330633
5          5                 5               0.301490
6          6                20               0.313453
7          7                15               0.332117
8          8                 8               0.310081
9          9                20               0.321107
10        10                 9               0.393562
11        11                15               0.355877
12        12                29               0.317752
13        13                21               0.396041
14        14                15               0.320987
15        15                15               0.240694
16        16                15               0.284560
17        17                

In [None]:
import pandas as pd
from scipy.spatial.distance import hamming

# Read the whole workbook into memory.
file_path = '/content/ALLUSERS32_15MIN_WITHOUTTHREHOLD.xlsx'  # Update with your file path
df = pd.read_excel(file_path)

# Restrict the data to the first 32 distinct users (in order of first appearance).
selected_users = df['user'].unique()[:32]
df = df[df['user'].isin(selected_users)]

# Row masks for the two time windows.
# Train: all of 2018 plus January-September 2019. Test: January-February 2020.
in_train_2018 = (df['Year'] == 2018) & df['Month'].isin(list(range(1, 13)))
in_train_2019 = (df['Year'] == 2019) & df['Month'].isin(list(range(1, 10)))
in_test_2020 = (df['Year'] == 2020) & df['Month'].isin([1, 2])

# The calendar columns are only needed for the split, so they are removed
# afterwards and do not take part in the distance computation.
calendar_columns = ['Month', 'Year', 'date', 'DayOfWeek']
train_sequences = df[in_train_2019 | in_train_2018].drop(columns=calendar_columns)
test_sequences = df[in_test_2020].drop(columns=calendar_columns)

# Every remaining column except the user id is one position of the sequence.
sequence_columns = [name for name in train_sequences.columns if name != 'user']

# For each test user, find the train user whose sequences are closest on average.
#
# The Hamming distance between two rows is the fraction of sequence columns in
# which they differ. Averaging it over every (test row, train row) pair of two
# users equals the mean of one broadcast "not equal" comparison between the two
# users' sequence matrices, so the row-by-row `iterrows()` loops are replaced by
# a single NumPy comparison per user pair.

# Extract each train user's sequence matrix once instead of once per test user.
train_matrices = [
    (
        train_user,
        train_sequences.loc[train_sequences['user'] == train_user, sequence_columns].to_numpy(),
    )
    for train_user in train_sequences['user'].unique()
]

results = []
for test_user in test_sequences['user'].unique():  # Iterate over test users
    if not train_matrices:
        # Fail with an explicit message instead of min()'s "empty sequence" error.
        raise ValueError('train_sequences contains no users; cannot select a closest train user')

    test_matrix = test_sequences.loc[test_sequences['user'] == test_user, sequence_columns].to_numpy()

    user_distances = []
    for train_user, train_matrix in train_matrices:  # Iterate over train users
        if len(test_matrix) and len(train_matrix):
            # mismatches[i, j, k] is True when test row i and train row j differ
            # in sequence column k. The temporary array holds
            # n_test_rows * n_train_rows * n_columns booleans.
            mismatches = test_matrix[:, None, :] != train_matrix[None, :, :]
            avg_distance = mismatches.mean()
        else:
            # No row pairs to compare: rank this train user last.
            avg_distance = float('inf')
        user_distances.append({'TrainUser': train_user, 'AvgHammingDistance': avg_distance})

    # Pick the train user with the minimum average distance (first one wins on ties).
    closest_user = min(user_distances, key=lambda entry: entry['AvgHammingDistance'])
    results.append({
        'TestUser': test_user,
        'ClosestTrainUser': closest_user['TrainUser'],
        'MinAvgHammingDistance': closest_user['AvgHammingDistance']
    })

# Convert results to a DataFrame for better readability
results_df = pd.DataFrame(results)

# Display the results
print(results_df)

# Save the results to a file if needed
results_df.to_csv('/content/hamming_distance_results_32_15.csv', index=False)
