In [8]:
!pip install pandas numpy scikit-learn nltk faiss-cpu transformers
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import os



In [9]:
data_path = './dataset/ml100k/train.txt'

# Fail fast with a real exception when the input file is missing.
# exit() inside a notebook targets the kernel itself rather than reporting a
# clear, catchable error for this cell, so raise FileNotFoundError instead and
# name the path that was actually checked.
if not os.path.exists(data_path):
    raise FileNotFoundError(
        f"Train data not found. Please make sure the train.txt file exists at {data_path}"
    )

# Load the data
print("Loading training data...")
# File is space-separated with no timestamp
# names= already fixes the column set and order, so no re-selection is needed.
ratings = pd.read_csv(
    data_path,
    sep=' ',
    header=None,
    names=['user_id', 'movie_id', 'rating'],
)

# Show the loaded frame as the cell output.
ratings

Loading training data...


Unnamed: 0,user_id,movie_id,rating
0,0,2656,1
1,0,1466,1
2,0,964,1
3,0,2218,1
4,0,2586,1
...,...,...,...
77784,5969,1631,0
77785,5969,535,1
77786,5969,1954,1
77787,5969,3115,1


In [10]:
# Users x movies matrix built from `ratings`; any user/movie pair that does not
# occur in `ratings` is filled with 0. pivot_table's default aggregation (mean)
# applies if the same pair occurs more than once.
# NOTE(review): an explicit rating of 0 and "no rating" both end up as 0 here —
# confirm that this is intended.
user_item_matrix = (
    ratings
    .pivot_table(index='user_id', columns='movie_id', values='rating')
    .fillna(0)
)

# Pairwise cosine similarity between the user rows of the matrix.
user_similarity = cosine_similarity(user_item_matrix)

# Same values, labelled with user_id on both axes for .loc lookups.
user_similarity_df = pd.DataFrame(
    data=user_similarity,
    index=user_item_matrix.index,
    columns=user_item_matrix.index,
)

user_similarity_df

user_id,0,1,2,3,4,5,6,7,8,9,...,5960,5961,5962,5963,5964,5965,5966,5967,5968,5969
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.182574,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000
1,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000
2,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.140028,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000
4,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5965,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.032564,...,0.0,0.075755,0.0,0.0,0.0,1.000000,0.045083,0.0,0.0,0.143315
5966,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.045083,1.000000,0.0,0.0,0.000000
5967,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,1.0,0.0,0.000000
5968,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.104257,...,0.0,0.000000,0.0,0.0,0.0,0.000000,0.000000,0.0,1.0,0.000000


In [11]:
SIMILARITY_THRESHOLD = 0.35
social_data_by_threshold = []

for trustor in user_item_matrix.index:
    # Rank every user by similarity to this trustor, most similar first.
    ranked = user_similarity_df.loc[trustor].sort_values(ascending=False)

    # Leave the trustor's own entry out of the candidates.
    others = ranked[ranked.index != trustor]

    # Every remaining user strictly above the threshold becomes a trustee;
    # pairs are recorded in ranked order as (trustor, trustee, similarity).
    social_data_by_threshold.extend(
        (trustor, trustee, score)
        for trustee, score in others[others > SIMILARITY_THRESHOLD].items()
    )

social_df_by_threshold = pd.DataFrame(
    social_data_by_threshold,
    columns=['trustor', 'trustee', 'similarity'],
)

social_df_by_threshold

Unnamed: 0,trustor,trustee,similarity
0,0,2586,0.408248
1,0,452,0.408248
2,0,1398,0.408248
3,0,2955,0.408248
4,1,4302,0.447214
...,...,...,...
72977,5968,3442,0.447214
72978,5968,4805,0.353553
72979,5968,26,0.353553
72980,5968,5576,0.353553


In [12]:
TOP_K = 10
social_data_by_topk = []

for trustor in user_item_matrix.index:
    # Rank every user by similarity to this trustor, most similar first.
    ranked = user_similarity_df.loc[trustor].sort_values(ascending=False)

    # Leave the trustor's own entry out of the candidates.
    others = ranked[ranked.index != trustor]

    # Keep the first TOP_K users of the ranking as trustees, recorded as
    # (trustor, trustee, similarity).
    # NOTE(review): head() applies no minimum score, so users with similarity 0
    # can be included when fewer than TOP_K users overlap — confirm intended.
    social_data_by_topk.extend(
        (trustor, trustee, score)
        for trustee, score in others.head(TOP_K).items()
    )

social_df_by_topk = pd.DataFrame(
    social_data_by_topk,
    columns=['trustor', 'trustee', 'similarity'],
)

social_df_by_topk

Unnamed: 0,trustor,trustee,similarity
0,0,2586,0.408248
1,0,452,0.408248
2,0,1398,0.408248
3,0,2955,0.408248
4,0,3829,0.288675
...,...,...,...
59695,5969,5462,0.229416
59696,5969,1765,0.229416
59697,5969,579,0.229416
59698,5969,1813,0.229416


In [13]:
# Stack the threshold-based pairs ahead of the top-K pairs, then keep a single
# row per (trustor, trustee); keep='first' retains the earliest occurrence in
# that stacked order. The index is renumbered by the concat only, so labels
# after de-duplication are the pre-deduplication positions.
social_df = pd.concat(
    [social_df_by_threshold, social_df_by_topk],
    ignore_index=True,
).drop_duplicates(subset=['trustor', 'trustee'], keep='first')

social_df

Unnamed: 0,trustor,trustee,similarity
0,0,2586,0.408248
1,0,452,0.408248
2,0,1398,0.408248
3,0,2955,0.408248
4,1,4302,0.447214
...,...,...,...
132677,5969,5462,0.229416
132678,5969,1765,0.229416
132679,5969,579,0.229416
132680,5969,1813,0.229416


In [14]:
# Single place for the output filename, used for both writing and reporting.
output_path = 'social.txt'

# Write one "trustor trustee similarity" line per relationship, space-separated.
# itertuples(name=None) yields plain tuples, avoiding the per-row Series that
# iterrows builds (which is slow and upcasts the integer ids to float).
# int()/float() keep the emitted text identical to the previous format.
with open(output_path, 'w') as f:
    f.writelines(
        f"{int(trustor)} {int(trustee)} {float(similarity)}\n"
        for trustor, trustee, similarity in social_df[
            ['trustor', 'trustee', 'similarity']
        ].itertuples(index=False, name=None)
    )

print(f"Done! Generated {len(social_df)} trust relationships.")
print(f"Social data saved to {output_path}")

Done! Generated 101504 trust relationships.
Social data saved to social.txt
