# CA3

#### Import libraries

In [18]:
import pandas as pd
import numpy as np
from collections import deque, defaultdict
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

#### Import dataset

In [9]:
trust_path = "train_data_movie_trust.csv"
rating_path = "train_data_movie_rate.csv"

trust_df = pd.read_csv(trust_path).dropna()
rating_df = pd.read_csv(rating_path).dropna()

#### Define bfs function

In [10]:
def avg_trust_network_rating(user_id, item_id, trust_df, rating_df, max_depth=3):

    # Build adjacency list: who does each user trust?
    adjacency = defaultdict(list)
    for tr, te in zip(trust_df.user_id_trustor, trust_df.user_id_trustee):
        adjacency[tr].append(te)
        
    # Build lookup of ratings by (user, item)
    rating_lookup = {
        (u, i): r
        for u, i, r in zip(rating_df.user_id, rating_df.item_id, rating_df.label)
    }
    
    # BFS out to max_depth
    visited = set([user_id])
    queue   = deque([(user_id, 0)])
    ratings = []
    num = 0
    while queue:
        current, depth = queue.popleft()
        if depth >= max_depth:
            continue
        for nbr in adjacency.get(current, []):
            if nbr in visited:
                continue
            visited.add(nbr)
            # if this neighbor rated the item, collect it
            rt = rating_lookup.get((nbr, item_id))
            if rt is not None:
                num+=1
                ratings.append(rt)
            queue.append((nbr, depth + 1))
    mean = np.mean(ratings) if ratings else -1
    std = np.std(ratings) if ratings else -1
    the_min = np.min(ratings) if ratings else -1
    the_max = np.max(ratings) if ratings else -1
    median = np.median(ratings) if ratings else -1
    count = len(ratings)
    return [mean, std, count, median, the_max, the_min]


#### Test BFS function

In [None]:
for i in range(1,15):
    print("i =", i, "rating = ", avg_trust_network_rating(16,12, trust_df, rating_df, i))

i= 1 rating =  [3.5, 0.5, 2, 3.5, 4.0, 3.0]
i= 2 rating =  [2.5925925925925926, 1.0277360685564152, 27, 3.0, 4.0, 0.5]
i= 3 rating =  [2.6294117647058823, 0.8648459270347815, 85, 2.5, 4.0, 0.5]
i= 4 rating =  [2.5588235294117645, 0.8868234090133716, 119, 2.5, 4.0, 0.5]
i= 5 rating =  [2.5892857142857144, 0.8949618051364322, 140, 2.5, 4.0, 0.5]
i= 6 rating =  [2.5993150684931505, 0.8809627808709003, 146, 2.5, 4.0, 0.5]
i= 7 rating =  [2.6033333333333335, 0.896840131919966, 150, 2.5, 4.0, 0.5]
i= 8 rating =  [2.6059602649006623, 0.8944443528850059, 151, 2.5, 4.0, 0.5]
i= 9 rating =  [2.6059602649006623, 0.8944443528850059, 151, 2.5, 4.0, 0.5]
i= 10 rating =  [2.6059602649006623, 0.8944443528850059, 151, 2.5, 4.0, 0.5]
i= 11 rating =  [2.6059602649006623, 0.8944443528850059, 151, 2.5, 4.0, 0.5]
i= 12 rating =  [2.6059602649006623, 0.8944443528850059, 151, 2.5, 4.0, 0.5]
i= 13 rating =  [2.6059602649006623, 0.8944443528850059, 151, 2.5, 4.0, 0.5]
i= 14 rating =  [2.6059602649006623, 0.8944

#### Create trust graph

In [12]:
# 1) Precompute once: adjacency and rating lookup
adjacency = defaultdict(list)
for tr, te in zip(trust_df.user_id_trustor, trust_df.user_id_trustee):
    adjacency[tr].append(te)

rating_lookup = {
    (u, i): r
    for u, i, r in zip(rating_df.user_id, rating_df.item_id, rating_df.label)
}

#### Create a function to calculate users stats

In [13]:
# 2) A leaner trust‑stats function that reuses those dicts
def trust_stats(u, i, max_depth=3):
    visited = {u}
    queue   = deque([(u, 0)])
    vals    = []
    while queue:
        curr, d = queue.popleft()
        if d >= max_depth:
            continue
        for nbr in adjacency.get(curr, []):
            if nbr in visited:
                continue
            visited.add(nbr)
            r = rating_lookup.get((nbr, i))
            if r is not None:
                vals.append(r)
            queue.append((nbr, d+1))
    if not vals:
        return pd.Series({
            'trust_mean': -1,
            'trust_std': -1,
            'trust_count': 0,
            'trust_median': -1,
            'trust_max': -1,
            'trust_min': -1
        })
    return pd.Series({
        'trust_mean':  np.mean(vals),
        'trust_std':   np.std(vals),
        'trust_count': len(vals),
        'trust_median': np.median(vals),
        'trust_max':    np.max(vals),
        'trust_min':    np.min(vals)
    })

#### Adding new features to train data

In [14]:
# 3) Apply to your feature‑dataframe:
#    e.g. train_df, test_df, or train_merged
  # or test_df

def add_the_mean_count_var(df, max_range):
    for i in range(1,max_range):
        print(i)
        df[[f'trust_mean{i}',f'trust_std{i}',f'trust_count{i}', f'median{i}', f'min{i}', f'max{i}']] = (
            df.apply(lambda r: trust_stats(r.user_id, r.item_id, max_depth=i), axis=1)
        )
    return df

train_df = add_the_mean_count_var(rating_df, 10)

1
2
3
4
5
6
7
8
9


In [15]:
print(train_df.tail(5))

          id  user_id  item_id  label  trust_mean1  trust_std1  trust_count1  \
34293  34294     1508       84    3.5        -1.00       -1.00           0.0   
34294  34295     1508       17    4.0         2.75        0.75           2.0   
34295  34296     1508      669    1.0        -1.00       -1.00           0.0   
34296  34297     1508      686    2.5         3.00        0.00           1.0   
34297  34298     1508      806    3.5        -1.00       -1.00           0.0   

       median1  min1  max1  ...  trust_count8  median8  min8  max8  \
34293    -1.00  -1.0  -1.0  ...          57.0     3.00   4.0   0.5   
34294     2.75   3.5   2.0  ...         171.0     3.00   4.0   0.5   
34295    -1.00  -1.0  -1.0  ...           2.0     4.00   4.0   4.0   
34296     3.00   3.0   3.0  ...           4.0     2.25   3.5   1.5   
34297    -1.00  -1.0  -1.0  ...           6.0     3.25   4.0   0.5   

       trust_mean9  trust_std9  trust_count9  median9  min9  max9  
34293     2.807018    0.902017

#### Create X(features) and y(target) for our model

In [None]:
train_df = train_df.dropna() # Handle NaNs (basic strategy)
X = train_df.drop(columns=['id','user_id', 'item_id', 'label'])  # keep only numeric features
y = train_df['label']

#### Define our model

In [19]:
model = xgb.XGBRegressor(
    n_estimators=70,
    max_depth=4,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)

model.fit(X, y, verbose=False)

#### Read test data csv

In [20]:
test = pd.read_csv("test_data.csv")  # columns: user_id, item_id

test_df = add_the_mean_count_var(test, 10)

1
2
3
4
5
6
7
8
9


In [21]:
print(test_df.head())

   id  user_id  item_id  trust_mean1  trust_std1  trust_count1  median1  min1  \
0   1        6      211         3.50        0.00           1.0     3.50   3.5   
1   2        6        7         2.00        0.00           1.0     2.00   2.0   
2   3       16        2         3.00        0.00           2.0     3.00   3.0   
3   4       16        4         3.25        0.75           2.0     3.25   4.0   
4   5       16       11         3.25        0.75           2.0     3.25   4.0   

   max1  trust_mean2  ...  trust_count8  median8  min8  max8  trust_mean9  \
0   3.5     3.250000  ...         109.0      3.5   4.0   1.0     3.315315   
1   2.0     2.000000  ...         214.0      3.5   4.0   0.5     3.227064   
2   3.0     3.166667  ...         175.0      3.5   4.0   0.5     3.242857   
3   2.5     3.281250  ...         111.0      3.5   4.0   0.5     3.139640   
4   2.5     3.431818  ...         149.0      3.5   4.0   0.5     3.359060   

   trust_std9  trust_count9  median9  min9  max9  

#### Predict test data entries and store it in a csv

In [22]:
# Fill NaNs just in case
X_test = test_df.drop(columns=['user_id', 'item_id', 'id'], errors='ignore').fillna(-1)

# Predict residual
final_preds = model.predict(X_test)

# Prepare submission DataFrame
submission = test_df.copy()
submission['label'] = final_preds

# Save to CSV (preferred for submissions)
submission[['id','label']].to_csv("submission_with_trust_features_2.csv", index=False)