# create the dataset

In [None]:
import pandas as pd
import numpy as np
from collections import deque, defaultdict
from sklearn.model_selection import train_test_split

trust_path = "train_data_movie_trust.csv"
rating_path = "train_data_movie_rate.csv"

trust_df = pd.read_csv(trust_path).dropna()
rating_df = pd.read_csv(rating_path).dropna()


In [24]:
import pandas as pd
import numpy as np
from collections import deque, defaultdict

# 1) Precompute once: adjacency and rating lookup
adjacency = defaultdict(list)
for tr, te in zip(trust_df.user_id_trustor, trust_df.user_id_trustee):
    adjacency[tr].append(te)

rating_lookup = {
    (u, i): r
    for u, i, r in zip(rating_df.user_id, rating_df.item_id, rating_df.label)
}

# 2) A leaner trust‑stats function that reuses those dicts
def trust_stats(u, i, max_depth=3):
    visited = {u}
    queue   = deque([(u, 0)])
    vals    = []
    while queue:
        curr, d = queue.popleft()
        if d >= max_depth:
            continue
        for nbr in adjacency.get(curr, []):
            if nbr in visited:
                continue
            visited.add(nbr)
            r = rating_lookup.get((nbr, i))
            if r is not None:
                vals.append(r)
            queue.append((nbr, d+1))
    if not vals:
        return pd.Series({
            'trust_mean': -1,
            'trust_std': -1,
            'trust_count': 0,
            'trust_median': -1,
            'trust_max': -1,
            'trust_min': -1
        })
    return pd.Series({
        'trust_mean':  np.mean(vals),
        'trust_std':   np.std(vals),
        'trust_count': len(vals),
        'trust_median': np.median(vals),
        'trust_max':    np.max(vals),
        'trust_min':    np.min(vals)
    })


# 3) Apply to your feature‑dataframe:
#    e.g. train_df, test_df, or train_merged
  # or test_df

def add_the_mean_count_var(df, max_range):
    for i in range(1,max_range):
        print(i)
        df[[f'trust_mean{i}',f'trust_std{i}',f'trust_count{i}', f'median{i}', f'min{i}', f'max{i}']] = (
            df.apply(lambda r: trust_stats(r.user_id, r.item_id, max_depth=i), axis=1)
        )
    return df




In [25]:
train_df = add_the_mean_count_var(rating_df, max_range=15)


1
2
3
4
5
6
7
8
9
10
11
12
13
14


In [26]:
print(train_df.tail(5))

          id  user_id  item_id  label  trust_mean1  trust_std1  trust_count1  \
34293  34294     1508       84    3.5        -1.00       -1.00           0.0   
34294  34295     1508       17    4.0         2.75        0.75           2.0   
34295  34296     1508      669    1.0        -1.00       -1.00           0.0   
34296  34297     1508      686    2.5         3.00        0.00           1.0   
34297  34298     1508      806    3.5        -1.00       -1.00           0.0   

       median1  min1  max1  ...  trust_count13  median13  min13  max13  \
34293    -1.00  -1.0  -1.0  ...           57.0      3.00    4.0    0.5   
34294     2.75   3.5   2.0  ...          171.0      3.00    4.0    0.5   
34295    -1.00  -1.0  -1.0  ...            2.0      4.00    4.0    4.0   
34296     3.00   3.0   3.0  ...            4.0      2.25    3.5    1.5   
34297    -1.00  -1.0  -1.0  ...            6.0      3.25    4.0    0.5   

       trust_mean14  trust_std14  trust_count14  median14  min14  max14  


In [38]:
train_df = train_df.dropna()
X_train = train_df.drop(columns=['id','user_id', 'item_id', 'label'])  # keep only numeric features
y_train = train_df['label']


# Handle NaNs (basic strategy)


In [27]:
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score

model = xgb.XGBRegressor(
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    n_jobs=-1
)
param_grid = {
    'max_depth': [3, 4, 5, 6, 7 ],
    'n_estimators': [50, 75,100, 125],
    'learning_rate': [ 0.075, 0.05, 0.1],
}

grid = GridSearchCV(
    model,
    param_grid,
    scoring='neg_root_mean_squared_error',
    cv=5,
    verbose=1,
    n_jobs=-1
)

grid.fit(X_train, y_train)
print("Best params:", grid.best_params_)


Fitting 5 folds for each of 60 candidates, totalling 300 fits
Best params: {'learning_rate': 0.1, 'max_depth': 7, 'n_estimators': 125}


In [39]:
model = xgb.XGBRegressor(
    n_estimators=200,
    max_depth=5,
    learning_rate=0.05,
    subsample=0.8,
    colsample_bytree=0.6,
    reg_alpha=1.0,      # L1 regularization
    reg_lambda=3.0,     # L2 regularization
    min_child_weight=10,
    gamma=0.1,
    random_state=42,
    n_jobs=-1
)
model.fit(X_train, y_train)

In [31]:
test = pd.read_csv("test_data.csv")  # columns: user_id, item_id



test_df = add_the_mean_count_var(test, 15)


1
2
3
4
5
6
7
8
9
10
11
12
13
14


In [34]:
print(test_df.head())

   id  user_id  item_id  trust_mean1  trust_std1  trust_count1  median1  min1  \
0   1        6      211         3.50        0.00           1.0     3.50   3.5   
1   2        6        7         2.00        0.00           1.0     2.00   2.0   
2   3       16        2         3.00        0.00           2.0     3.00   3.0   
3   4       16        4         3.25        0.75           2.0     3.25   4.0   
4   5       16       11         3.25        0.75           2.0     3.25   4.0   

   max1  trust_mean2  ...  trust_count13  median13  min13  max13  \
0   3.5     3.250000  ...          112.0       3.5    4.0    1.0   
1   2.0     2.000000  ...          219.0       3.5    4.0    0.5   
2   3.0     3.166667  ...          175.0       3.5    4.0    0.5   
3   2.5     3.281250  ...          111.0       3.5    4.0    0.5   
4   2.5     3.431818  ...          149.0       3.5    4.0    0.5   

   trust_mean14  trust_std14  trust_count14  median14  min14  max14  
0      3.312500     0.723104      

In [40]:
# Fill NaNs just in case
best_model = grid.best_estimator_

X_test = test_df.drop(columns=['user_id', 'item_id', 'id'], errors='ignore').fillna(-1)

# Predict residual
residual_preds = model.predict(X_test)



final_preds = residual_preds

# Prepare submission DataFrame
submission = test_df.copy()
submission['label'] = final_preds

# Save to CSV (preferred for submissions)
submission[['id','label']].to_csv("submission_with_trust_features_4.csv", index=False)

