# Create the Dataset

In [143]:
import pandas as pd

In [161]:
# Locations of the raw trust-network and rating CSV files.
trust_path = "train_data_movie_trust.csv"
rating_path = "train_data_movie_rate.csv"

# Read both files into DataFrames, trust edges first and ratings second.
trust_df, rating_df = (pd.read_csv(csv_path) for csv_path in (trust_path, rating_path))

In [162]:
# Baseline predictor: global mean rating plus additive per-user and per-item offsets.
global_mean = rating_df['label'].mean()

# mean(x - c) == mean(x) - c, so the built-in group mean gives the same offsets
# (up to floating-point rounding) without calling a Python lambda once per group.
user_bias = rating_df.groupby('user_id')['label'].mean() - global_mean
item_bias = rating_df.groupby('item_id')['label'].mean() - global_mean

# `train_features` is not created by any earlier cell, so this cell raised a
# NameError on a fresh kernel. Build it from the ratings table, which carries the
# `user_id`, `item_id` and `label` columns used below.
# NOTE(review): assumed to be a plain copy of `rating_df` — confirm this matches
# the frame that lived in the original kernel session.
train_features = rating_df.copy()

train_features['b_u'] = train_features['user_id'].map(user_bias)
train_features['b_i'] = train_features['item_id'].map(item_bias)
train_features['baseline_pred'] = global_mean + train_features['b_u'] + train_features['b_i']

# The downstream model is trained on what the baseline fails to explain.
train_features['residual'] = train_features['label'] - train_features['baseline_pred']

In [163]:
# Per-movie rating summary: one row per item_id, one column per statistic.
movie_stats = (
    rating_df
    .groupby('item_id')['label']
    .agg(**{
        'average_rating': 'mean',
        'variance_rating': 'var',
        'number_of_reviews': 'count',
        'min_rating': 'min',
        'max_rating': 'max',
        'median_rating': 'median',
    })
    .reset_index()
)

In [164]:
# Preview the first rows of the per-movie statistics table.
print(movie_stats.head())

   item_id  average_rating  variance_rating  number_of_reviews  min_rating  \
0        1        2.978339         0.732663                831         0.5   
1        2        3.190286         0.689436                875         0.5   
2        3        3.045519         0.674563                703         0.5   
3        4        3.192969         0.823033                640         0.5   
4        5        3.230030         0.643675                676         0.5   

   max_rating  median_rating  
0         4.0            3.0  
1         4.0            3.5  
2         4.0            3.0  
3         4.0            3.5  
4         4.0            3.5  


In [165]:
# Per-user rating summary: one row per user_id, one column per statistic.
rating_stats = (
    rating_df
    .groupby('user_id')['label']
    .agg(**{
        'average_rating': 'mean',
        'variance_rating': 'var',
        'number_of_ratings': 'count',
        'median_of_ratings': 'median',
    })
    .reset_index()
)

In [166]:
# Outgoing trust edges per user: how many users each trustor trusts.
trustees = (
    trust_df
    .groupby('user_id_trustor')['user_id_trustee']
    .count()
    .rename_axis('user_id')
    .reset_index(name='total_number_of_trustees')
)

# Incoming trust edges per user: how many users trust each trustee.
trusters = (
    trust_df
    .groupby('user_id_trustee')['user_id_trustor']
    .count()
    .rename_axis('user_id')
    .reset_index(name='total_number_of_trusters')
)

In [167]:
# Attach both trust-degree counts to the per-user rating statistics.
# Left joins keep every rated user, including those with no trust edges.
user_features = (
    rating_stats
    .merge(trustees, on='user_id', how='left')
    .merge(trusters, on='user_id', how='left')
)

# Users missing from the trust table have a count of zero; restore integer dtype
# after the joins introduced NaN.
trust_count_columns = ['total_number_of_trustees', 'total_number_of_trusters']
user_features[trust_count_columns] = user_features[trust_count_columns].fillna(0).astype(int)

In [168]:
# Preview the first rows of the combined per-user feature table.
print(user_features.head())

   user_id  average_rating  variance_rating  number_of_ratings  \
0        1        3.416667         0.446970                 12   
1        3        2.829016         1.335195                193   
2        4        2.333333         0.966667                  6   
3        5        3.000000              NaN                  1   
4        6        3.875000         0.062500                  4   

   median_of_ratings  total_number_of_trustees  total_number_of_trusters  
0               3.50                         0                         0  
1               3.00                         0                         0  
2               2.75                         0                         1  
3               3.00                         1                         0  
4               4.00                         1                         5  


In [169]:
import numpy as np

# Trust graph as an adjacency mapping: trustor id -> set of trustee ids.
user_to_trustees = (
    trust_df
    .groupby('user_id_trustor')['user_id_trustee']
    .apply(set)
    .to_dict()
)

# Known ratings keyed by the (user_id, item_id) pair.
rating_lookup = (
    rating_df
    .set_index(['user_id', 'item_id'])['label']
    .to_dict()
)

# Function to extract movie_profile stats
def get_movie_profile_features(user_id, item_id):
    """Summarise how the trustees of `user_id` rated `item_id`.

    Reads the module-level `user_to_trustees` and `rating_lookup` mappings.

    Returns a six-element list: mean, variance (NumPy's default, ddof=0),
    minimum, maximum, median and count of the trustees' ratings for the item.
    When no trustee has a rating for the item, all six elements are NaN.
    """
    trustee_ratings = []
    for trustee_id in user_to_trustees.get(user_id, set()):
        key = (trustee_id, item_id)
        if key in rating_lookup:
            trustee_ratings.append(rating_lookup[key])

    # Nothing to summarise: signal "missing" for every statistic, count included.
    if not trustee_ratings:
        return [np.nan] * 6

    summary_functions = (np.mean, np.var, np.min, np.max, np.median)
    summary = [summarise(trustee_ratings) for summarise in summary_functions]
    summary.append(len(trustee_ratings))
    return summary

# Prepare Final Dataset

In [170]:
# Reload the rating file via the shared `rating_path` so the location is defined once.
# The file carries id, user_id, item_id and label columns.
train = pd.read_csv(rating_path)

# Merge in user-level features
train_merged = train.merge(user_features, on='user_id', how='left')

# Attach the baseline residual, which is the regression target for the model.
train_merged = train_merged.merge(
    train_features[['residual', 'id', 'item_id']], on=['id', 'item_id'], how='left'
)

# Merge in movie-level features. `average_rating` and `variance_rating` exist in both
# the user and movie tables, so pandas suffixes them `_x` (user) and `_y` (movie).
train_merged = train_merged.merge(movie_stats, on='item_id', how='left')

# Trustee-based features for every (user, item) pair. Collecting plain lists and
# building one DataFrame avoids constructing a pandas Series per row, which is what
# made the row-wise `DataFrame.apply` slow.
trustee_feature_columns = [
    'avg_rating_trustees', 'var_rating_trustees', 'min_rating_trustees',
    'max_rating_trustees', 'median_rating_trustees', 'num_trustees_rated',
]
trustee_feature_rows = [
    get_movie_profile_features(user_id, item_id)
    for user_id, item_id in zip(train_merged['user_id'], train_merged['item_id'])
]
# dtype=float keeps every column float64, as the per-row Series did; -1 marks
# "no trustee rated this item".
train_merged[trustee_feature_columns] = pd.DataFrame(
    trustee_feature_rows,
    columns=trustee_feature_columns,
    index=train_merged.index,
    dtype=float,
).fillna(-1)

In [171]:
# Preview the first rows of the assembled training table.
print(train_merged.head())

   id  user_id  item_id  label  average_rating_x  variance_rating_x  \
0   1        1        1    2.0          3.416667            0.44697   
1   2        1        2    4.0          3.416667            0.44697   
2   3        1        3    3.5          3.416667            0.44697   
3   4        1        4    3.0          3.416667            0.44697   
4   5        1        5    4.0          3.416667            0.44697   

   number_of_ratings  median_of_ratings  total_number_of_trustees  \
0                 12                3.5                         0   
1                 12                3.5                         0   
2                 12                3.5                         0   
3                 12                3.5                         0   
4                 12                3.5                         0   

   total_number_of_trusters  ...  number_of_reviews  min_rating  max_rating  \
0                         0  ...                831         0.5         4.0   


# Train the Model

In [172]:
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score



In [173]:
# Separate features and target
X = train_merged.drop(columns=['id','user_id', 'item_id', 'label', 'residual'])  # keep only numeric features
y = train_merged['residual']

# Handle NaNs (basic strategy)
X = X.fillna(-1)




In [174]:
# Hyperparameters for the gradient-boosted regressor that models the residual.
xgb_params = {
    'n_estimators': 70,
    'max_depth': 4,
    'learning_rate': 0.1,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'random_state': 42,
    'n_jobs': -1,
}
model = xgb.XGBRegressor(**xgb_params)

# Fit on the full training table; no validation split is held out here.
model.fit(X, y, verbose=False)


# Generate test answers

In [175]:
# Test pairs to score; the file carries id, user_id and item_id columns.
test = pd.read_csv("test_data.csv")

# Merge in user-level features
test_merged = test.merge(user_features, on='user_id', how='left')

# Merge in movie-level features. `average_rating` and `variance_rating` exist in both
# the user and movie tables, so pandas suffixes them `_x` (user) and `_y` (movie).
test_merged = test_merged.merge(movie_stats, on='item_id', how='left')

# Trustee-based features for every (user, item) pair. Collecting plain lists and
# building one DataFrame avoids constructing a pandas Series per row, which is what
# made the row-wise `DataFrame.apply` slow.
trustee_feature_columns = [
    'avg_rating_trustees', 'var_rating_trustees', 'min_rating_trustees',
    'max_rating_trustees', 'median_rating_trustees', 'num_trustees_rated',
]
trustee_feature_rows = [
    get_movie_profile_features(user_id, item_id)
    for user_id, item_id in zip(test_merged['user_id'], test_merged['item_id'])
]
# Missing values stay NaN here, as before; dtype=float keeps every column float64,
# as the per-row Series did.
test_merged[trustee_feature_columns] = pd.DataFrame(
    trustee_feature_rows,
    columns=trustee_feature_columns,
    index=test_merged.index,
    dtype=float,
)

In [176]:
# Preview the first rows of the assembled test table.
print(test_merged.head())

   id  user_id  item_id  average_rating_x  variance_rating_x  \
0   1        6      211             3.875             0.0625   
1   2        6        7             3.875             0.0625   
2   3       16        2             3.600             0.1750   
3   4       16        4             3.600             0.1750   
4   5       16       11             3.600             0.1750   

   number_of_ratings  median_of_ratings  total_number_of_trustees  \
0                  4                4.0                         1   
1                  4                4.0                         1   
2                  5                3.5                         2   
3                  5                3.5                         2   
4                  5                3.5                         2   

   total_number_of_trusters  average_rating_y  ...  number_of_reviews  \
0                         5          3.194107  ...                577   
1                         5          3.164824  ...    

In [178]:
# 1) Compute baseline terms on test set
# `Series.map` yields NaN for users/items that never appear in the training ratings.
# Left as-is, that NaN propagates through the baseline into the final prediction and
# ends up as an empty value in the submission. A zero offset makes such rows fall back
# to the global mean plus whichever bias is known.
test_merged['b_u'] = test_merged['user_id'].map(user_bias).fillna(0.0)
test_merged['b_i'] = test_merged['item_id'].map(item_bias).fillna(0.0)
test_merged['baseline_pred'] = global_mean + test_merged['b_u'] + test_merged['b_i']

# 2) Prepare X_test exactly as in training (drop identifiers & baseline columns)
X_test = (
    test_merged
    .drop(columns=['user_id', 'item_id', 'id', 'b_u', 'b_i', 'baseline_pred'])
    .fillna(-1)
)
# Select the training columns in the training order, so a drifted test table fails
# here with a clear KeyError rather than being silently mis-scored.
X_test = X_test[X.columns]

# 3) Predict the residuals
residual_preds = model.predict(X_test)

# 4) Add the baseline back
final_preds = test_merged['baseline_pred'].values + residual_preds

# 5) Build your submission
submission = test_merged[['id', 'user_id', 'item_id']].copy()
submission['predicted_rating'] = final_preds

submission[['id', 'predicted_rating']].to_csv("xgboost_with_baseline.csv", index=False)
