In [1]:
# import os
# import time
# import gc
# import glob
# import pickle

# import numpy as np
# import pandas as pd
# import lightgbm as lgb
# from sklearn.model_selection import RandomizedSearchCV

# def fetch_data(filepath):
#     gc.collect()
#     return pd.read_csv(filepath)

# def clean_missing_values(df, threshold=0.9,
#     exclude_cols=["visitor_hist_adr_usd","visitor_hist_starrating","srch_query_affinity_score"]
# ):
#     total = len(df)
#     to_drop = [
#         c for c in df.columns
#         if (df[c].isnull().mean() > threshold) and (c not in exclude_cols)
#     ]
#     df = df.drop(columns=to_drop)
#     gc.collect()
#     return df

# def extract_datetime_features(df, date_col="date_time"):
#     ts = pd.to_datetime(df[date_col])
#     df["month"]     = ts.dt.month
#     df["dayofweek"] = ts.dt.dayofweek
#     df["hour"]      = ts.dt.hour
#     df.drop(columns=[date_col], inplace=True)
#     return df

# def add_temporal_features(df):
#     df['month_sin']  = np.sin(2*np.pi*df['month']/12)
#     df['month_cos']  = np.cos(2*np.pi*df['month']/12)
#     df['dow_sin']    = np.sin(2*np.pi*df['dayofweek']/7)
#     df['dow_cos']    = np.cos(2*np.pi*df['dayofweek']/7)
#     df['is_weekend'] = df['dayofweek'].isin([5,6]).astype('int8')
#     return df

# def freq_encode(df, col):
#     freq = df[col].value_counts(normalize=True)
#     df[f"{col}_freq"] = df[col].map(freq).astype('float32')
#     return df

# def scale_features(df, group_by, column, log_transform=False):
#     eps = 1e-4
#     if log_transform:
#         df[column] = np.log10(df[column] + eps)
#     agg_m = ["mean","std","median"]
#     stats = df.groupby(group_by)[column].agg(agg_m).reset_index()
#     rename_map = {m:f"{column}_{m}_by_{group_by}" for m in agg_m}
#     stats.rename(columns=rename_map, inplace=True)
#     merged = df.merge(stats, on=group_by, how="left")
#     mcol = f"{column}_mean_by_{group_by}"
#     scol = f"{column}_std_by_{group_by}"
#     merged[f"{column}_norm_{group_by}"] = (
#         (merged[column] - merged.get(mcol, 0))
#         / (merged.get(scol, 1) + eps)
#     )
#     merged.drop(columns=list(rename_map.values()), inplace=True)
#     gc.collect()
#     return merged

# def feature_aggregation(df, group_by, target_col,
#                         agg_methods=["mean","median","std","var"],
#                         transformations={"mean":["subtract"]}):
#     stats = df.groupby(group_by)[target_col].agg(agg_methods).reset_index()
#     for m in agg_methods:
#         stats.rename(columns={m:f"{target_col}_{m}_by_{group_by}"}, inplace=True)
#     merged = df.merge(stats, on=group_by, how="left")
#     for m, ops in transformations.items():
#         coln = f"{target_col}_{m}_by_{group_by}"
#         for op in ops:
#             if op=="subtract":
#                 merged[f"{target_col}_diff_{m}"] = merged[target_col] - merged[coln]
#             elif op=="ratio":
#                 merged[f"{target_col}_ratio_{m}"] = merged[target_col] / (merged[coln] + 1e-4)
#     return merged

# def data_processing(df, mode='train'):
#     gc.collect()
#     if mode=="train":
#         if {"click_bool","booking_bool"}.issubset(df.columns):
#             df["target"] = np.select(
#                 [df["click_bool"]==1, df["booking_bool"]==1],
#                 [1,2], default=0
#             )
#             target = df["target"].values
#         else:
#             raise KeyError("Missing click_bool/booking_bool in training data")
#     else:
#         target = None

#     df = extract_datetime_features(df)
#     df = add_temporal_features(df)
#     df = clean_missing_values(df, 0.9)

#     for c in ['prop_country_id','site_id','visitor_location_country_id','srch_destination_id']:
#         if c in df.columns:
#             df = freq_encode(df, c)

#     for col, grp, lg in [
#         ('price_usd','srch_id',True),
#         ('price_usd','prop_id',False),
#         ('prop_starrating','srch_id',False),
#     ]:
#         if col in df.columns and grp in df.columns:
#             df = scale_features(df, grp, col, lg)

#     for grp, col in [
#         ('prop_id','price_usd'),
#         ('srch_id','prop_starrating'),
#         ('srch_id','prop_location_score1'),
#         ('srch_id','prop_location_score2'),
#         ('srch_id','prop_review_score'),
#         ('srch_id','promotion_flag'),
#         ('srch_destination_id','price_usd'),
#     ]:
#         if col in df.columns and grp in df.columns:
#             df = feature_aggregation(df, grp, col)

#     for col, grp, lg in [
#         ('prop_starrating','srch_id',False),
#         ('prop_location_score1','srch_id',False),
#         ('prop_location_score2','srch_id',False),
#         ('prop_review_score','srch_id',False),
#     ]:
#         if col in df.columns and grp in df.columns:
#             df = scale_features(df, grp, col, lg)

#     drop_list = [
#         'prop_country_id','site_id','visitor_location_country_id',
#         'click_bool','booking_bool','gross_bookings_usd'
#     ]
#     df.drop(columns=[c for c in drop_list if c in df.columns], inplace=True)
#     return df, target

# def drop_unnecessary_columns(df,
#     exclude_cols=["srch_id","prop_id","position","random_bool"]
# ):
#     to_drop = [c for c in exclude_cols if c in df.columns]
#     return df.drop(columns=to_drop)

# def integrate_estimated_position(df, position_stats):
#     return df.merge(position_stats, how="left",
#                     on=["srch_destination_id","prop_id"])

# def split_training_data(data, target, vs_start=0, vs_end=0):
#     train_x = pd.concat([data[:vs_start], data[vs_end:]])
#     train_y = np.concatenate([target[:vs_start], target[vs_end:]])
#     val_x   = data[vs_start:vs_end]
#     val_y   = target[vs_start:vs_end]

#     filtered = train_x[train_x["random_bool"]==0]
#     pos_stats = (
#         filtered.groupby(["srch_destination_id","prop_id"])["position"]
#         .mean().reset_index()
#     )
#     pos_stats.rename(columns={"position":"avg_position"}, inplace=True)
#     pos_stats["avg_position"] = 1/pos_stats["avg_position"]

#     train_x = integrate_estimated_position(train_x, pos_stats)
#     val_x   = integrate_estimated_position(val_x, pos_stats)

#     train_groups = train_x["srch_id"].value_counts(sort=False).sort_index()
#     val_groups   = val_x["srch_id"].value_counts(sort=False).sort_index()

#     return train_x, val_x, train_y, val_y, train_groups, val_groups, pos_stats

# def identify_categorical_features(df):
#     auto = df.select_dtypes(include=['category','object']).columns.tolist()
#     defaults = ["month","dayofweek","hour","is_weekend"]
#     cats = [c for c in auto+defaults if c in df.columns]
#     return [df.columns.get_loc(c) for c in cats]

# def train_recommender(data, target, vs_start, vs_end,
#                       learning_rate=0.12, boost_type="dart", optimize=False):
#     x_train, x_val, y_train, y_val, train_groups, val_groups, _ = \
#         split_training_data(data, target, vs_start, vs_end)

#     if optimize:
#         tune_idx  = x_train.sample(n=200_000, random_state=42).index
#         X_tune    = x_train.loc[tune_idx]
#         y_tune    = y_train[tune_idx]
#         grp_sizes = X_tune.groupby("srch_id").size().tolist()
#         X_feat    = drop_unnecessary_columns(X_tune)

#         tuner = RandomizedSearchCV(
#             estimator=lgb.LGBMRanker(
#                 objective="lambdarank", metric="ndcg",
#                 boosting_type=boost_type, n_estimators=200, random_state=69
#             ),
#             param_distributions=param_dist,
#             n_iter=10, cv=3,
#             scoring='neg_mean_squared_error',
#             random_state=42, n_jobs=1
#         )
#         tuner.fit(X_feat, y_tune, group=grp_sizes)
#         best = tuner.best_params_
#         learning_rate      = best.pop('learning_rate')
#         num_leaves         = best.pop('num_leaves')
#         feature_fraction   = best.pop('feature_fraction')
#         bagging_fraction   = best.pop('bagging_fraction')
#         bagging_freq       = best.pop('bagging_freq')
#         min_data_in_leaf   = best.pop('min_data_in_leaf')
#     else:
#         num_leaves = 31
#         feature_fraction   = 1.0
#         bagging_fraction   = 1.0
#         bagging_freq       = 0
#         min_data_in_leaf   = 20

#     x_train = drop_unnecessary_columns(x_train)
#     x_val   = drop_unnecessary_columns(x_val)

#     model = lgb.LGBMRanker(
#         objective="lambdarank", metric="ndcg",
#         learning_rate=learning_rate,
#         num_leaves=num_leaves,
#         feature_fraction=feature_fraction,
#         bagging_fraction=bagging_fraction,
#         bagging_freq=bagging_freq,
#         min_data_in_leaf=min_data_in_leaf,
#         label_gain=[0,1,2],
#         boosting_type=boost_type,
#         random_state=69, n_estimators=1000, eval_at=[5]
#     )
#     model.fit(
#         x_train, y_train,
#         group=train_groups,
#         eval_set=[(x_val,y_val)],
#         eval_group=[val_groups],
#         eval_metric="ndcg",
#         categorical_feature=identify_categorical_features(x_train),
#         callbacks=[lgb.early_stopping(50), lgb.log_evaluation(10)]
#     )

#     fn = f"model_v{int(time.time())}.dat"
#     with open(fn, "wb") as f:
#         pickle.dump(model, f)
#     print(f"Saved {fn}")
#     return model

# def align_features(train_df, test_df):
#     for c in set(train_df.columns) - set(test_df.columns):
#         test_df[c] = 0
#     return test_df[train_df.columns]

# def generate_predictions(test_df, position_stats, model_version="latest"):
#     if model_version=="latest":
#         files = glob.glob("model_v*.dat")
#         fname = max(files, key=os.path.getctime)
#     else:
#         fname = f"model_{model_version}.dat"
#     model = pickle.load(open(fname,"rb"))

#     df = test_df.copy()
#     df = integrate_estimated_position(df, position_stats)
#     pairs = df[["srch_id","prop_id"]]

#     feat_names = model.booster_.feature_name()
#     sample = pd.DataFrame(columns=feat_names)
#     df_aligned = align_features(sample, df)

#     preds = model.predict(df_aligned)
#     pairs["prediction"] = preds
#     pairs.sort_values(["srch_id","prediction"],
#                       ascending=[True,False],
#                       inplace=True)
#     pairs[["srch_id","prop_id"]].to_csv("submission.csv", index=False)
#     print("Wrote submission.csv")

# if __name__=="__main__":
#     config = {
#         "train_file": "training_set_VU_DM.csv",
#         "test_file":  "test_set_VU_DM.csv",
#         "boosting_method": "dart",
#         "learning_rate":   0.12,
#         "validation_size": 150000,
#         "start_index":     1
#     }

#     print("Loading train…")
#     train_raw = fetch_data(config["train_file"])
#     train_df, target = data_processing(train_raw, mode="train")

#     si = config["start_index"]
#     vs = config["validation_size"]
#     vs_start = si * vs
#     vs_end   = (si+1) * vs

#     print("Training…")
#     model = train_recommender(
#         train_df, target,
#         vs_start, vs_end,
#         learning_rate=config["learning_rate"],
#         boost_type=config["boosting_method"],
#         optimize=True
#     )

#     print("Loading test…")
#     test_raw = fetch_data(config["test_file"])
#     test_df, _ = data_processing(test_raw, mode="test")

#     # regenerate position_stats for test merge
#     _, _, _, _, _, _, position_stats = split_training_data(
#         train_df, target, vs_start, vs_end
#     )
#     print("Predicting…")
#     generate_predictions(test_df, position_stats, model_version="latest")


In [2]:
import os
import time
import gc
import glob
import pickle

import numpy as np
import pandas as pd
import lightgbm as lgb

def fetch_data(path):
    gc.collect()
    return pd.read_csv(path)

def clean_missing_values(df, threshold=0.9,
                         exclude_cols=None):
    exclude_cols = exclude_cols or []
    to_drop = [
        c for c in df.columns
        if df[c].isnull().mean() > threshold
           and c not in exclude_cols
    ]
    return df.drop(columns=to_drop)

def extract_datetime_features(df, date_col="date_time"):
    ts = pd.to_datetime(df[date_col])
    df["month"]     = ts.dt.month
    df["dayofweek"] = ts.dt.dayofweek
    df["hour"]      = ts.dt.hour
    df.drop(columns=[date_col], inplace=True)
    return df

def add_temporal_features(df):
    df['month_sin']  = np.sin(2*np.pi * df['month']    / 12)
    df['month_cos']  = np.cos(2*np.pi * df['month']    / 12)
    df['dow_sin']    = np.sin(2*np.pi * df['dayofweek']/  7)
    df['dow_cos']    = np.cos(2*np.pi * df['dayofweek']/  7)
    df['is_weekend'] = df['dayofweek'].isin([5,6]).astype('int8')
    return df

def freq_encode(df, col):
    freq = df[col].value_counts(normalize=True)
    df[f"{col}_freq"] = df[col].map(freq).astype('float32')
    return df

def scale_features(df, group_by, column, log_transform=False):
    """
    Normalize `column` within each `group_by` block:
      x' = (x - μ_group)/σ_group  via transform → no merge needed
    """
    eps = 1e-4
    df = df.copy()
    if log_transform:
        df[column] = np.log10(df[column] + eps)
    grp = df.groupby(group_by)[column]
    mean_map = grp.transform("mean")
    std_map  = grp.transform("std")
    df[f"{column}_norm_{group_by}"] = (df[column] - mean_map) / (std_map + eps)
    return df

def feature_aggregation(df, group_by, target_col,
                        agg_methods=["mean","median","std","var"],
                        transformations={"mean":["subtract"]}):
    stats = df.groupby(group_by)[target_col].agg(agg_methods).reset_index()
    for m in agg_methods:
        stats.rename(
            columns={m:f"{target_col}_{m}_by_{group_by}"}, inplace=True
        )
    df = df.merge(stats, on=group_by, how="left")
    for m, ops in transformations.items():
        base = f"{target_col}_{m}_by_{group_by}"
        for op in ops:
            if op=="subtract":
                df[f"{target_col}_diff_{m}"] = df[target_col] - df[base]
            elif op=="ratio":
                df[f"{target_col}_ratio_{m}"] = df[target_col] / (df[base] + 1e-4)
    return df

def data_processing(df, mode='train'):
    if mode == "train":
        if not {"click_bool","booking_bool"}.issubset(df.columns):
            raise KeyError("Training data must contain click_bool & booking_bool")
        df["target"] = np.select(
            [df["click_bool"]==1, df["booking_bool"]==1],
            [1,2], default=0
        )
        target = df["target"].values
    else:
        target = None

    df = extract_datetime_features(df)
    df = add_temporal_features(df)
    df = clean_missing_values(
        df, threshold=0.9,
        exclude_cols=[
            "visitor_hist_adr_usd",
            "visitor_hist_starrating",
            "srch_query_affinity_score"
        ]
    )

    # Frequency encodings
    for c in [
        'prop_country_id','site_id',
        'visitor_location_country_id','srch_destination_id'
    ]:
        if c in df.columns:
            df = freq_encode(df, c)

    # Initial scaling
    for col, grp, lg in [
        ('price_usd','srch_id', True),
        ('price_usd','prop_id', False),
        ('prop_starrating','srch_id', False),
    ]:
        if col in df.columns and grp in df.columns:
            df = scale_features(df, grp, col, log_transform=lg)

    # Aggregated features
    for grp, col in [
        ('prop_id','price_usd'),
        ('srch_id','prop_starrating'),
        ('srch_id','prop_location_score1'),
        ('srch_id','prop_location_score2'),
        ('srch_id','prop_review_score'),
        ('srch_id','promotion_flag'),
        ('srch_destination_id','price_usd'),
    ]:
        if grp in df.columns and col in df.columns:
            df = feature_aggregation(df, grp, col)

    # Final pass of scaling on reviews/location
    for col, grp, lg in [
        ('prop_starrating','srch_id', False),
        ('prop_location_score1','srch_id', False),
        ('prop_location_score2','srch_id', False),
        ('prop_review_score','srch_id', False),
    ]:
        if col in df.columns and grp in df.columns:
            df = scale_features(df, grp, col, log_transform=lg)

    # Drop unused
    drop_list = [
        'prop_country_id','site_id','visitor_location_country_id',
        'click_bool','booking_bool','gross_bookings_usd'
    ]
    df.drop(columns=[c for c in drop_list if c in df.columns],
            inplace=True, errors='ignore')

    return df, target

def drop_unnecessary_columns(df):
    cols = ["srch_id","prop_id","position","random_bool"]
    return df.drop(columns=[c for c in cols if c in df], errors='ignore')

def integrate_estimated_position(df, position_stats):
    # ensure merge keys are ints
    for c in ["srch_destination_id","prop_id"]:
        df[c] = df[c].astype(int)
        position_stats[c] = position_stats[c].astype(int)
    return df.merge(
        position_stats,
        on=["srch_destination_id","prop_id"],
        how="left"
    )

def split_training_data(data, target, vs_start=0, vs_end=0):
    train_x = pd.concat([data[:vs_start], data[vs_end:]])
    train_y = np.concatenate([target[:vs_start], target[vs_end:]])
    val_x   = data[vs_start:vs_end]
    val_y   = target[vs_start:vs_end]

    filtered = train_x[train_x["random_bool"]==0]
    pos_stats = (
        filtered
        .groupby(["srch_destination_id","prop_id"])["position"]
        .mean().reset_index()
        .rename(columns={"position":"estimated_position"})
    )
    pos_stats["estimated_position"] = 1 / pos_stats["estimated_position"]

    train_groups = (
        train_x["srch_id"]
               .value_counts(sort=False)
               .sort_index()
               .tolist()
    )
    val_groups = (
        val_x["srch_id"]
             .value_counts(sort=False)
             .sort_index()
             .tolist()
    )

    train_x = integrate_estimated_position(train_x, pos_stats)
    val_x   = integrate_estimated_position(val_x, pos_stats)

    return train_x, val_x, train_y, val_y, train_groups, val_groups, pos_stats

def identify_categorical_features(df):
    cats = df.select_dtypes(include=['category','object']).columns.tolist()
    for c in ["month","dayofweek","hour","is_weekend"]:
        if c in df.columns and c not in cats:
            cats.append(c)
    return [df.columns.get_loc(c) for c in cats]

def train_recommender(data, target, vs_start, vs_end,
                      learning_rate=0.12, boost_type="dart"):
    x_train, x_val, y_train, y_val, train_groups, val_groups, _ = \
        split_training_data(data, target, vs_start, vs_end)

    X_train = drop_unnecessary_columns(x_train)
    X_val   = drop_unnecessary_columns(x_val)

    model = lgb.LGBMRanker(
        objective="lambdarank", metric="ndcg",
        learning_rate=learning_rate,
        num_leaves=31,
        feature_fraction=1.0,
        bagging_fraction=1.0,
        bagging_freq=0,
        min_data_in_leaf=20,
        label_gain=[0,1,2],
        boosting_type=boost_type,
        random_state=69,
        n_estimators=1000,
        eval_at=[5]
    )
    model.fit(
        X_train, y_train,
        group=train_groups,
        eval_set=[(X_val, y_val)],
        eval_group=[val_groups],
        eval_metric="ndcg",
        categorical_feature=identify_categorical_features(X_train),
        callbacks=[lgb.early_stopping(50), lgb.log_evaluation(10)]
    )

    fn = f"model_v{int(time.time())}.dat"
    with open(fn, "wb") as f:
        pickle.dump(model, f)
    print(f"Saved {fn}")
    return model

def align_features(train_df, test_df):
    for c in set(train_df.columns) - set(test_df.columns):
        test_df[c] = 0
    return test_df[train_df.columns]

def generate_predictions(test_df, position_stats, model_version="latest"):
    if model_version=="latest":
        files = glob.glob("model_v*.dat")
        fname = max(files, key=os.path.getctime)
    else:
        fname = f"model_{model_version}.dat"

    model = pickle.load(open(fname, "rb"))

    for c in ["srch_destination_id","prop_id"]:
        test_df[c] = test_df[c].astype(int)

    df = test_df.copy()
    df = integrate_estimated_position(df, position_stats)
    pairs = df[["srch_id","prop_id"]]

    feat_names = model.booster_.feature_name()
    sample     = pd.DataFrame(columns=feat_names)
    df_aligned = align_features(sample, df)

    preds = model.predict(df_aligned)
    pairs["prediction"] = preds
    pairs.sort_values(
        ["srch_id","prediction"],
        ascending=[True, False],
        inplace=True
    )
    pairs[["srch_id","prop_id"]].to_csv("submission.csv", index=False)
    print("Wrote submission.csv")

if __name__=="__main__":
    config = {
        "train_file":      "training_set_VU_DM.csv",
        "test_file":       "test_set_VU_DM.csv",
        "boosting_method": "dart",
        "learning_rate":   0.12,
        "validation_size": 150_000,
        "start_index":     1
    }

    print("Loading train…")
    train_raw = fetch_data(config["train_file"])
    train_df, target = data_processing(train_raw, mode="train")

    si = config["start_index"]
    vs = config["validation_size"]
    vs_start = si * vs
    vs_end   = (si + 1) * vs

    print("Training…")
    model = train_recommender(
        train_df, target,
        vs_start, vs_end,
        learning_rate=config["learning_rate"],
        boost_type=config["boosting_method"]
    )

    print("Loading test…")
    test_raw = fetch_data(config["test_file"])
    test_df, _ = data_processing(test_raw, mode="test")

    # rebuild position_stats for merging
    _, _, _, _, _, _, position_stats = split_training_data(
        train_df, target, vs_start, vs_end
    )

    print("Predicting…")
    generate_predictions(test_df, position_stats, model_version="latest")


Loading train…
Training…


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[c] = df[c].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[c] = df[c].astype(int)


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.616190 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 13386
[LightGBM] [Info] Number of data points in the train set: 4808347, number of used features: 84




[10]	valid_0's ndcg@5: 1
[20]	valid_0's ndcg@5: 1
[30]	valid_0's ndcg@5: 1
[40]	valid_0's ndcg@5: 1
[50]	valid_0's ndcg@5: 1
[60]	valid_0's ndcg@5: 1
[70]	valid_0's ndcg@5: 1
[80]	valid_0's ndcg@5: 1
[90]	valid_0's ndcg@5: 1
[100]	valid_0's ndcg@5: 1
[110]	valid_0's ndcg@5: 1
[120]	valid_0's ndcg@5: 1
[130]	valid_0's ndcg@5: 1
[140]	valid_0's ndcg@5: 1
[150]	valid_0's ndcg@5: 1
[160]	valid_0's ndcg@5: 1
[170]	valid_0's ndcg@5: 1
[180]	valid_0's ndcg@5: 1
[190]	valid_0's ndcg@5: 1
[200]	valid_0's ndcg@5: 1
[210]	valid_0's ndcg@5: 1
[220]	valid_0's ndcg@5: 1
[230]	valid_0's ndcg@5: 1
[240]	valid_0's ndcg@5: 1
[250]	valid_0's ndcg@5: 1
[260]	valid_0's ndcg@5: 1
[270]	valid_0's ndcg@5: 1
[280]	valid_0's ndcg@5: 1
[290]	valid_0's ndcg@5: 1
[300]	valid_0's ndcg@5: 1
[310]	valid_0's ndcg@5: 1
[320]	valid_0's ndcg@5: 1
[330]	valid_0's ndcg@5: 1
[340]	valid_0's ndcg@5: 1
[350]	valid_0's ndcg@5: 1
[360]	valid_0's ndcg@5: 1
[370]	valid_0's ndcg@5: 1
[380]	valid_0's ndcg@5: 1
[390]	valid_0's ndcg@

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[c] = df[c].astype(int)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[c] = df[c].astype(int)


Predicting…






A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pairs["prediction"] = preds
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  pairs.sort_values(


Wrote submission.csv
