In [1]:
import pandas as pd
import numpy as np
from sklearn.compose import make_column_transformer
from sklearn.ensemble import RandomForestRegressor
from xgboost import XGBRegressor
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from catboost import CatBoostRegressor, CatBoostClassifier
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import train_test_split

# Data loading

In [2]:
train_df = pd.read_csv('train_new.csv')
test_df = pd.read_csv('test_new.csv')
val_df = pd.read_csv('val_new.csv')

# Data preparation

In [3]:
# Split inputs and targets
train_inputs = train_df.drop(columns=['score'])
train_targets = train_df['score']
test_inputs = test_df.copy()
test_inputs_copy = test_df.copy(deep=True)
train_inputs_copy= train_inputs.copy(deep=True)

# Select features
dropped_columns = ['object_id','sensor_id','occlusion_level']
train_inputs.drop(columns=dropped_columns, inplace=True)
dropped_columns.append('score')
test_inputs.drop(columns=dropped_columns, inplace=True)

# Transform categorical features
categorical_features = ['object_type']
oe = OrdinalEncoder()
train_inputs[categorical_features] = oe.fit_transform(train_inputs[categorical_features])
test_inputs[categorical_features] = oe.transform(test_inputs[categorical_features])

# Missing value imputation
train_inputs.fillna(0, inplace=True)
test_inputs.fillna(0, inplace=True)

### Prediction using XGBoost

In [4]:
X_train_xg, X_test_xg, y_train_xg, y_test_xg = train_test_split(
        train_inputs, train_targets, test_size=0.2, random_state=42
    )
model = XGBRegressor(
        n_estimators=100,
        learning_rate=0.1,
        max_depth=5,
        random_state=42
    )

model.fit(X_train_xg, y_train_xg)

y_pred_xg = model.predict(X_test_xg)

r2 = r2_score(y_test_xg, y_pred_xg)
rmse = np.sqrt(mean_squared_error(y_test_xg, y_pred_xg))
mae = mean_absolute_error(y_test_xg, y_pred_xg)



print(f"R²: {r2:.4f}")
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")


R²: 0.9544
RMSE: 10.8574
MAE: 4.3775


# Cat Boost Algorithm Prediction

In [5]:
def preprocess_nan_values(df):
    """Replace NaN in categorical columns with empty string and maintain proper dtypes"""
    # Convert to object type first to allow empty string insertion
    df['occlusion_level'] = df['occlusion_level'].astype(object)
    df['object_type'] = df['object_type'].astype(object)
    df['sensor_id'] = df['object_type'].astype(object)
    
    # Fill NaN with empty string
    df[['occlusion_level', 'object_type','sensor_id']] = df[['occlusion_level', 'object_type','sensor_id']].fillna('')
    
    # Convert back to categorical type
    df['occlusion_level'] = df['occlusion_level'].astype('category')
    df['object_type'] = df['object_type'].astype('category')
    df['sensor_id'] = df['sensor_id'].astype('category')
    
    return df

In [None]:
occlusion_map = {
      'NOT_OCCLUDED': 0.29,
      'PARTIALLY_OCCLUDED': 0.64,
      'MOSTLY_OCCLUDED': 0.98,
      '': 0.2  # default for blank or missing
}
categorical_features = [ 
    'occlusion_level',
    'object_type',
    'sensor_id'
]

train_inputs_cat = preprocess_nan_values(train_inputs_copy.copy())
test_inputs_cat =preprocess_nan_values(test_inputs_copy.copy())


test_targets_cat = test_inputs_cat['score']
dropped_columns = ['object_id']
train_inputs_cat.drop(columns=dropped_columns, inplace=True)
dropped_columns.append('score')
test_inputs_cat.drop(columns=dropped_columns, inplace=True)
dropped_columns.pop()




# Model for CatBoostAlgorithm 
model = CatBoostRegressor(
    iterations=1000,
    learning_rate=0.1,
    depth=6,
    cat_features=categorical_features,
    verbose=True,
    random_seed=42,
    early_stopping_rounds=50,
    loss_function='RMSE'  
)

model.fit(
    train_inputs_cat,
    train_targets,
    eval_set=(test_inputs_cat,test_targets_cat), 
    use_best_model=True
)

predictions = model.predict(test_inputs_cat)

accuracy = accuracy_score(test_targets, np.rint(predictions))


['object_id']
0:	learn: 36.0123076	test: 27.8150930	best: 27.8150930 (0)	total: 65.8ms	remaining: 1m 5s
1:	learn: 33.4843370	test: 27.4464666	best: 27.4464666 (1)	total: 68.7ms	remaining: 34.3s
2:	learn: 31.1799202	test: 26.9451985	best: 26.9451985 (2)	total: 71.4ms	remaining: 23.7s
3:	learn: 29.0216668	test: 26.6656088	best: 26.6656088 (3)	total: 73.7ms	remaining: 18.4s
4:	learn: 27.3060858	test: 26.6791779	best: 26.6656088 (3)	total: 75ms	remaining: 14.9s
5:	learn: 25.4448472	test: 26.4352763	best: 26.4352763 (5)	total: 78.7ms	remaining: 13s
6:	learn: 23.7459544	test: 26.1151609	best: 26.1151609 (6)	total: 80.9ms	remaining: 11.5s
7:	learn: 22.1284135	test: 25.8613530	best: 25.8613530 (7)	total: 82.4ms	remaining: 10.2s
8:	learn: 20.7119115	test: 25.7949444	best: 25.7949444 (8)	total: 84ms	remaining: 9.25s
9:	learn: 19.4283118	test: 25.7121810	best: 25.7121810 (9)	total: 85.9ms	remaining: 8.51s
10:	learn: 18.3620953	test: 25.6922373	best: 25.6922373 (10)	total: 88.7ms	remaining: 7.97s


NameError: name 'accuracy_score' is not defined

In [None]:
print(train_inputs_cat.iloc[8])
test_inputs_cat.iloc[8]

# Prediction algorithm using Random Forest

In [None]:
# Split training and validation tests

X_train, X_test, y_train, y_test = train_test_split(train_inputs, train_targets, test_size = 0.2, random_state = 0)

# Create algorithm
rf = RandomForestRegressor()

# Train
rf.fit(X_train, y_train)

# Validation
val_predictions = rf.predict(X_test)
# print(test_inputs.head(10))
# Test
test_predictions = rf.predict(test_inputs)

# Compute error metric

In [None]:
rmse = mean_squared_error(y_test, val_predictions, squared=False)
print(f"Root Mean Squared Error = {rmse / 1e6:.3} Mbit/s")
r2 = r2_score(y_test, val_predictions)
print(f"R²: {r2:.4f}")

# Save results

In [None]:
# Add index to results
predictions_df = pd.DataFrame({'id': test_df.object_id, 'target': test_predictions})
predictions_df.to_csv("BenchmarkSubmission.csv", index = False)
predictions_df.head()