Imports

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score, classification_report
from xgboost import XGBClassifier
import warnings
warnings.filterwarnings('ignore')

LOADING

In [13]:
# Load training data
customers = pd.read_csv('Train/train_customers.csv')
locations = pd.read_csv('Train/train_locations.csv')
orders = pd.read_csv('Train/orders.csv', low_memory=False)
vendors = pd.read_csv('Train/vendors.csv')

# Load test data
test_customers = pd.read_csv('Test/test_customers.csv')
test_locations = pd.read_csv('Test/test_locations.csv')

print(f"Training data shapes:")
print(f"- Customers: {customers.shape}")
print(f"- Locations: {locations.shape}")
print(f"- Orders: {orders.shape}")
print(f"- Vendors: {vendors.shape}")

Training data shapes:
- Customers: (34674, 8)
- Locations: (59503, 5)
- Orders: (135303, 26)
- Vendors: (100, 59)


PREPROCESSING

In [14]:
def clean_dataframe(df, threshold=0.5):
    # Drop columns with >50% missing values
    cols_to_drop = df.columns[df.isnull().mean() > threshold]
    if len(cols_to_drop) > 0:
        print(f"Dropping columns with >{threshold*100}% missing: {list(cols_to_drop)}")
        df = df.drop(columns=cols_to_drop)
    
    # Fill numeric columns with median
    numeric_cols = df.select_dtypes(include=[np.number]).columns
    for col in numeric_cols:
        if df[col].isnull().sum() > 0:
            df[col] = df[col].fillna(df[col].median())
    
    # Fill categorical columns with mode or 'unknown'
    categorical_cols = df.select_dtypes(include=['object']).columns
    for col in categorical_cols:
        if df[col].isnull().sum() > 0:
            if len(df[col].mode()) > 0:
                df[col] = df[col].fillna(df[col].mode()[0])
            else:
                df[col] = df[col].fillna('unknown')
    
    return df

# Clean all dataframes
customers = clean_dataframe(customers)
locations = clean_dataframe(locations)
orders = clean_dataframe(orders)
vendors = clean_dataframe(vendors)
test_customers = clean_dataframe(test_customers)
test_locations = clean_dataframe(test_locations)

# Process customer age if DOB exists
if 'dob' in customers.columns:
    customers['dob'] = pd.to_datetime(customers['dob'], errors='coerce')
    customers['age'] = 2025 - customers['dob'].dt.year
    # Drop age if too many missing values
    if customers['age'].isnull().mean() > 0.8:
        customers = customers.drop(columns=['age', 'dob'])


Dropping columns with >50.0% missing: ['dob']
Dropping columns with >50.0% missing: ['promo_code', 'promo_code_discount_percentage', 'vendor_rating', 'delivery_time', 'driver_accepted_time', 'delivery_date']
Dropping columns with >50.0% missing: ['sunday_from_time2', 'sunday_to_time2', 'monday_from_time2', 'monday_to_time2', 'tuesday_from_time2', 'tuesday_to_time2', 'wednesday_from_time2', 'wednesday_to_time2', 'thursday_from_time2', 'thursday_to_time2', 'friday_from_time2', 'friday_to_time2', 'saturday_from_time2', 'saturday_to_time2']
Dropping columns with >50.0% missing: ['dob']


TRAINING SET

In [15]:

# Get unique customer locations
customer_locations = locations[['customer_id', 'location_number', 'location_type', 'latitude', 'longitude']].drop_duplicates()

# Get unique vendors
unique_vendors = orders['vendor_id'].unique()
vendor_df = pd.DataFrame({'vendor_id': unique_vendors})

# Create cartesian product (all possible combinations)
customer_locations['key'] = 1
vendor_df['key'] = 1
all_combinations = customer_locations.merge(vendor_df, on='key').drop('key', axis=1)


# positive classes
actual_orders = orders[['customer_id', 'LOCATION_NUMBER', 'vendor_id']].copy()
actual_orders.columns = ['customer_id', 'location_number', 'vendor_id']
actual_orders['target'] = 1
actual_orders = actual_orders.drop_duplicates()

print(f"Actual orders (positive samples): {len(actual_orders):,}")

# Merge dataset
training_data = all_combinations.merge(
    actual_orders[['customer_id', 'location_number', 'vendor_id', 'target']], 
    on=['customer_id', 'location_number', 'vendor_id'], 
    how='left'
)
training_data['target'] = training_data['target'].fillna(0).astype(int)

print(f"Positive rate: {training_data['target'].mean():.4f}")


Actual orders (positive samples): 80,142
Positive rate: 0.0135


Negative Sampling

In [16]:
# Separate positive and negative samples
positive_samples = training_data[training_data['target'] == 1]
negative_samples = training_data[training_data['target'] == 0]

# Sample negatives: use 2x positive samples for better balance
n_negative_samples = min(len(positive_samples) * 2, len(negative_samples))
negative_samples_sampled = negative_samples.sample(n=n_negative_samples, random_state=42)

# Combine and shuffle
balanced_training = pd.concat([positive_samples, negative_samples_sampled])
balanced_training = balanced_training.sample(frac=1, random_state=42).reset_index(drop=True)

print(f"Final training data: {len(balanced_training):,}")
print(f"Class distr:\n{balanced_training['target'].value_counts()}")

Final training data: 240,426
Class distr:
target
0    160284
1     80142
Name: count, dtype: int64


FEATURES

In [17]:
# Add customer features
balanced_training = balanced_training.merge(
    customers[['customer_id', 'gender', 'status', 'verified', 'language']], 
    on='customer_id', how='left'
)

# Add vendor features
vendor_features = ['id', 'latitude', 'longitude', 'vendor_category_en', 'delivery_charge','serving_distance', 'vendor_rating', 'discount_percentage']
available_vendor_features = [col for col in vendor_features if col in vendors.columns]
balanced_training = balanced_training.merge(
    vendors[available_vendor_features], 
    left_on='vendor_id', right_on='id', how='left', suffixes=('_customer', '_vendor')
)





# Calculate distance between customer and vendor (if coordinates available)
if 'latitude_customer' in balanced_training.columns and 'latitude_vendor' in balanced_training.columns:
    def haversine_distance(lat1, lon1, lat2, lon2):
        """Calculate distance between two points on Earth"""
        R = 6371  # Earth radius in km
        lat1, lon1, lat2, lon2 = map(np.radians, [lat1, lon1, lat2, lon2])
        dlat = lat2 - lat1
        dlon = lon2 - lon1
        a = np.sin(dlat/2)**2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon/2)**2
        c = 2 * np.arcsin(np.sqrt(a))
        return R * c
    
    balanced_training['customer_vendor_distance'] = haversine_distance(
        balanced_training['latitude_customer'].fillna(0),
        balanced_training['longitude_customer'].fillna(0),
        balanced_training['latitude_vendor'].fillna(0),
        balanced_training['longitude_vendor'].fillna(0)
    )

# Handle missing values in new features
balanced_training = clean_dataframe(balanced_training, threshold=0.7)


Modeling

In [18]:
# Select relevant features for modeling
feature_columns = [
    'location_number', 'latitude_customer', 'longitude_customer',
    'vendor_id', 'delivery_charge', 'serving_distance', 'discount_percentage'
]

# Add categorical features if they exist
categorical_features = ['gender', 'location_type', 'vendor_category_en']
for feat in categorical_features:
    if feat in balanced_training.columns:
        feature_columns.append(feat)

# Add distance if calculated
if 'customer_vendor_distance' in balanced_training.columns:
    feature_columns.append('customer_vendor_distance')

# Keep only available features
feature_columns = [col for col in feature_columns if col in balanced_training.columns]

print(f"Selected features: {feature_columns}")

# Prepare feature matrix
X = balanced_training[feature_columns].copy()
y = balanced_training['target']

# Handle categorical variables
label_encoders = {}
categorical_cols = X.select_dtypes(include=['object']).columns

for col in categorical_cols:
    le = LabelEncoder()
    X[col] = le.fit_transform(X[col].astype(str))
    label_encoders[col] = le

# Fill any remaining missing values
X = X.fillna(X.median())


Selected features: ['location_number', 'latitude_customer', 'longitude_customer', 'vendor_id', 'delivery_charge', 'serving_distance', 'discount_percentage', 'gender', 'location_type', 'vendor_category_en', 'customer_vendor_distance']


TRAINING MODEL

In [19]:
# Split data
X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

# Train XGBoost model
model = XGBClassifier(
    n_estimators=100,
    max_depth=6,
    learning_rate=0.1,
    subsample=0.8,
    colsample_bytree=0.8,
    random_state=42,
    eval_metric='logloss',
    use_label_encoder=False
)

model.fit(X_train, y_train)

# Evaluate model
y_pred = model.predict(X_val)
accuracy = accuracy_score(y_val, y_pred)
print(f"\nValidation Accuracy: {accuracy:.4f}")
print("\nClassification Report:")
print(classification_report(y_val, y_pred))




Validation Accuracy: 0.8051

Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.89      0.86     32149
           1       0.75      0.63      0.68     16056

    accuracy                           0.81     48205
   macro avg       0.79      0.76      0.77     48205
weighted avg       0.80      0.81      0.80     48205



TEST

In [20]:
# Create test combinations
test_locations_clean = test_locations[['customer_id', 'location_number', 'location_type', 'latitude', 'longitude']].drop_duplicates()

test_locations_clean['key'] = 1
test_combinations = test_locations_clean.merge(vendor_df, on='key').drop('key', axis=1)

# Add customer features
test_combinations = test_combinations.merge(
    test_customers[['customer_id', 'gender', 'status', 'verified', 'language']], 
    on='customer_id', how='left'
)

# Add vendor features
test_combinations = test_combinations.merge(
    vendors[available_vendor_features], 
    left_on='vendor_id', right_on='id', how='left', suffixes=('_customer', '_vendor')
)

# Calculate distance if possible
if 'latitude_customer' in test_combinations.columns and 'latitude_vendor' in test_combinations.columns:
    test_combinations['customer_vendor_distance'] = haversine_distance(
        test_combinations['latitude_customer'].fillna(0),
        test_combinations['longitude_customer'].fillna(0),
        test_combinations['latitude_vendor'].fillna(0),
        test_combinations['longitude_vendor'].fillna(0)
    )

# Clean test data
test_combinations = clean_dataframe(test_combinations, threshold=0.7)

# Prepare test features
X_test = test_combinations[feature_columns].copy()

# Apply same encodings
for col in categorical_cols:
    if col in X_test.columns and col in label_encoders:
        # Handle unseen categories
        X_test[col] = X_test[col].astype(str)
        mask = X_test[col].isin(label_encoders[col].classes_)
        X_test.loc[~mask, col] = label_encoders[col].classes_[0]  # Use first class for unseen
        X_test[col] = label_encoders[col].transform(X_test[col])

# Fill missing values
X_test = X_test.fillna(X_test.median())

# Make predictions
test_predictions = model.predict(X_test)
test_probabilities = model.predict_proba(X_test)[:, 1]

SUBMISSIONS

In [21]:
# Add predictions to test combinations
test_combinations['target'] = test_predictions
test_combinations['probability'] = test_probabilities

# Create submission format
test_combinations['CID X LOC_NUM X VENDOR'] = (
    test_combinations['customer_id'] + ' X ' + 
    test_combinations['location_number'].astype(str) + ' X ' + 
    test_combinations['vendor_id'].astype(str)
)

# Full submission
submission_full = test_combinations[['CID X LOC_NUM X VENDOR', 'target']]
submission_full.to_csv('submission.csv', index=False)

print("\nModel training and prediction complete!")


Model training and prediction complete!
