In [None]:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import mstats
from sklearn.impute import SimpleImputer
from sklearn.metrics import balanced_accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, OneHotEncoder, OrdinalEncoder, StandardScaler
from xgboost import XGBClassifier

# Read the training data into memory
file_path = "/kaggle/input/ds-3-datathon-2025-classifying-accidents/Classifying_accidents-train.csv"
train_set = pd.read_csv(file_path)

# Columns judged to carry no useful signal; they are dropped during preprocessing
cols_to_del = [
    'ID',
    'Zipcode',
    'Weather_Timestamp',
    'State',
    'Airport_Code',
    'Country',
]

# Highly correlated (and therefore redundant) columns; also dropped during preprocessing.
# They were picked from an exploratory correlation heatmap of the numeric columns:
#     numeric = train_set.select_dtypes(include=['float64', 'int64'])
#     sns.heatmap(numeric.corr(), annot=True, fmt=".2f", cmap='coolwarm', square=True)
cols_to_del_2 = [
    'Start_Lat',
    'Start_Lng',
    'Wind_Chill(F)',
]

In [None]:
# Heavy-tailed columns: log1p is applied to them after frequency encoding
ht_cols = ['Street', 'City', 'County']

# Columns with obvious outliers: these are winsorized
ot_cols = ['Distance(mi)', 'Pressure(in)', 'Visibility(mi)', 'Wind_Speed(mph)', 'Precipitation(in)']

# Imputers, encoders and scaler live at module level so that the state fitted on
# the training set is reused when the test set is transformed.
num_imputer = SimpleImputer(strategy='mean')
cat_imputer = SimpleImputer(strategy='most_frequent')
# handle_unknown='ignore': a category that only appears at transform time is
# encoded as an all-zero row instead of raising ValueError.
low_d_encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
std_scaler = StandardScaler()
# Encodes the target in the listed order: "Source1" -> 0, "Source2" -> 1.
# OrdinalEncoder expects 2-D input (e.g. a one-column DataFrame).
label_encoder = OrdinalEncoder(categories=[["Source1", "Source2"]])

# Continuous features that get standard-scaled
cont_features = ['End_Lat', 'End_Lng', 'Distance(mi)', 'Temperature(F)', 'Humidity(%)', 'Pressure(in)',
                 'Visibility(mi)', 'Wind_Speed(mph)', 'Precipitation(in)']
# Frequency-encoded features that get standard-scaled
freq_features = ['Street', 'City', 'County', 'Wind_Direction', 'Weather_Condition']

# Winsorization bounds learned on the training set, keyed by column name.
# Filled by preprocess_data(is_train=True) and reused when is_train=False, in the
# same way the module-level imputers/encoder/scaler keep their fitted state.
_winsor_bounds = {}


# Define preprocessing function
def preprocess_data(df, is_train=True, freq_mapping=None):
    """Clean, encode and scale an accidents DataFrame for modelling.

    Steps, in order:
      1. Drop the columns listed in ``cols_to_del`` and ``cols_to_del_2``.
      2. On training data, split off the ``'Class'`` column as the target.
      3. Impute numeric and categorical columns with ``num_imputer`` / ``cat_imputer``.
      4. One-hot encode the low-cardinality categorical columns with ``low_d_encoder``.
      5. Frequency-encode the high-cardinality categorical columns.
      6. Apply ``log1p`` to ``ht_cols`` and winsorize ``ot_cols`` (2% on each tail).
      7. Standard-scale ``cont_features + freq_features`` with ``std_scaler``.

    All fitted state (imputers, encoder, scaler, winsorization bounds, frequency
    mapping) is learned when ``is_train=True`` and only applied when
    ``is_train=False``, so the training call must come first.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data. It is not modified; a processed copy is returned.
    is_train : bool, default True
        Whether to fit the preprocessing state on ``df`` (and extract ``'Class'``).
    freq_mapping : dict or None
        ``{column: {category: count}}`` as returned by the training call.
        Required when ``is_train=False``; ignored (recomputed) when ``is_train=True``.

    Returns
    -------
    tuple
        ``(processed_df, target_variable, freq_mapping)`` where ``target_variable``
        is a one-column DataFrame for training data and ``None`` otherwise.

    Raises
    ------
    ValueError
        If ``is_train=False`` and ``freq_mapping`` is ``None``.
    RuntimeError
        If ``is_train=False`` is used before any training call stored the
        winsorization bounds.
    """
    if not is_train and freq_mapping is None:
        raise ValueError(
            "freq_mapping is required when is_train=False; "
            "pass the mapping returned by the training call."
        )

    # Drop columns (drop() returns a new frame, so the caller's df is untouched)
    df = df.drop(columns=cols_to_del)
    df = df.drop(columns=cols_to_del_2)

    # Split off 'Class' if this is the training set
    target_variable = None
    if is_train:
        target_variable = df[['Class']]
        df = df.drop(columns=['Class'])

    # Column groups, computed after dropping columns
    num_cols = df.select_dtypes(include=['float64', 'int64']).columns
    cat_cols = df.select_dtypes(include=['object', 'bool']).columns
    # High-cardinality columns are frequency-encoded instead of one-hot encoded
    high_d_cols = ['Street', 'City', 'County', 'Wind_Direction', 'Weather_Condition']
    # Keep the DataFrame's column order: a set difference would make the one-hot
    # column order depend on string hashing and differ between kernel sessions.
    low_d_cols = [col for col in cat_cols if col not in high_d_cols]

    # Impute missing values and one-hot encode the low-cardinality columns
    if is_train:
        df[num_cols] = num_imputer.fit_transform(df[num_cols])
        df[cat_cols] = cat_imputer.fit_transform(df[cat_cols])
        encoded_features = low_d_encoder.fit_transform(df[low_d_cols])
    else:
        df[num_cols] = num_imputer.transform(df[num_cols])
        df[cat_cols] = cat_imputer.transform(df[cat_cols])
        encoded_features = low_d_encoder.transform(df[low_d_cols])

    # Reuse df's index so the column-wise concat below aligns row by row even
    # when the input does not carry a default RangeIndex.
    encoded_df = pd.DataFrame(
        encoded_features,
        columns=low_d_encoder.get_feature_names_out(low_d_cols),
        index=df.index,
    )

    # Frequency encoding for high-cardinality columns (counts come from training data)
    if is_train:
        freq_mapping = {col: df[col].value_counts().to_dict() for col in high_d_cols}
    for col in high_d_cols:
        df[col] = df[col].map(freq_mapping.get(col, {}))
        # Categories missing from the mapping become NaN; fill with the column mean
        df[col] = df[col].fillna(df[col].mean())

    frequency_encoded_df = df[high_d_cols].copy()

    # Replace the original categorical columns with their encoded versions
    df = pd.concat([df.drop(columns=cat_cols), encoded_df, frequency_encoded_df], axis=1)

    # Transform heavy-tailed columns
    for col in ht_cols:
        df[col] = np.log1p(df[col])

    # Winsorize columns with obvious outliers. The bounds are learned on the
    # training set and reused afterwards, so train and test are clipped alike.
    for col in ot_cols:
        if is_train:
            df[col] = np.array(mstats.winsorize(df[col], limits=[0.02, 0.02]))
            # After winsorizing, min/max are exactly the clipping bounds
            _winsor_bounds[col] = (df[col].min(), df[col].max())
        else:
            if col not in _winsor_bounds:
                raise RuntimeError(
                    f"No winsorization bounds stored for {col!r}; "
                    "run preprocess_data(..., is_train=True) first."
                )
            lower, upper = _winsor_bounds[col]
            df[col] = df[col].clip(lower=lower, upper=upper)

    # Scale features
    scale_features = cont_features + freq_features
    if is_train:
        df[scale_features] = std_scaler.fit_transform(df[scale_features])
    else:
        df[scale_features] = std_scaler.transform(df[scale_features])

    return df, target_variable, freq_mapping

In [None]:
# Fit the preprocessing state on the training data and get the model inputs
processed_data, target_variable, freq_map = preprocess_data(train_set, is_train=True)

# Check the columns and their dtypes
print("Processed DataFrame columns and dtypes:")
print(processed_data.dtypes)

x = processed_data
y = target_variable

# Convert the target variable to numeric codes. The encoder output is flattened
# and cast to int so the model and the metric receive a plain 1-D label vector
# rather than a 2-D float column.
y_encoded = np.asarray(label_encoder.fit_transform(y)).ravel().astype(int)

# Hold out 20% for validation. Stratify so both splits keep the class ratio,
# which keeps the balanced-accuracy estimate comparable to the training mix.
x_train, x_test, y_train, y_test = train_test_split(
    x, y_encoded, test_size=0.2, random_state=42, stratify=y_encoded
)

# Initialize the XGBoost classifier
xgb_model = XGBClassifier(
    objective="binary:logistic",
    learning_rate=0.1,
    max_depth=6,
    n_estimators=200,
    random_state=42
)

# Train the model
xgb_model.fit(x_train, y_train)

# Make predictions on the hold-out split
y_pred = xgb_model.predict(x_test)

# Calculate and print the balanced accuracy (mean of per-class recall)
accuracy = balanced_accuracy_score(y_test, y_pred)
print(f"Balanced Accuracy: {accuracy:.4f}")

# To convert predicted codes back to the original labels (the encoder works on 2-D input):
# predicted_labels = label_encoder.inverse_transform(y_pred.reshape(-1, 1)).ravel()

In [None]:
# Load the competition test set and run it through the preprocessing that was
# fitted on the training data (is_train=False only applies the stored state).
test_set = pd.read_csv("/kaggle/input/ds-3-datathon-2025-classifying-accidents/Classifying_accidents - test.csv")
processed_test_data, _, _ = preprocess_data(
    test_set,
    is_train=False,
    freq_mapping=freq_map,
)

# Predict a class for every test row
y_test_pred = xgb_model.predict(processed_test_data)

# Submission frame: the test IDs next to their predictions.
# NOTE(review): 'Source' holds the model's encoded predictions, not the original
# label strings; confirm which format the submission expects.
output_df = test_set[['ID']].assign(Source=y_test_pred)

print(output_df.head())


In [None]:
# Write the submission file without the DataFrame index column
submission_path = "car_acc_submission.csv"
output_df.to_csv(submission_path, index=False)