In [1]:
import pandas as pd

In [2]:
import numpy as np

In [83]:
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import log_loss
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier

In [13]:
# Read test.csv into `test`; later cells read its route_turns, alt_profile,
# wind_token and Id columns.
test = pd.read_csv("test.csv")

In [12]:
# Read train.csv into `train`; later cells take the model features and the
# `on_time` target from it.
train = pd.read_csv("train.csv")

In [76]:
# Show the training frame.
# NOTE(review): the saved output already lists num_turns, alt_changes and the
# wind_* columns, which appear to be created only by the feature cells further
# down, so this cell was presumably executed after them. On a fresh
# top-to-bottom run those columns would not be present at this point.
train

Unnamed: 0,Id,distance_km,payload_kg,drone_model,slot_15min,wind_token,route_turns,alt_profile,landing_zone,operator_tag,on_time,num_turns,alt_changes,wind_direction,wind_avg_speed,wind_gust
0,1662,3.498,2.676,B,75,S-0/4,UUUUURLUD,0-0-0-0-1-1-1-1-1,LZ-146,safe,1,2,1,S,0,4
1,4591,2.209,1.480,B,30,SE-3/5,LLLULLLULLLDLUUULUUU,1-1-1-2-2-2-2-3-2-1-2-3-2-1-1-2-3-2-3-3,LZ-278,balanced,0,11,12,SE,3,5
2,38756,1.876,0.267,A,26,N-3/5,RRURDDDRDDDRRURDRDRD,0-0-0-1-1-1-1-0-1-1-2-2-2-3-3-3-3-3-3-3,LZ-108,balanced,0,9,5,N,3,5
3,31027,1.496,0.510,A,9,E-10/18,LLDDDDLDDRDDRDDDDDDD,0-1-0-0-0-1-2-2-2-2-2-2-3-3-3-3-3-3-3-3,LZ-243,balanced,0,5,5,E,10,18
4,1642,2.041,0.240,B,52,SW-11/19,DRRDRRRURDRRDDLDDLDR,1-1-1-1-1-1-2-2-2-2-2-2-2-1-1-1-2-2-2-2,LZ-030,safe,0,11,3,SW,11,19
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39995,3320,2.677,0.506,A,40,S-2/11,RURUULUURRRDRURRUURU,0-0-1-1-2-2-2-1-2-1-2-3-3-3-3-3-3-3-3-3,LZ-175,safe,0,10,7,S,2,11
39996,24733,2.823,1.319,C,54,NW-4/11,LLDLLLLLLDLLLLLULLLL,0-0-0-0-1-0-0-0-0-0-0-1-1-1-2-1-1-1-0-0,LZ-284,balanced,1,17,6,NW,4,11
39997,33296,3.577,0.410,B,62,NE-11/17,RDRRRDRDRRURRRDRRDDR,0-1-1-1-1-1-0-0-0-1-1-0-0-0-1-0-0-0-0-0,LZ-133,safe,0,13,6,NE,11,17
39998,5647,3.715,1.998,B,83,N-8/13,LLLLLULUU,0-0-0-0-0-0-1-2-2,LZ-074,balanced,1,6,2,N,8,13


In [54]:
def count_turns(route):
    """Return how many characters of ``route`` are 'L' or 'R'."""
    return route.count('L') + route.count('R')


def total_altitude_change(profile):
    """Sum the absolute differences between consecutive levels of a profile.

    ``profile`` is a '-'-separated string of integer levels such as
    ``"0-0-1-2-1"``. The string is split exactly once (the previous lambda
    re-split it for every index, which made the work quadratic in the number
    of levels). A profile with fewer than two parts has no consecutive pair,
    so it yields 0 without any integer parsing.

    Raises:
        ValueError: if a profile with two or more parts contains a part that
            is not an integer literal.
    """
    parts = profile.split('-')
    if len(parts) < 2:
        return 0
    levels = [int(part) for part in parts]
    return sum(abs(nxt - cur) for cur, nxt in zip(levels, levels[1:]))


# Derive the same two columns on both frames with the shared helpers so the
# train and test features cannot drift apart.
train['num_turns'] = train['route_turns'].apply(count_turns)
test['num_turns'] = test['route_turns'].apply(count_turns)

train['alt_changes'] = train['alt_profile'].apply(total_altitude_change)
test['alt_changes'] = test['alt_profile'].apply(total_altitude_change)

In [75]:
def extract_wind_features(df):
    """Split ``wind_token`` into direction, average-speed and gust columns.

    Tokens are expected to look like ``"<direction>-<avg>/<gust>"``, for
    example ``"SE-3/5"``. Each token is parsed once with a single regular
    expression instead of being split three separate times.

    Args:
        df: DataFrame with a ``wind_token`` column.

    Returns:
        The same ``df`` object, modified in place, with three added columns:
        ``wind_direction`` (text before the dash), ``wind_avg_speed`` and
        ``wind_gust`` (int64 values before and after the slash).

    Raises:
        ValueError: if any token does not match the expected format; the
            message includes up to five offending values.
    """
    parsed = df['wind_token'].astype(str).str.extract(
        r'^(?P<direction>[^-]+)-(?P<avg>\d+)/(?P<gust>\d+)$'
    )

    # A row that failed to match has NaN in every captured group.
    bad = parsed.isna().any(axis=1).to_numpy()
    if bad.any():
        examples = df.loc[bad, 'wind_token'].head(5).tolist()
        raise ValueError(
            f"{int(bad.sum())} malformed wind_token value(s), e.g. {examples!r}"
        )

    df['wind_direction'] = parsed['direction']
    df['wind_avg_speed'] = parsed['avg'].astype('int64')
    df['wind_gust'] = parsed['gust'].astype('int64')
    return df

# extract_wind_features adds its columns to the frame it receives and returns
# that same object, so these assignments simply rebind the existing names.
train = extract_wind_features(train)
test = extract_wind_features(test)

In [77]:
# Feature matrix: raw columns from the CSV together with the engineered
# turn, altitude and wind columns. The column order here is reused when the
# test features are selected.
X = train[
    [
        "distance_km",
        "payload_kg",
        "num_turns",
        "alt_changes",
        "drone_model",
        "slot_15min",
        "operator_tag",
        "wind_direction",
        "wind_avg_speed",
        "wind_gust",
        "landing_zone",
    ]
]

In [78]:
# Target vector: the on_time column of the training frame.
y = train["on_time"]

In [79]:
# Quick look at the target values, length and dtype.
y

0        1
1        0
2        0
3        0
4        0
        ..
39995    0
39996    1
39997    0
39998    1
39999    0
Name: on_time, Length: 40000, dtype: int64

In [80]:
# Select the same feature columns, in the same order, as the training matrix X.
X_test = test[X.columns]

In [81]:
# Preview the test feature matrix.
X_test

Unnamed: 0,distance_km,payload_kg,num_turns,alt_changes,drone_model,slot_15min,operator_tag,wind_direction,wind_avg_speed,wind_gust,landing_zone
0,3.498,0.420,14,6,A,79,aggressive,E,6,17,LZ-265
1,3.851,1.470,8,7,A,82,balanced,SE,12,20,LZ-268
2,1.182,0.168,9,7,A,11,balanced,SE,14,23,LZ-031
3,2.981,1.096,5,3,A,32,safe,E,7,13,LZ-122
4,3.662,0.335,19,7,B,59,balanced,NE,11,24,LZ-035
...,...,...,...,...,...,...,...,...,...,...,...
9995,3.174,1.526,15,10,B,38,balanced,SE,11,18,LZ-276
9996,3.249,0.465,14,9,A,9,safe,W,5,14,LZ-200
9997,1.098,2.682,8,6,A,6,balanced,N,14,19,LZ-056
9998,2.378,2.403,12,7,A,62,balanced,NW,2,9,LZ-283


In [82]:
# Preprocessing: the listed categorical columns are one-hot encoded, with
# handle_unknown="ignore" set on the encoder; the numeric columns are passed
# through unchanged. slot_15min holds integers but is encoded as a category.
prep = ColumnTransformer(
    transformers=[
        (
            "categorical",
            OneHotEncoder(handle_unknown="ignore"),
            [
                "drone_model",
                "slot_15min",
                "operator_tag",
                "wind_direction",
                "landing_zone",
            ],
        ),
        (
            "numerical",
            "passthrough",
            [
                "distance_km",
                "payload_kg",
                "num_turns",
                "alt_changes",
                "wind_avg_speed",
                "wind_gust",
            ],
        ),
    ]
)


In [84]:
# Full model: the column preprocessing followed by a 100-tree random forest
# with a fixed random_state.
model = Pipeline(
    steps=[
        ("prep", prep),
        ("clf", RandomForestClassifier(n_estimators=100, random_state=42)),
    ]
)

In [85]:
# Hold out 20% of the labelled rows for validation; random_state is fixed so
# the split is the same on every run.
X_tr, X_val, y_tr, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [86]:
# Fit the whole pipeline (preprocessing + classifier) on the training split only.
model.fit(X_tr, y_tr)

In [87]:
# Second column of predict_proba for the validation rows.
# NOTE(review): with 0/1 labels this is presumably P(on_time == 1) — confirm
# against model.classes_.
model.predict_proba(X_val)[:,1]

array([0.58, 0.88, 0.64, ..., 0.16, 0.9 , 0.9 ])

In [88]:
# Log loss on the validation split, scored from the second predict_proba column.
log_loss(y_val, model.predict_proba(X_val)[:,1])

0.46820685752831387

In [89]:
# Second predict_proba column for the test features.
# NOTE(review): `model` was fit on X_tr only (the 80% split); it is not refit
# on all labelled rows before these predictions are made.
test_pred = model.predict_proba(X_test)[:,1]

In [90]:
# Two-column submission frame: the Id column of the test frame next to the
# predicted probabilities held in test_pred.
submission = pd.DataFrame(
    data={
        "Id": test["Id"],
        "on_time": test_pred,
    }
)
    

In [91]:
# Preview the submission frame before writing it out.
submission

Unnamed: 0,Id,on_time
0,45558,0.65
1,454,0.12
2,49297,0.66
3,45176,0.64
4,15904,0.71
...,...,...
9995,17056,0.18
9996,9046,0.24
9997,46532,0.30
9998,35156,0.14


In [92]:
# Write the submission to disk; index=False keeps the DataFrame index out of
# the file so it contains only the Id and on_time columns.
submission.to_csv("submission.csv", index=False)