In [1]:
import numpy as np
import pandas as pd

from sklearn.metrics import f1_score, roc_auc_score

from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.neural_network import MLPClassifier

In [2]:
features_df = pd.read_csv(
    "train.csv", 
    index_col="tripid"
)

In [3]:
features_df['label'] = np.where(features_df['label']=='correct', 1, 0)
testSet = features_df['label']
features_df= features_df.drop(['label'],axis=1)

In [4]:
def formatDf(dataFrame):
    dataFrame.pickup_time = pd.to_datetime(dataFrame.pickup_time)

    dataFrame['is_fare_NAN'] = np.where(pd.isna(dataFrame['fare']), 1, 0)

    dataFrame['running_time'] = dataFrame['duration'] - dataFrame['meter_waiting']

    dataFrame['total_fare'] = dataFrame['fare'] - dataFrame['additional_fare'] - dataFrame['meter_waiting_fare']

    dataFrame['diff_lat'] = dataFrame['drop_lat'] - dataFrame['pick_lat']
    dataFrame['diff_lon'] = dataFrame['drop_lon'] - dataFrame['pick_lon']

    dataFrame['distance'] = (dataFrame['diff_lat']**2 + dataFrame['diff_lon']**2)**(1/2)

    dataFrame['pickup_hour'] = dataFrame['pickup_time'].apply(lambda row: row.hour)

    dataFrame= dataFrame.drop(['diff_lat','diff_lon','drop_time','meter_waiting_till_pickup','pickup_time','drop_lat','drop_lon','pick_lat','pick_lon'],axis=1)
    
    return dataFrame

In [5]:
features_df = formatDf(features_df)

In [7]:
numeric_cols = features_df.columns[features_df.dtypes != "object"].values
numeric_preprocessing_steps = Pipeline([
    ('standard_scaler', StandardScaler()),
    ('simple_imputer', SimpleImputer(strategy='median'))
])

In [8]:
X_train, X_eval, y_train, y_eval = train_test_split(
    features_df,
    testSet,
    test_size=0.3,
    random_state=6,
    stratify=testSet
)

preprocessor = ColumnTransformer(
    transformers = [
        ("numeric", numeric_preprocessing_steps, numeric_cols)
    ],
    remainder = "drop"
)


In [9]:
mlp=MLPClassifier( solver='lbfgs', alpha=1, hidden_layer_sizes=(10,2), random_state=0,
                         max_iter=1000, activation='tanh')

rn=RandomForestClassifier(n_estimators=300)
rn2=RandomForestClassifier(n_estimators=100)

estimator = XGBClassifier(alpha = 0.01,learning_rate=0.1, n_estimators=1400, 
                          min_child_weight=3,gamma=1.4,subsample=0.9,colsample_bytree=0.8,max_depth=3)

estimator2 = XGBClassifier(alpha = 0.01,learning_rate=0.1, n_estimators=1100, 
                          min_child_weight=3,gamma=1.4,subsample=0.9,colsample_bytree=0.8,max_depth=3)
            
classifiers = [('RandomForestClassifier1', rn),('xgb1', estimator), ('RandomForestClassifier2', rn2), ('xgb2', estimator2),('mlp', mlp)]

vc = VotingClassifier(estimators=classifiers,voting='soft')  

full_pipeline = Pipeline([
    ("preprocessor", preprocessor),
    ("estimators", vc),
])

In [10]:
full_pipeline.fit(X_train, y_train)
preds = full_pipeline.predict(X_eval)

In [11]:
y_preds = pd.DataFrame(
    {
        "label": preds
    },
    index = y_eval.index
)

In [12]:
print(full_pipeline.score(X_eval,y_eval))

0.9543954977682904


In [13]:
print(roc_auc_score(y_eval, y_preds))

0.80755269950527


In [14]:
print(f1_score(y_eval, y_preds, average='macro'))

0.8517161342147429


In [15]:
full_pipeline.fit(features_df, testSet)
None

In [16]:
preds2 = full_pipeline.predict(features_df)

y_preds2 = pd.DataFrame(
    {
        "label": preds2
    },
    index = testSet.index
)

In [17]:
test_features_df = pd.read_csv("test.csv", 
                               index_col="tripid")

test_features_df = formatDf(test_features_df)

In [18]:
test_probas = full_pipeline.predict(test_features_df)

submission_df = pd.read_csv("sample_submission.csv", 
                            index_col="tripid")

np.testing.assert_array_equal(test_features_df.index.values, 
                              submission_df.index.values)
submission_df["prediction"] = test_probas

submission_df.to_csv('my_submission.csv', index=True)