In [2]:
#import the data
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
# Load the rides-by-user table; the path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='rides_by_user')

In [3]:
# Preview the first five rows to sanity-check the columns that were loaded.
df.head(n=5)

Unnamed: 0,user_id,platform,age_range,time_to_signup,time_to_first_request,mean_wait_time,acceptance_rate,churned
0,103775,web,25-34,103080.0,86440.0,1000.0,0.111111,1
1,113929,android,Unknown,121140.0,70782.0,340.8,1.0,0
2,103676,web,35-44,60900.0,21020391.0,407.727273,1.0,0
3,117225,ios,35-44,37920.0,122001.0,1020.0,0.16,1
4,100066,web,45-54,88320.0,7547249.0,351.818182,1.0,0


In [4]:
# Print column dtypes and non-null counts. In the recorded output below,
# mean_wait_time is the only column with missing values (12278 of 12406 non-null).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12406 entries, 0 to 12405
Data columns (total 8 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   user_id                12406 non-null  int64  
 1   platform               12406 non-null  object 
 2   age_range              12406 non-null  object 
 3   time_to_signup         12406 non-null  float64
 4   time_to_first_request  12406 non-null  float64
 5   mean_wait_time         12278 non-null  float64
 6   acceptance_rate        12406 non-null  float64
 7   churned                12406 non-null  int64  
dtypes: float64(4), int64(2), object(2)
memory usage: 775.5+ KB


In [5]:
# One-hot encode the categorical features, dropping the first level of each
# so the dummy columns for a feature are not perfectly collinear.
enc = OneHotEncoder(handle_unknown='ignore', drop='first')
cat_cols = ['platform', 'age_range']
onehot = pd.DataFrame(
    enc.fit_transform(df[cat_cols]).toarray(),
    columns=enc.get_feature_names_out(),
    # DataFrame.join aligns on index labels; reuse df's index so the encoded
    # rows stay matched to their source rows even if df is not 0..n-1.
    index=df.index,
)
# enc_df = original columns + one-hot columns (the raw categoricals are dropped later).
enc_df = df.join(onehot)

In [6]:
# Remove the identifier and the raw categorical columns (their one-hot
# encodings stay), then discard every row that still has a missing value.
data = enc_df.drop(columns=['user_id', 'platform', 'age_range']).dropna()

In [7]:
# Target is the churn flag; the feature matrix is every other column of data.
target_col = 'churned'
y = data[target_col]
X = data.drop(columns=target_col)

In [8]:
#decision tree classifier, no need for scaling
from sklearn.model_selection import KFold, cross_validate
from sklearn.tree import DecisionTreeClassifier
# Seed the tree: when several candidate splits are equally good, scikit-learn
# picks among them at random, so an unseeded tree can report different
# feature importances from one run to the next.
dt = DecisionTreeClassifier(random_state=8)
cv = KFold(n_splits=5, shuffle=True, random_state=8)
cv_score = cross_validate(dt, X, y, cv=cv, return_estimator=True)
print(cv_score['test_score'])
#churn is entirely predicted by avg wait time and acceptance rate (perfect separation)
# Importances follow the column order of X (index 2 = mean_wait_time,
# index 3 = acceptance_rate). In the recorded run each fold's tree put all of
# its importance on just one of those two columns.
# NOTE(review): perfect accuracy from a single feature suggests `churned` may
# have been derived from these columns (target leakage) -- confirm how the
# label was constructed before trusting these scores.
# Iterate over the fitted per-fold estimators instead of hard-coding the fold count.
for fold_tree in cv_score['estimator']:
    print(fold_tree.feature_importances_)

[1. 1. 1. 1. 1.]
[0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]
[0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]


In [9]:
#trying a pipeline with KNN, using ColumnTransformer with StandardScaler so the scaler is fit inside each fold, avoiding leakage during cross-validation
#no need for hyperparameter tuning given the model's perfect performance
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import KFold, cross_val_score
# Continuous columns to standardise; every other column (the one-hot dummies)
# is passed through unchanged by the ColumnTransformer.
continuous = ['time_to_signup', 'time_to_first_request', 'mean_wait_time', 'acceptance_rate']
X_cont = X[continuous].columns

# The scaler is a pipeline step, so cross-validation re-fits it on the
# training portion of each fold only.
t = [('cont', StandardScaler(), X_cont)]
ct = ColumnTransformer(transformers=t, remainder='passthrough')
knn = KNeighborsClassifier()
steps = [('scaling', ct), ('knn', knn)]
pipeline = Pipeline(steps=steps)

# 5-fold shuffled split with a fixed seed so the folds are reproducible.
cv = KFold(n_splits=5, shuffle=True, random_state=8)
cv_score = cross_val_score(estimator=pipeline, X=X, y=y, cv=cv)
print(cv_score)


[1. 1. 1. 1. 1.]
