In [1]:
import pandas as pd

In [4]:
# Load the churn dataset straight from GitHub; the two date columns are parsed
# as datetimes so they can be compared against a cutoff date later on.
churn_data = pd.read_csv(
    'https://raw.githubusercontent.com/zekelabs/machine-learning-for-beginners/master/data/churn.csv.txt',
    parse_dates=['last_trip_date', 'signup_date'],
)

In [3]:
# Peek at the first rows to check the columns and the parsed date values.
churn_data.head()

Unnamed: 0,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_surge,city,last_trip_date,phone,signup_date,surge_pct,trips_in_first_30_days,luxury_car_user,weekday_pct
0,3.67,5.0,4.7,1.1,King's Landing,2014-06-17,iPhone,2014-01-25,15.4,4,True,46.2
1,8.26,5.0,5.0,1.0,Astapor,2014-05-05,Android,2014-01-29,0.0,0,False,50.0
2,0.77,5.0,4.3,1.0,Astapor,2014-01-07,iPhone,2014-01-06,0.0,3,False,100.0
3,2.36,4.9,4.6,1.14,King's Landing,2014-06-29,iPhone,2014-01-10,20.0,9,True,80.0
4,3.13,4.9,4.4,1.19,Winterfell,2014-03-15,Android,2014-01-27,11.8,14,False,82.4


In [5]:
# Column dtypes and non-null counts: shows which columns contain missing values.
churn_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50000 entries, 0 to 49999
Data columns (total 12 columns):
avg_dist                  50000 non-null float64
avg_rating_by_driver      49799 non-null float64
avg_rating_of_driver      41878 non-null float64
avg_surge                 50000 non-null float64
city                      50000 non-null object
last_trip_date            50000 non-null datetime64[ns]
phone                     49604 non-null object
signup_date               50000 non-null datetime64[ns]
surge_pct                 50000 non-null float64
trips_in_first_30_days    50000 non-null int64
luxury_car_user           50000 non-null bool
weekday_pct               50000 non-null float64
dtypes: bool(1), datetime64[ns](2), float64(6), int64(1), object(2)
memory usage: 4.2+ MB


In [6]:
# Most recent trip date in the data; the churn cutoff is measured back from it.
churn_data.last_trip_date.max()

Timestamp('2014-07-01 00:00:00')

In [7]:
import datetime

# Cutoff = 30 days before the most recent trip recorded in the dataset.
# Riders whose last trip is earlier than this are labelled as churned.
date_cutoff = churn_data['last_trip_date'].max() - datetime.timedelta(days=30)

In [8]:
# Binary label: 1 when the last trip is strictly before the cutoff (churned),
# 0 when the rider took a trip on or after the cutoff.
churn_data['churn'] = churn_data['last_trip_date'].lt(date_cutoff).astype(int)

In [9]:
# Confirm the new 'churn' column was added next to the original features.
churn_data.head()

Unnamed: 0,avg_dist,avg_rating_by_driver,avg_rating_of_driver,avg_surge,city,last_trip_date,phone,signup_date,surge_pct,trips_in_first_30_days,luxury_car_user,weekday_pct,churn
0,3.67,5.0,4.7,1.1,King's Landing,2014-06-17,iPhone,2014-01-25,15.4,4,True,46.2,0
1,8.26,5.0,5.0,1.0,Astapor,2014-05-05,Android,2014-01-29,0.0,0,False,50.0,1
2,0.77,5.0,4.3,1.0,Astapor,2014-01-07,iPhone,2014-01-06,0.0,3,False,100.0,1
3,2.36,4.9,4.6,1.14,King's Landing,2014-06-29,iPhone,2014-01-10,20.0,9,True,80.0,0
4,3.13,4.9,4.4,1.19,Winterfell,2014-03-15,Android,2014-01-27,11.8,14,False,82.4,1


In [10]:
# Drop every row that has a missing value in any column. Rebinding the name
# (instead of inplace=True) keeps the step explicit and chainable.
# NOTE(review): per the info() output, most gaps are in avg_rating_of_driver
# (and a few in phone), neither of which is used by the feature mapper below;
# dropna(subset=[...]) on the columns actually used would keep more rows.
churn_data = churn_data.dropna()

In [14]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler, LabelEncoder

In [11]:
from sklearn_pandas  import DataFrameMapper

In [28]:
# Per-column preprocessing: each entry is (column selector, transformer).
# A transformer of None passes the column through unchanged.
# NOTE(review): per the sklearn-pandas docs a list selector (['col']) hands the
# transformer a 2-D column while a bare string hands it a 1-D array, which is
# why the scalers use lists and LabelEncoder uses a plain name - confirm
# against the installed version.
df_mapper = DataFrameMapper([
    (['avg_dist'], StandardScaler()),
    (['avg_rating_by_driver'], MinMaxScaler()),
    ('luxury_car_user', LabelEncoder()),
    ('surge_pct', None),
    ('weekday_pct', None),
    ('trips_in_first_30_days', None),
])

In [36]:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier

In [30]:
from sklearn.feature_selection import SelectKBest

In [31]:
# End-to-end model: map/scale the selected columns, keep the 4 best-scoring
# features, then classify with a 10-tree random forest.
# NOTE(review): no random_state is set on the forest, so results vary per run.
pipeline = Pipeline([
    ('feature_mapper', df_mapper),
    ('feature_selector', SelectKBest(k=4)),
    ('clf', RandomForestClassifier(n_estimators=10)),
])

In [32]:
# Fit on the full frame (no held-out data at this stage). churn_data still
# carries the 'churn' label column, but the mapper is built with default=False
# (see the repr below), so only the columns it lists reach the model.
pipeline.fit(churn_data, churn_data.churn)

Pipeline(memory=None,
     steps=[('feature_mapper', DataFrameMapper(default=False, df_out=False,
        features=[(['avg_dist'], StandardScaler(copy=True, with_mean=True, with_std=True)), (['avg_rating_by_driver'], MinMaxScaler(copy=True, feature_range=(0, 1))), ('luxury_car_user', LabelEncoder()), ('surge_pct', None), ('we...n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False))])

### Next Level: comparing several classifiers on a held-out test split

In [33]:
from sklearn.model_selection import train_test_split

# Hold out a test split for evaluation. The label column is removed from the
# features, and a fixed random_state makes the split (and therefore the
# reported scores) repeatable across kernel restarts.
trainX, testX, trainY, testY = train_test_split(
    churn_data.drop('churn', axis=1),
    churn_data.churn,
    random_state=42,
)

In [48]:
class Classifiers(object):
    """Fit and score several classifiers behind one shared feature mapper.

    Every model in ``clsf`` is wrapped in its own two-step ``Pipeline``
    (``'mapper'`` followed by ``'clsf'``). The pipelines are stored in
    ``self.pipelines`` in the same order as the models were given.
    """

    def __init__(self, clsf, mapper):
        """Store the models and build one pipeline per model.

        Parameters
        ----------
        clsf : iterable
            Estimators to compare; each becomes the final pipeline step.
        mapper : object
            Transformer used as the first step of every pipeline. The same
            object is shared by all pipelines (it is not copied).
        """
        self.clsf = clsf
        self.pipelines = []
        self.create_pipeline(mapper)

    def create_pipeline(self, mapper):
        """Append a ``mapper -> model`` pipeline for each model in ``self.clsf``."""
        for model in self.clsf:
            self.pipelines.append(Pipeline([
                ('mapper', mapper),
                ('clsf', model),
            ]))

    def fit(self, trainX, trainY):
        """Fit every pipeline on the same training features and labels."""
        for pipeline in self.pipelines:
            pipeline.fit(trainX, trainY)

    def score(self, testX, testY):
        """Print and return each pipeline's score on the given data.

        Returns
        -------
        list
            One score per pipeline, in the same order as ``self.pipelines``,
            so callers can use the values instead of only reading the output.
        """
        scores = []
        for pipeline in self.pipelines:
            result = pipeline.score(testX, testY)
            print(result)
            scores.append(result)
        return scores
             

In [49]:
# Compare a random forest and AdaBoost behind the same feature mapper.
# Both models are seeded so the comparison is repeatable between runs.
clsfs = Classifiers(
    [
        RandomForestClassifier(random_state=42),
        AdaBoostClassifier(random_state=42),
    ],
    df_mapper,
)

In [50]:
# Fit every pipeline on the training split.
clsfs.fit(trainX,trainY)

In [51]:
# Print each pipeline's score on the held-out split, in the order the
# classifiers were passed in (random forest first, then AdaBoost).
clsfs.score(testX,testY)

0.6970662034356302
0.7348002316155182


In [56]:
# One array of predicted labels per fitted pipeline, in pipeline order.
[fitted.predict(testX) for fitted in clsfs.pipelines]

[array([1, 0, 0, ..., 1, 1, 0]), array([1, 1, 1, ..., 0, 1, 0])]