# Pipelines using Pandas and scikit-learn.

# David Brookes July 2021.

In [1]:
import pandas as pd

# Location of the permits export (raw string so the backslashes stay literal).
csv_path = r'D:\My Documents\Python Code\Pipelines\Special_Events_Permits.csv'
df = pd.read_csv(csv_path)
df.head()

Unnamed: 0,Application Date,Permit Status,Permit Type,Event Category,Event Sub-Category,Name of Event,Year-Month-App#,Event Start Date,Event End Date,Event Location - Park,Event Location - Neighborhood,Council District,Precinct,Organization,Attendance
0,,Cancelled,Charter Vessel,,,Argosy - Lady Mary,CV14JY314,07/04/2014 12:00:00 AM,07/04/2014 12:00:00 AM,,Fremont,,,Argosy LP,170.0
1,,Complete,Charter Vessel,,,Argosy - Celebrations,CV14JY315,07/04/2014 12:00:00 AM,07/04/2014 12:00:00 AM,,Fremont,,,Argosy LP,100.0
2,,Complete,Charter Vessel,,,Argosy - Goodtime 3,CV14JY316,07/04/2014 12:00:00 AM,07/04/2014 12:00:00 AM,,Fremont,,,Argosy LP,170.0
3,,Complete,Charter Vessel,,,Argosy - Sightseer,CV14JY317,07/04/2014 12:00:00 AM,07/04/2014 12:00:00 AM,,Fremont,,,Argosy LP,130.0
4,,Complete,Charter Vessel,,,Argosy - Champagne Lady,CV14JY318,07/04/2014 12:00:00 AM,07/04/2014 12:00:00 AM,,Fremont,,,Argosy LP,90.0


In [2]:
# Normalise the headers: lower-case them and swap spaces for underscores.
# (Other punctuation such as '-' and '#' is left as it is.)
df.columns = df.columns.str.lower().str.replace(' ', '_')

print(df.columns, df.shape, sep='\n')


Index(['application_date', 'permit_status', 'permit_type', 'event_category',
       'event_sub-category', 'name_of_event', 'year-month-app#',
       'event_start_date', 'event_end_date', 'event_location_-_park',
       'event_location_-_neighborhood', 'council_district', 'precinct',
       'organization', 'attendance'],
      dtype='object')
(3433, 15)


In [3]:
# Look at the first start date: both its Python type and its raw value.
first_start_date = df.loc[0, 'event_start_date']
print(type(first_start_date))
print(first_start_date)

<class 'str'>
07/04/2014 12:00:00 AM


In [4]:
# Function to extract time information.

# Sample value (row label 0) used to try out the parser below.
time_info = df.loc[0, 'event_start_date']

def extract_info(time_info):
    """Split a timestamp string such as '07/04/2014 12:00:00 AM' into its parts.

    The string is tokenised on whitespace and the date token on its
    separator, so month, day and hour do not have to be zero-padded
    (fixed character offsets would mis-slice '7/4/2014 1:05:00 PM').

    Parameters
    ----------
    time_info : str
        Timestamp laid out as 'MM/DD/YYYY HH:MM:SS AM'. '-' is also accepted
        as the date separator.

    Returns
    -------
    tuple
        (month, day, year, time, am_pm). month, day and year are ints;
        time and am_pm are strings, empty when the input has no such token.

    Raises
    ------
    TypeError
        If time_info is not a string (e.g. a NaN from a missing cell).
    ValueError
        If the date token does not consist of three integer fields.
    """
    if not isinstance(time_info, str):
        raise TypeError(
            f"time_info must be a str, got {type(time_info).__name__}")

    tokens = time_info.split()
    date_fields = tokens[0].replace('-', '/').split('/') if tokens else []
    if len(date_fields) != 3:
        raise ValueError(f"cannot parse a date from {time_info!r}")

    # int() raises ValueError itself if a field is not numeric.
    month, day, year = (int(field) for field in date_fields)
    time = tokens[1] if len(tokens) > 1 else ''
    am_pm = tokens[2] if len(tokens) > 2 else ''

    return (month, day, year, time, am_pm)
    
# Parse the sample value and show each component on its own line,
# with the clock time and AM/PM marker together on the last line.
month, day, year, time, am_pm = extract_info(time_info)

print(month, day, year, sep='\n')
print(time, am_pm)

7
4
2014
12:00:00 AM


In [5]:
# Restrict the analysis to events whose start date falls in 2016.
# Build one True/False flag per row, then use the flags as a row mask.
booleans = []
for time_info in df['event_start_date']:
    month, day, year, time, am_pm = extract_info(time_info)
    booleans.append(year == 2016)

df_2016 = df.loc[booleans]

print(df_2016.head(), df_2016.shape, sep='\n')

          application_date permit_status     permit_type event_category  \
27  05/31/2016 12:00:00 AM      Complete  Charter Vessel            NaN   
28  05/31/2016 12:00:00 AM      Complete  Charter Vessel            NaN   
29  06/01/2016 12:00:00 AM      Complete  Charter Vessel            NaN   
30  06/01/2016 12:00:00 AM      Complete  Charter Vessel            NaN   
31  06/01/2016 12:00:00 AM      Complete  Charter Vessel            NaN   

   event_sub-category                        name_of_event year-month-app#  \
27                NaN  Sternwheeler Charters - Christine W       CV16JY325   
28                NaN         Anchor Bay Charters - Seeker       CV16JY326   
29                NaN       Waterways Cruises-Emerald Star       CV16JY328   
30                NaN       Waterways Cruises-Olympic Star       CV16JY329   
31                NaN          Waterways Cruises-West Star       CV16JY330   

          event_start_date          event_end_date event_location_-_park  \
27  

# Build a machine learning model.
- Outcome - permit_status
    - Binary - will the event be "Complete" or not?

- Features 
    - everything else!
    - Raw, transformed, combinations etc.

# Modelling with scikit-learn.
- Set aside test data. No peeking!

In [6]:
from sklearn.model_selection import train_test_split
# train_test_split shuffles the rows before splitting. Without a fixed seed
# every Restart & Run All would give a different split and different scores.
RANDOM_STATE = 42
df_train, df_test = train_test_split(df_2016, random_state=RANDOM_STATE)

- Define outcome, and also one feature only.

In [7]:
import numpy as np

# Binary outcome: 1 where the permit status is 'Complete', otherwise 0.
y_train, y_test = (
    np.where(frame['permit_status'] == 'Complete', 1, 0)
    for frame in (df_train, df_test)
)

# Single feature: attendance, with missing values replaced by 0.
X_train, X_test = (
    frame[['attendance']].fillna(value=0)
    for frame in (df_train, df_test)
)


In [8]:
# Fit the model.
# Create model object.

from sklearn.linear_model import LogisticRegression
# Build the estimator with its default settings and fit it in one go
# (fit() returns the estimator itself).
model = LogisticRegression().fit(X_train, y_train)

# In-sample predictions: hard labels, and the probability of class 1.
y_pred_train = model.predict(X_train)
p_pred_train = model.predict_proba(X_train)[:, 1]


In [9]:
# Evaluation.

# Score the held-out rows two ways:
#   - baseline: the same constant (training-set mean of the outcome) for every row
#   - model: the fitted classifier's probability of class 1
baseline_rate = y_train.mean()
p_baseline = [baseline_rate] * len(y_test)
p_pred_test = model.predict_proba(X_test)[:, 1]

# Compare both sets of scores on the test set using ROC AUC.
from sklearn.metrics import roc_auc_score
auc_base, auc_test = (
    roc_auc_score(y_test, scores) for scores in (p_baseline, p_pred_test)
)

print('auc_base:', auc_base)
print('auc_test:', auc_test)

auc_base: 0.5
auc_test: 0.3961004273504274


# Transformers.

In [10]:
# Several transformations of the data may be required.
# For example, imputation followed by creation of polynomial features
# followed by standardisation.

from sklearn.impute import SimpleImputer
from sklearn.preprocessing import (PolynomialFeatures,
                                  StandardScaler)
#imputer = SimpleImputer()
#quadratic = PolynomialFeatures()
#standardiser = StandardScaler()

Instead of writing this:-   \
X_train_imp = imputer.fit_transform(X_train_raw)   \
X_train_quad = quadratic.fit_transform(X_train_imp)   \
X_train = standardiser.fit_transform(X_train_quad)   

and

X_test_imp = imputer.transform(X_test_raw)   \
X_test_quad = quadratic.transform(X_test_imp)   \
X_test = standardiser.transform(X_test_quad)   

Create a pipeline instead!

In [11]:
from sklearn.pipeline import Pipeline

# Chain the three preprocessing stages; each step is a (name, transformer) pair.
preprocessing_steps = [
    ('imputer', SimpleImputer()),
    ('quadratic', PolynomialFeatures()),
    ('standardiser', StandardScaler()),
]
pipeline = Pipeline(steps=preprocessing_steps)

# Fit every stage on the training rows, then apply the fitted stages to the test rows.
X_train_pipeline_processed = pipeline.fit_transform(X_train)
X_test_pipeline_processed = pipeline.transform(X_test)

In [12]:
# Raw single-feature training matrix (the input handed to the pipeline above).
print(X_train)

      attendance
1184       325.0
1320       705.0
1564       310.0
1282      2000.0
1233      3600.0
...          ...
1251        60.0
1511       250.0
37          30.0
1362       705.0
1298      5030.0

[396 rows x 1 columns]


In [13]:
# The same training rows after the pipeline's impute -> polynomial -> standardise steps.
print(X_train_pipeline_processed)

[[ 0.         -0.21186911 -0.10205332]
 [ 0.         -0.19234759 -0.10195311]
 [ 0.         -0.2126397  -0.10205576]
 ...
 [ 0.         -0.22702398 -0.10208013]
 [ 0.         -0.19234759 -0.10195311]
 [ 0.          0.02983817 -0.09560263]]


In [14]:
# A useful function - FunctionTransformer().

from sklearn.preprocessing import FunctionTransformer

# FunctionTransformer wraps a plain function (np.log1p here; any function
# can be used) so that it can be called through the transformer interface.
logger = FunctionTransformer(func=np.log1p)

X_train_log = logger.transform(X_train)
print(X_train_log)

      attendance
1184    5.786897
1320    6.559615
1564    5.739793
1282    7.601402
1233    8.188967
...          ...
1251    4.110874
1511    5.525453
37      3.433987
1362    6.559615
1298    8.523374

[396 rows x 1 columns]


Or, create a custom transformer!

In [15]:
from sklearn.base import BaseEstimator, TransformerMixin

class Log1pTransformer(TransformerMixin, BaseEstimator):
    """Stateless transformer that applies ``np.log1p`` element-wise.

    BaseEstimator is inherited alongside TransformerMixin so the object also
    has get_params()/set_params(); scikit-learn relies on those when it
    clones an estimator (for example inside cross-validation or a grid
    search). TransformerMixin contributes fit_transform().
    """

    def fit(self, X, y=None):
        """Learn nothing from the data; return self so calls can be chained."""
        return self

    def transform(self, X):
        """Return log(1 + X), computed element-wise with ``np.log1p``."""
        return np.log1p(X)
    
logger_custom = Log1pTransformer()
# fit_transform() is not written in the class body; it is inherited from TransformerMixin.
X_train_logger_custom = logger_custom.fit_transform(X_train)

print(X_train_logger_custom)

      attendance
1184    5.786897
1320    6.559615
1564    5.739793
1282    7.601402
1233    8.188967
...          ...
1251    4.110874
1511    5.525453
37      3.433987
1362    6.559615
1298    8.523374

[396 rows x 1 columns]
