## Project 2: Using pipelines for Encoding, Scaling, Modeling and Scoring
### Note: ml_pipelines 3 test scaler in line.ipynb is very similar.

In [258]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler, add_dummy_feature, LabelEncoder, OrdinalEncoder
from sklearn.compose import ColumnTransformer

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score, balanced_accuracy_score




In [259]:
# Look at the first few rows of the data

# data is from https://data.world/uci/auto-mpg
# Import the data
df = pd.read_csv("auto_data_2.csv")

df = df.replace("?", pd.NA)
df = df.reset_index(drop=True)

# print("df head...")
# display(df.head(10))

# print("/ndf info...\n")
# display(df.info())

# print("/Unshown columns from above info...\n")
# display(df.loc[:,'fuel-system': 'highway-mpg'].info())

print("/ndf SHAPE...\n")
display(df.shape)

# print("/ndf COLUMNS (use in Excel to specify encoding)...\n")
# display(df.columns)

# print("/ndf DESCRIBE...\n")
# display(df.describe())

# display(df.dtypes)





/ndf SHAPE...



(205, 26)

In [260]:
for the_col in df.columns:
    print(f"\nColumn Name: {the_col} and the DType is: {df[the_col].dtype}")
    print(f"value counts are: \n{df[the_col].value_counts()}\n")



Column Name: symboling and the DType is: int64
value counts are: 
symboling
 0    67
 1    54
 2    32
 3    27
-1    22
-2     3
Name: count, dtype: int64


Column Name: normalized-losses and the DType is: object
value counts are: 
normalized-losses
161    11
91      8
150     7
128     6
134     6
104     6
95      5
102     5
103     5
74      5
85      5
168     5
94      5
65      5
106     4
122     4
148     4
118     4
93      4
101     3
125     3
137     3
154     3
83      3
115     3
119     2
87      2
194     2
197     2
108     2
89      2
164     2
158     2
145     2
192     2
188     2
81      2
110     2
113     2
129     2
153     2
107     1
78      1
186     1
231     1
77      1
98      1
121     1
90      1
142     1
256     1
Name: count, dtype: int64


Column Name: make and the DType is: object
value counts are: 
make
toyota           32
nissan           18
mazda            17
mitsubishi       13
honda            13
volkswagen       12
subaru           12
peu

### Now subset into X and Y
### map anything that can be numeric to numeric.

In [276]:
# attributes for X'set
X_df = df[['make','fuel-type','aspiration','num-of-doors','body-style','drive-wheels','engine-location','wheel-base','length','width','height','curb-weight','engine-type','num-of-cylinders','engine-size','fuel-system','bore','stroke','compression-ratio','horsepower','peak-rpm','city-mpg','highway-mpg','price']].copy()
# ,'body-style','drive-wheels','engine-location','wheel-base','length','width','height','curb-weight','engine-type','num-of-cylinders','engine-size','fuel-system','bore','stroke','compression-ratio','horsepower','peak-rpm','city-mpg','highway-mpg','price']]
# fix bore, stoke and HP. Make them float64'
X_df['bore'] = pd.to_numeric(X_df['bore'])
X_df['stroke'] = pd.to_numeric(X_df['stroke'])
X_df['horsepower'] = pd.to_numeric(X_df['horsepower'])
X_df['peak-rpm'] = pd.to_numeric(X_df['peak-rpm'])
X_df['num-of-doors'].fillna('four', inplace=True)
X_df['num-of-doors'] = X_df['num-of-doors'].map({'two':2,'four':4})
X_df['num-of-doors'] = pd.to_numeric(X_df['num-of-doors'])
X_df['num_cylinders'] = X_df['num-of-cylinders'].map({'two':2,'three':3,'four':4,'five':5,'six':6,'eight':8,'twelve':12})
X_df['num_cylinders'] = pd.to_numeric(X_df['num_cylinders'])
X_df.drop(columns = ['num-of-cylinders'], inplace=True)
X_df['price'] = pd.to_numeric(X_df['price'])
# had a bug issue with 'fuel-system'. Just dropping it
X_df = X_df.drop(columns = 'fuel-system').copy()
print('X_df head..\n')
# display(X_df.head(10))
y_df = df[['symboling']]
# print('\ny df is...\n')
display(X_df.head(10))
# display(f"the type of bore is: {type(X_df['make'][0])}")


display(f"X_df shape is {X_df.shape}")


X_df head..



Unnamed: 0,make,fuel-type,aspiration,num-of-doors,body-style,drive-wheels,engine-location,wheel-base,length,width,...,engine-size,bore,stroke,compression-ratio,horsepower,peak-rpm,city-mpg,highway-mpg,price,num_cylinders
0,alfa-romero,gas,std,2,convertible,rwd,front,88.6,168.8,64.1,...,130,3.47,2.68,9.0,111.0,5000.0,21,27,13495.0,4
1,alfa-romero,gas,std,2,convertible,rwd,front,88.6,168.8,64.1,...,130,3.47,2.68,9.0,111.0,5000.0,21,27,16500.0,4
2,alfa-romero,gas,std,2,hatchback,rwd,front,94.5,171.2,65.5,...,152,2.68,3.47,9.0,154.0,5000.0,19,26,16500.0,6
3,audi,gas,std,4,sedan,fwd,front,99.8,176.6,66.2,...,109,3.19,3.4,10.0,102.0,5500.0,24,30,13950.0,4
4,audi,gas,std,4,sedan,4wd,front,99.4,176.6,66.4,...,136,3.19,3.4,8.0,115.0,5500.0,18,22,17450.0,5
5,audi,gas,std,2,sedan,fwd,front,99.8,177.3,66.3,...,136,3.19,3.4,8.5,110.0,5500.0,19,25,15250.0,5
6,audi,gas,std,4,sedan,fwd,front,105.8,192.7,71.4,...,136,3.19,3.4,8.5,110.0,5500.0,19,25,17710.0,5
7,audi,gas,std,4,wagon,fwd,front,105.8,192.7,71.4,...,136,3.19,3.4,8.5,110.0,5500.0,19,25,18920.0,5
8,audi,gas,turbo,4,sedan,fwd,front,105.8,192.7,71.4,...,131,3.13,3.4,8.3,140.0,5500.0,17,20,23875.0,5
9,audi,gas,turbo,2,hatchback,4wd,front,99.5,178.2,67.9,...,131,3.13,3.4,7.0,160.0,5500.0,16,22,,5


'X_df shape is (205, 23)'

In [262]:
# lets do a simple mean imputer for bore, stroke and hoursepower (the rest of the columns do not have missing values)

for column in X_df.columns:
    # each of the statements below act on the entire column.
    if (X_df[column].isnull().any()) and  X_df[column].dtype != 'O':
        mean_value = X_df[column].mean()
        print(f"Column: {column:15} Missing values filled with: {mean_value:.6f}")
        X_df[column].fillna(mean_value, inplace=True)

X_df.info()

Column: bore            Missing values filled with: 3.329751
Column: stroke          Missing values filled with: 3.255423
Column: horsepower      Missing values filled with: 104.256158
Column: peak-rpm        Missing values filled with: 5125.369458
Column: price           Missing values filled with: 13207.129353
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 205 entries, 0 to 204
Data columns (total 23 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   make               205 non-null    object 
 1   fuel-type          205 non-null    object 
 2   aspiration         205 non-null    object 
 3   num-of-doors       205 non-null    int64  
 4   body-style         205 non-null    object 
 5   drive-wheels       205 non-null    object 
 6   engine-location    205 non-null    object 
 7   wheel-base         205 non-null    float64
 8   length             205 non-null    float64
 9   width              205 non-null    float64
 10  

In [263]:
# sanity check the object columns. should be no numeric
char_columns = []
for the_col in X_df.columns:
    if X_df[the_col].dtype == 'O':
        print(f"Col: '{the_col:15}' type is 'O', first val is: {X_df[the_col][0]}")
        char_columns.append(the_col)

print(f"the char_columns are: \n{char_columns}")

# X_df['fuel-system'].value_counts()
X_df['make'].value_counts()

Col: 'make           ' type is 'O', first val is: alfa-romero
Col: 'fuel-type      ' type is 'O', first val is: gas
Col: 'aspiration     ' type is 'O', first val is: std
Col: 'body-style     ' type is 'O', first val is: convertible
Col: 'drive-wheels   ' type is 'O', first val is: rwd
Col: 'engine-location' type is 'O', first val is: front
Col: 'engine-type    ' type is 'O', first val is: dohc
the char_columns are: 
['make', 'fuel-type', 'aspiration', 'body-style', 'drive-wheels', 'engine-location', 'engine-type']


make
toyota           32
nissan           18
mazda            17
mitsubishi       13
honda            13
volkswagen       12
subaru           12
peugot           11
volvo            11
dodge             9
mercedes-benz     8
bmw               8
audi              7
plymouth          7
saab              6
porsche           5
isuzu             4
jaguar            3
chevrolet         3
alfa-romero       3
renault           2
mercury           1
Name: count, dtype: int64

In [264]:
# Column specification. Can have mix of OneHot, numeric, Ordinal and LabelEncoded.  Empty list is not a problem for ML pipeline.
# OneHotEncoder, add_dummy_feature, LabelEncoder, OrdinalEncoder
# need to add 'fuel-system' back
OneHotEncoder_features = ['make','fuel-type','aspiration','body-style','drive-wheels','engine-location','engine-type']
numeric_features = ['num-of-doors','wheel-base','length','width','height','curb-weight','num_cylinders','engine-size','bore','stroke','compression-ratio','horsepower','peak-rpm','city-mpg','highway-mpg','price']
Ordinal_features = []
LabelEncoder_features = []

# change to have this read in excel file and fill these out.

In [265]:
preprocessor = ColumnTransformer(
    transformers=[
        ('OHE', OneHotEncoder(), OneHotEncoder_features),  # No effect with empty list
        ('LabEnc', LabelEncoder(), LabelEncoder_features),
        ('Ord', OrdinalEncoder(), Ordinal_features),
        ('num', 'passthrough', numeric_features)
    ])


In [266]:
import warnings
warnings.filterwarnings("ignore")

# Split the dataset into training and testing sets

X_train, X_test, y_train, y_test = train_test_split(X_df, y_df, random_state=42)

# Define individual classifiers
clf1 = LogisticRegression(random_state=42)
clf2 = RandomForestClassifier(random_state=42, max_depth=9)
clf3 = SVC(probability=True, random_state=42)
clf4 = GradientBoostingClassifier(random_state=42)
clf5 = AdaBoostClassifier(random_state=42)

# Create a VotingClassifier with soft voting
voting_clf = VotingClassifier(estimators=[
    ('lr', clf1), ('rf', clf2), ('svc', clf3), ('GBC', clf4),('ABC',clf5)],
    voting='soft')

# Define the pipeline
pipeline = Pipeline(steps=[
    ('preprocessing', preprocessor),
    ('scaling', StandardScaler()),
    ('classifier', VotingClassifier(estimators=[
        ('lr', clf1), ('rf', clf2), ('svc', clf3), ('GBC', clf4),('ABC',clf5)],
        voting='soft'))
])

# Fit the voting classifier
pipeline.fit(X_train, y_train)

# Make predictions and evaluate
y_predictions = pipeline.predict(X_test)
print(f"Voting Classifier Accuracy: {(accuracy_score(y_test, y_predictions)*100):.2f}")

X_train_transformed = pipeline.transform(X_train)
X_test_transformed = pipeline.transform(X_test)

# Evaluate each individual classifier
# (clf1, clf2, clf3, clf4, clf5, voting_clf)
for clf in (clf1, clf2, clf3, clf4, clf5, voting_clf):
    clf.fit(X_train_transformed, y_train)
    y_test_predictions = clf.predict(X_test_transformed)
    print(f"{clf.__class__.__name__:30}          Test Accuracy: {(accuracy_score(y_test, y_test_predictions)*100):.2f}    Test BALANCED Accuracy: {(balanced_accuracy_score(y_test, y_test_predictions)*100):.2f}")
    # balanced_accuracy_score
    # print(f"{clf.__class__.__name__:30} Test BALANCED Accuracy: {(balanced_accuracy_score(y_test, y_test_predictions)*100):.2f}")

Voting Classifier Accuracy: 82.69
LogisticRegression                      Test Accuracy: 80.77    Test BALANCED Accuracy: 87.07
RandomForestClassifier                  Test Accuracy: 78.85    Test BALANCED Accuracy: 84.29
SVC                                     Test Accuracy: 78.85    Test BALANCED Accuracy: 86.34
GradientBoostingClassifier              Test Accuracy: 84.62    Test BALANCED Accuracy: 88.04
AdaBoostClassifier                      Test Accuracy: 75.00    Test BALANCED Accuracy: 82.84
VotingClassifier                        Test Accuracy: 76.92    Test BALANCED Accuracy: 83.56


In [277]:
# X_train_transformed

feature_names = preprocessor.get_feature_names_out()
print(f"len of feature names is: {len(feature_names)}")
print(f"\nFeature names:\n{feature_names}")
print(f"shape of x train trans is: {np.shape(X_train_transformed)}")
X_train_transformed_df = pd.DataFrame(X_train_transformed)
X_train_transformed_df.head()

print(f"shape of X_train            : {np.shape(X_train)}")
print(f"shape of X_train transformed: {np.shape(X_train_transformed)}")

len of feature names is: 59

Feature names:
['OHE__make_alfa-romero' 'OHE__make_audi' 'OHE__make_bmw'
 'OHE__make_chevrolet' 'OHE__make_dodge' 'OHE__make_honda'
 'OHE__make_isuzu' 'OHE__make_jaguar' 'OHE__make_mazda'
 'OHE__make_mercedes-benz' 'OHE__make_mercury' 'OHE__make_mitsubishi'
 'OHE__make_nissan' 'OHE__make_peugot' 'OHE__make_plymouth'
 'OHE__make_porsche' 'OHE__make_renault' 'OHE__make_saab'
 'OHE__make_subaru' 'OHE__make_toyota' 'OHE__make_volkswagen'
 'OHE__make_volvo' 'OHE__fuel-type_diesel' 'OHE__fuel-type_gas'
 'OHE__aspiration_std' 'OHE__aspiration_turbo'
 'OHE__body-style_convertible' 'OHE__body-style_hardtop'
 'OHE__body-style_hatchback' 'OHE__body-style_sedan'
 'OHE__body-style_wagon' 'OHE__drive-wheels_4wd' 'OHE__drive-wheels_fwd'
 'OHE__drive-wheels_rwd' 'OHE__engine-location_front'
 'OHE__engine-location_rear' 'OHE__engine-type_dohc'
 'OHE__engine-type_dohcv' 'OHE__engine-type_l' 'OHE__engine-type_ohc'
 'OHE__engine-type_ohcf' 'OHE__engine-type_ohcv' 'OHE__engine-

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,20,21,22,23,24,25,26,27,28,29
0,5.4e-05,0.000456,0.036028,0.897578,0.065883,8.18097e-07,0.0,0.0,0.07175,0.901583,...,0.000862,0.997639,0.00131,0.000132,1.750215e-15,5.821766e-08,0.249306,0.260879,0.257794,0.2320211
1,0.000501,0.841189,0.127331,0.030179,0.000795,5.507047e-06,0.004044,0.815418,0.13992,0.013554,...,0.003045,0.000918,0.000221,7e-06,3.620258e-13,0.2481493,0.254786,0.249369,0.247696,4.534621e-10
2,7.3e-05,0.00251,0.241227,0.7557,0.000459,3.183886e-05,0.0,0.014353,0.138471,0.811176,...,0.012189,0.987226,0.000489,6.7e-05,3.620258e-13,0.2481493,0.254786,0.249369,0.247696,4.534621e-10
3,0.000695,0.00011,0.000847,0.239843,0.755726,0.002778599,0.00037,0.020016,0.063895,0.131187,...,0.000802,0.003814,0.995062,0.000252,1.750215e-15,5.821766e-08,0.249306,0.260879,0.257794,0.2320211
4,0.000793,0.000964,0.004001,0.040725,0.952876,0.0006396017,0.0,0.006094,0.003319,0.034293,...,0.001194,0.002102,0.995531,0.000636,1.750215e-15,5.821766e-08,0.249306,0.260879,0.257794,0.2320211
