# Automated ML Pipeline Generator using TPOT in Python
---
<br>

#### To install :
    pip install tpot

#### Dependencies :
- scikit learn
- numpy 

In [1]:
# import libraries
import numpy as np
import pandas as pd

# import Machine Learning libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB


In [2]:
data_url = "https://raw.githubusercontent.com/20b2122/AutoML-using-TPOT-in-python/main/Voice/voice.csv"

In [3]:
df = pd.read_csv(data_url)

In [4]:
# Species to Numerical
output = {value:index for index,value in enumerate(df['label'].unique())}
print(output)

{'male': 0, 'female': 1}


In [5]:
df.head()

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,...,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx,label
0,0.059781,0.064241,0.032027,0.015071,0.090193,0.075122,12.863462,274.402906,0.893369,0.491918,...,0.059781,0.084279,0.015702,0.275862,0.007812,0.007812,0.007812,0.0,0.0,male
1,0.066009,0.06731,0.040229,0.019414,0.092666,0.073252,22.423285,634.613855,0.892193,0.513724,...,0.066009,0.107937,0.015826,0.25,0.009014,0.007812,0.054688,0.046875,0.052632,male
2,0.077316,0.083829,0.036718,0.008701,0.131908,0.123207,30.757155,1024.927705,0.846389,0.478905,...,0.077316,0.098706,0.015656,0.271186,0.00799,0.007812,0.015625,0.007812,0.046512,male
3,0.151228,0.072111,0.158011,0.096582,0.207955,0.111374,1.232831,4.177296,0.963322,0.727232,...,0.151228,0.088965,0.017798,0.25,0.201497,0.007812,0.5625,0.554688,0.247119,male
4,0.13512,0.079146,0.124656,0.07872,0.206045,0.127325,1.101174,4.333713,0.971955,0.783568,...,0.13512,0.106398,0.016931,0.266667,0.712812,0.007812,5.484375,5.476562,0.208274,male


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3168 entries, 0 to 3167
Data columns (total 21 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   meanfreq  3168 non-null   float64
 1   sd        3168 non-null   float64
 2   median    3168 non-null   float64
 3   Q25       3168 non-null   float64
 4   Q75       3168 non-null   float64
 5   IQR       3168 non-null   float64
 6   skew      3168 non-null   float64
 7   kurt      3168 non-null   float64
 8   sp.ent    3168 non-null   float64
 9   sfm       3168 non-null   float64
 10  mode      3168 non-null   float64
 11  centroid  3168 non-null   float64
 12  meanfun   3168 non-null   float64
 13  minfun    3168 non-null   float64
 14  maxfun    3168 non-null   float64
 15  meandom   3168 non-null   float64
 16  mindom    3168 non-null   float64
 17  maxdom    3168 non-null   float64
 18  dfrange   3168 non-null   float64
 19  modindx   3168 non-null   float64
 20  label     3168 non-null   obje

In [7]:
# Checking for missing
df.isnull().sum()

meanfreq    0
sd          0
median      0
Q25         0
Q75         0
IQR         0
skew        0
kurt        0
sp.ent      0
sfm         0
mode        0
centroid    0
meanfun     0
minfun      0
maxfun      0
meandom     0
mindom      0
maxdom      0
dfrange     0
modindx     0
label       0
dtype: int64

In [8]:
# convert categorical (stories, driveway, recroom, fullbase, gashw, airco, prefarea) to Numerical

df['label'] = df['label'].map({'female': 1, 'male': 0})

In [9]:
df.head()

Unnamed: 0,meanfreq,sd,median,Q25,Q75,IQR,skew,kurt,sp.ent,sfm,...,centroid,meanfun,minfun,maxfun,meandom,mindom,maxdom,dfrange,modindx,label
0,0.059781,0.064241,0.032027,0.015071,0.090193,0.075122,12.863462,274.402906,0.893369,0.491918,...,0.059781,0.084279,0.015702,0.275862,0.007812,0.007812,0.007812,0.0,0.0,0
1,0.066009,0.06731,0.040229,0.019414,0.092666,0.073252,22.423285,634.613855,0.892193,0.513724,...,0.066009,0.107937,0.015826,0.25,0.009014,0.007812,0.054688,0.046875,0.052632,0
2,0.077316,0.083829,0.036718,0.008701,0.131908,0.123207,30.757155,1024.927705,0.846389,0.478905,...,0.077316,0.098706,0.015656,0.271186,0.00799,0.007812,0.015625,0.007812,0.046512,0
3,0.151228,0.072111,0.158011,0.096582,0.207955,0.111374,1.232831,4.177296,0.963322,0.727232,...,0.151228,0.088965,0.017798,0.25,0.201497,0.007812,0.5625,0.554688,0.247119,0
4,0.13512,0.079146,0.124656,0.07872,0.206045,0.127325,1.101174,4.333713,0.971955,0.783568,...,0.13512,0.106398,0.016931,0.266667,0.712812,0.007812,5.484375,5.476562,0.208274,0


In [10]:
x = df.iloc[:,:20] # input
y = df.iloc[:,20:] # output - label

---

### Individual Algorithm - find the mean of the algorithm

In [11]:
from sklearn.model_selection import cross_val_score

In [12]:
# Logistic Regression
cv_scores = cross_val_score(LogisticRegression(), x, y, cv=10)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/

In [13]:
print("cross-validation values:\n", cv_scores)
print("Mean of cross-validation:", np.mean(cv_scores))

cross-validation values:
 [0.65930599 0.83911672 0.79495268 0.86435331 0.78233438 0.93375394
 0.96214511 0.96214511 0.92088608 0.88607595]
Mean of cross-validation: 0.8605069280836959


In [14]:
# random forest classifier
rf_cv_scores = cross_val_score(RandomForestClassifier(), x, y, cv=10)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


In [15]:
print("Random forest cross-validation values:\n", rf_cv_scores)
print("Mean of random forest cross-validation:", np.mean(rf_cv_scores))

Random forest cross-validation values:
 [0.93375394 0.94006309 0.9873817  0.95583596 0.96529968 0.99369085
 0.99369085 0.9873817  0.92405063 0.98734177]
Mean of random forest cross-validation: 0.9668490196861399


In [16]:
rf_cv_scores2 = cross_val_score(RandomForestClassifier(n_estimators=1000,max_depth=2),x,y,cv=10)

  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)
  estimator.fit(X_train, y_train, **fit_params)


In [17]:
print("Random forest cross-validation values:\n", rf_cv_scores2)
print("Mean of random forest cross-validation:", np.mean(rf_cv_scores2))

Random forest cross-validation values:
 [0.71608833 0.92744479 0.91167192 0.92429022 0.96214511 0.98422713
 0.99684543 0.99369085 0.89240506 0.95886076]
Mean of random forest cross-validation: 0.9267669608273769


___

In [18]:
import tpot



In [19]:
# to check the methods and attributes available at TPOT
dir(tpot)

['TPOTClassifier',
 'TPOTRegressor',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '__version__',
 '_version',
 'base',
 'builtins',
 'config',
 'decorators',
 'driver',
 'export_utils',
 'gp_deap',
 'gp_types',
 'main',
 'metrics',
 'operator_utils',
 'tpot']

In [20]:
# Split in train and test
x_train,x_test,y_train,y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [21]:
from tpot import TPOTClassifier

# Init
tpot = TPOTClassifier(generations=5,verbosity=2)

In [22]:
# Fit data
tpot.fit(x_train,y_train)

  from pandas import MultiIndex, Int64Index
  y = column_or_1d(y, warn=True)


                                                                                
Generation 1 - Current best internal CV score: 0.9828594960649136
                                                                              
Generation 2 - Current best internal CV score: 0.9828594960649136
                                                                                
Generation 3 - Current best internal CV score: 0.9828594960649136
                                                                              
Generation 4 - Current best internal CV score: 0.9828594960649136
                                                                              
Generation 5 - Current best internal CV score: 0.984210847416265
                                                                              
Best pipeline: ExtraTreesClassifier(input_matrix, bootstrap=False, criterion=entropy, max_features=0.7500000000000001, min_samples_leaf=1, min_samples_split=3, n_estimators=100)


In [23]:
tpot.score(x_test,y_test)

  y = column_or_1d(y, warn=True)


0.9810725552050473

In [24]:
# Export the result
tpot.export('TPOTClassifier_ml_pipeline.py')

In [25]:
# Predictions
ex = np.array([0.059781,0.064241,0.032027,0.015071,0.090193,0.075122,12.863462,274.402906,0.893369,0.491918,0,0.059781,0.084279,0.015702,0.275862,0.007812,0.007812,0.007812,0.000000,0.000000]).reshape(1,-1)

In [26]:
tpot.predict(ex)



array([0], dtype=int64)

In [27]:
output

{'male': 0, 'female': 1}