# Automated ML Pipeline Generator using TPOT in Python
---
<br>

#### To install :
    pip install tpot

#### Dependencies :
- scikit learn
- numpy 

In [1]:
# import libraries
import numpy as np
import pandas as pd

# import Machine Learning libraries
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB


In [2]:
data_url = "https://raw.githubusercontent.com/20b2122/AutoML-using-TPOT-in-python/main/Housing_Modified.csv"

In [3]:
df = pd.read_csv(data_url)

In [4]:
df.head()

Unnamed: 0,price,lotsize,bedrooms,bathrms,stories,driveway,recroom,fullbase,gashw,airco,garagepl,prefarea
0,42000.0,5850,3,1,two,yes,no,yes,no,no,1,no
1,38500.0,4000,2,1,one,yes,no,no,no,no,0,no
2,49500.0,3060,3,1,one,yes,no,no,no,no,0,no
3,60500.0,6650,3,1,two,yes,yes,no,no,no,0,no
4,61000.0,6360,2,1,one,yes,no,no,no,no,0,no


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 546 entries, 0 to 545
Data columns (total 12 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   price     546 non-null    float64
 1   lotsize   546 non-null    int64  
 2   bedrooms  546 non-null    int64  
 3   bathrms   546 non-null    int64  
 4   stories   546 non-null    object 
 5   driveway  546 non-null    object 
 6   recroom   546 non-null    object 
 7   fullbase  546 non-null    object 
 8   gashw     546 non-null    object 
 9   airco     546 non-null    object 
 10  garagepl  546 non-null    int64  
 11  prefarea  546 non-null    object 
dtypes: float64(1), int64(4), object(7)
memory usage: 51.3+ KB


In [6]:
# Checking for missing
df.isnull().sum()

price       0
lotsize     0
bedrooms    0
bathrms     0
stories     0
driveway    0
recroom     0
fullbase    0
gashw       0
airco       0
garagepl    0
prefarea    0
dtype: int64

In [7]:
# convert categorical (stories, driveway, recroom, fullbase, gashw, airco, prefarea) to Numerical

df['stories'] = df['stories'].map({'one': 0, 'two': 1, 'three': 2, 'four': 3})
df['driveway'] = df['driveway'].map({'yes': 1, 'no': 0})
df['recroom'] = df['recroom'].map({'yes': 1, 'no': 0})
df['fullbase'] = df['fullbase'].map({'yes': 1, 'no': 0})
df['gashw'] = df['gashw'].map({'yes': 1, 'no': 0})
df['airco'] = df['airco'].map({'yes': 1, 'no': 0})
df['prefarea'] = df['prefarea'].map({'yes': 1, 'no': 0})

In [8]:
df.head()

Unnamed: 0,price,lotsize,bedrooms,bathrms,stories,driveway,recroom,fullbase,gashw,airco,garagepl,prefarea
0,42000.0,5850,3,1,1,1,0,1,0,0,1,0
1,38500.0,4000,2,1,0,1,0,0,0,0,0,0
2,49500.0,3060,3,1,0,1,0,0,0,0,0,0
3,60500.0,6650,3,1,1,1,1,0,0,0,0,0
4,61000.0,6360,2,1,0,1,0,0,0,0,0,0


In [9]:
x = df.iloc[:,1:] # input
y = df.iloc[:,0] # output - price

In [10]:
from sklearn.model_selection import cross_val_score

---

### Individual Algorithm - find the mean of the algorithm

In [11]:
# Logistic Regression
cv_scores = cross_val_score(LogisticRegression(), x, y, cv=10)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [12]:
print("cross-validation values:\n", cv_scores)

print("Mean of cross-validation:", np.mean(cv_scores))

cross-validation values:
 [0.05454545 0.03636364 0.03636364 0.03636364 0.03636364 0.03636364
 0.03703704 0.05555556 0.03703704 0.        ]
Mean of cross-validation: 0.0365993265993266


In [13]:
# random forest classifier
rf_cv_scores = cross_val_score(RandomForestClassifier(), x, y, cv=10)



In [14]:
print("Random forest cross-validation values:\n", rf_cv_scores)
print("Mean of random forest cross-validation:", np.mean(rf_cv_scores))

Random forest cross-validation values:
 [0.         0.         0.05454545 0.01818182 0.03636364 0.
 0.05555556 0.01851852 0.01851852 0.        ]
Mean of random forest cross-validation: 0.020168350168350165


In [15]:
# 
rf_cv_scores2 = cross_val_score(RandomForestClassifier(n_estimators=1000,max_depth=2),x,y,cv=10)



In [16]:
print("Random forest cross-validation values:\n", rf_cv_scores2)
print("Mean of random forest cross-validation:", np.mean(rf_cv_scores2))

Random forest cross-validation values:
 [0.03636364 0.03636364 0.03636364 0.03636364 0.01818182 0.05454545
 0.05555556 0.03703704 0.01851852 0.05555556]
Mean of random forest cross-validation: 0.03848484848484849


___

In [17]:
import tpot



In [18]:
# to check the methods and attributes available at TPOT
dir(tpot)

['TPOTClassifier',
 'TPOTRegressor',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '__version__',
 '_version',
 'base',
 'builtins',
 'config',
 'decorators',
 'driver',
 'export_utils',
 'gp_deap',
 'gp_types',
 'main',
 'metrics',
 'operator_utils',
 'tpot']

In [19]:
# Split in train and test
x_train,x_test,y_train,y_test = train_test_split(x, y, test_size=0.3, random_state=42)

In [22]:
from tpot import TPOTClassifier

# Init
tpot = TPOTClassifier(generations=5,verbosity=2)

In [None]:
# Fit data
tpot.fit(x_train,y_train)

![alternatvie text](https://raw.githubusercontent.com/20b2122/AutoML-using-TPOT-in-python/main/TPOTClassifier-results.png)

In [None]:
# Export the result
tpot.export('tpot_ml_pipeline.py')