<a href="https://colab.research.google.com/github/CodeJaymes/Machine-Learn/blob/main/Automating_repetitive_tasks_and_pipelining.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

This is my attempt at writing a reusable pipeline for quickly processing data.

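The steps are as follows: extract the zipped dataset, load it into a pandas DataFrame, and build a scikit-learn pipeline that one-hot encodes the categorical features, handles missing values, scales the features, and fits a model.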

In [2]:
#Importing Necessary Libraries for Data Manipulation 
import numpy as np     #Numpy for numerical and array computation
import pandas as pd     #Pandas for dataframe and file management
import seaborn as sns    #Seaborn  for plots 
import matplotlib.pyplot as plt  #matplotlib  for plots 
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,accuracy_score, mean_squared_error

%matplotlib inline             

In [3]:
# The dataset arrives as a zip archive, so its contents are unpacked before loading.

from zipfile import ZipFile

file = '/content/archive (2).zip'

with ZipFile(file) as archive:  # read mode is ZipFile's default
    print('Extracting ------')
    archive.extractall()  # no target path given: members land in the current working directory
    print('Done.')

Extracting ------
Done.


In [4]:
# Read the extracted CSV file into a pandas DataFrame.

df = pd.read_csv('/content/heart.csv')

# Preview the first five rows of the dataset.
df.head(5)

Unnamed: 0,Age,Sex,ChestPainType,RestingBP,Cholesterol,FastingBS,RestingECG,MaxHR,ExerciseAngina,Oldpeak,ST_Slope,HeartDisease
0,40,M,ATA,140,289,0,Normal,172,N,0.0,Up,0
1,49,F,NAP,160,180,0,Normal,156,N,1.0,Flat,1
2,37,M,ATA,130,283,0,ST,98,N,0.0,Up,0
3,48,F,ASY,138,214,0,Normal,108,Y,1.5,Flat,1
4,54,M,NAP,150,195,0,Normal,122,N,0.0,Up,0


In [None]:
# Build, fit and evaluate a preprocessing + model pipeline on the heart data.
#
# Pipeline stages:
#   A - one-hot encode the categorical (non-numeric) features
#   B - impute missing numeric values with the column median
#   C - standard-scale the numeric features (after imputation)
#   D - fit the classifier
#
# The `HeartDisease` column shown in the preview above holds 0/1 labels, so the
# task is treated as binary classification (LogisticRegression + accuracy).

from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression, LogisticRegression

TARGET = 'HeartDisease'

# Separate the features from the label.
# NOTE(review): 'Unnamed: 0' looks like a saved row index rather than a real
# feature; it is dropped when present (errors='ignore' makes this a no-op
# otherwise) -- confirm against the raw CSV.
x = df.drop(columns=[TARGET, 'Unnamed: 0'], errors='ignore')
y = df[TARGET]

# Pick columns by dtype instead of by hard-coded position so that the encoder
# only ever receives non-numeric columns and the scaler only numeric ones.
categorical_cols = x.select_dtypes(exclude='number').columns.tolist()
numeric_cols = x.select_dtypes(include='number').columns.tolist()

# Numeric branch: impute first, then scale, so the scaler sees complete data.
numeric_steps = Pipeline([
    ('missing-value-handler', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# ColumnTransformer expects a list of (name, transformer, columns) tuples.
# handle_unknown='ignore' keeps predict() from failing if the test split holds
# a category value that was absent from the training split.
c_transform = ColumnTransformer(
    [
        ('hot-encode', OneHotEncoder(handle_unknown='ignore'), categorical_cols),
        ('numeric', numeric_steps, numeric_cols),
    ],
    remainder='passthrough',
)

# Specifying the model to be used (logistic regression for the binary label).
classifier = LogisticRegression(max_iter=1000)

# Pipeline steps are (name, estimator) pairs; per-column routing lives in
# the ColumnTransformer above, not in the step tuples.
steps = [
    ('encode', c_transform),
    ('classifier', classifier),
]

pipe = Pipeline(steps)

# Split the dataset into training and test sets, keeping the class balance
# the same in both (stratify=y).
x_train, x_test, y_train, y_test = train_test_split(
    x, y, random_state=42, stratify=y, test_size=0.2
)

# Train the whole pipeline on the training data (features AND labels).
pipe.fit(x_train, y_train)

# Predictions on the data the model was trained on, to gauge how well it fits.
y_compare = pipe.predict(x_train)
train_accuracy = accuracy_score(y_train, y_compare)

# Predictions on the held-out test data.
y_predict = pipe.predict(x_test)
results = y_predict  # alias: both names refer to the test-set predictions

# Metrics take (y_true, y_pred). Accuracy is the primary metric here; R^2 on
# 0/1 labels is retained only because the original cell computed it.
score = r2_score(y_test, y_predict)
accuracy = accuracy_score(y_test, y_predict)

# Show the evaluation summary as the cell output.
pd.Series(
    {
        'train accuracy': train_accuracy,
        'test accuracy': accuracy,
        'test r2': score,
    }
)


