# Machine Learning

## PipeLines

In [2]:
# Importing Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score


In [3]:
# Load the Dataset
df = sns.load_dataset("penguins")
print(df.head())

  species     island  bill_length_mm  bill_depth_mm  flipper_length_mm  \
0  Adelie  Torgersen            39.1           18.7              181.0   
1  Adelie  Torgersen            39.5           17.4              186.0   
2  Adelie  Torgersen            40.3           18.0              195.0   
3  Adelie  Torgersen             NaN            NaN                NaN   
4  Adelie  Torgersen            36.7           19.3              193.0   

   body_mass_g     sex  
0       3750.0    Male  
1       3800.0  Female  
2       3250.0  Female  
3          NaN     NaN  
4       3450.0  Female  


In [4]:
# Getting info of our dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


In [5]:
# Let's check for missing values
print(df.isnull().sum().sort_values(ascending=False))

sex                  11
bill_depth_mm         2
bill_length_mm        2
flipper_length_mm     2
body_mass_g           2
island                0
species               0
dtype: int64


In [6]:
# Creating Pipleline for imputing missing values in numeric and categoric column.
numeric_column = ["bill_depth_mm","bill_length_mm","flipper_length_mm","body_mass_g"]
categoric_column = ["sex"]
numeric_pipeline = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="mean"))
])
categoric_pipeline = Pipeline(steps=[
    ("Imputer", SimpleImputer(strategy="most_frequent")),
    ("Encoder", OneHotEncoder())
])

Preprocessor = ColumnTransformer(
    transformers=[
        ("Numeric",numeric_pipeline,numeric_column),
        ("Categoric",categoric_pipeline, categoric_column)
    ]
)
# Selecting Feature and Target Variable
X = df.drop("species", axis=1)
y = df["species"]
# Splitting the dataset into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
# Create a pipeline with the preprocessor and RandomForestClassifier
pipeline = Pipeline(steps=[
    ('Preprocessor', Preprocessor),
    ('Classifier', RandomForestClassifier(random_state=42))
])

# Fit the pipeline on the training data
pipeline.fit(X_train, y_train)

# Make predictions on the test data
y_pred = pipeline.predict(X_test)

# Calculate accuracy score
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

print("Actual vs Predicted values (first 5):")
for actual, pred in zip(y_test[:10], y_pred[:5]):
    print(f"Actual: {actual}, Predicted: {pred}")

Accuracy: 0.9710144927536232
Actual vs Predicted values (first 5):
Actual: Chinstrap, Predicted: Chinstrap
Actual: Chinstrap, Predicted: Chinstrap
Actual: Gentoo, Predicted: Gentoo
Actual: Chinstrap, Predicted: Chinstrap
Actual: Gentoo, Predicted: Gentoo


# Hyper Parameter Tuning

In [7]:
# Importing Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder,OneHotEncoder
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import r2_score


In [8]:
# Load the Dataset
df = sns.load_dataset("Diamonds")
print(df.head())

   carat      cut color clarity  depth  table  price     x     y     z
0   0.23    Ideal     E     SI2   61.5   55.0    326  3.95  3.98  2.43
1   0.21  Premium     E     SI1   59.8   61.0    326  3.89  3.84  2.31
2   0.23     Good     E     VS1   56.9   65.0    327  4.05  4.07  2.31
3   0.29  Premium     I     VS2   62.4   58.0    334  4.20  4.23  2.63
4   0.31     Good     J     SI2   63.3   58.0    335  4.34  4.35  2.75


In [9]:
# Getting the basic info of our dataset
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   carat    53940 non-null  float64
 1   cut      53940 non-null  object 
 2   color    53940 non-null  object 
 3   clarity  53940 non-null  object 
 4   depth    53940 non-null  float64
 5   table    53940 non-null  float64
 6   price    53940 non-null  int64  
 7   x        53940 non-null  float64
 8   y        53940 non-null  float64
 9   z        53940 non-null  float64
dtypes: float64(6), int64(1), object(3)
memory usage: 4.1+ MB


In [10]:
# Let's check for missing values in our dataset
print(df.isnull().sum().sort_values(ascending=False))

carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64


In [11]:
print(df.head())

   carat      cut color clarity  depth  table  price     x     y     z
0   0.23    Ideal     E     SI2   61.5   55.0    326  3.95  3.98  2.43
1   0.21  Premium     E     SI1   59.8   61.0    326  3.89  3.84  2.31
2   0.23     Good     E     VS1   56.9   65.0    327  4.05  4.07  2.31
3   0.29  Premium     I     VS2   62.4   58.0    334  4.20  4.23  2.63
4   0.31     Good     J     SI2   63.3   58.0    335  4.34  4.35  2.75


In [12]:
# Pipeline to encode the categoric features
categoric_columns = ["cut","color","clarity"]
categoric_pipelines = Pipeline(steps=[
    ("Encoder", OneHotEncoder())
])
Preprocessors = ColumnTransformer([
    ("Encoder", categoric_pipelines, categoric_columns)
])
# Now Pipeline for fitting model
pipelines = Pipeline(steps=[
    ("Preprocessor", Preprocessors),
    ("Model",RandomForestRegressor(random_state=42))
])
# Define the hyperparameters to tune
hyperparameters = {
    'Model__n_estimators': [100, 200],
    'Model__max_depth': [None, 10],
    'Model__min_samples_split': [2, 5]
}
# Selecting Features and Target
X = df.drop("price",axis=1)
y = df["price"]
# Splitting the data into train and test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Perform grid search cross-validation
grid_search = GridSearchCV(pipelines, hyperparameters, cv=3, n_jobs=-1 )
grid_search.fit(X_train, y_train)

# Get the best model
best_model = grid_search.best_estimator_

# Make predictions on the test data using the best model
y_pred = best_model.predict(X_test)

# Calculate accuracy score
R2_score = r2_score(y_test, y_pred)
print("R2_score:", R2_score)
# Print the best hyperparameters
print("Best Hyperparameters:", grid_search.best_params_)

R2_score: 0.08753701698317917
Best Hyperparameters: {'Model__max_depth': 10, 'Model__min_samples_split': 5, 'Model__n_estimators': 200}


# Selecting Best Model in Pipeline   

In [14]:
# Loading the Dataset
df = sns.load_dataset("diamonds")
print(df.head()) 

   carat      cut color clarity  depth  table  price     x     y     z
0   0.23    Ideal     E     SI2   61.5   55.0    326  3.95  3.98  2.43
1   0.21  Premium     E     SI1   59.8   61.0    326  3.89  3.84  2.31
2   0.23     Good     E     VS1   56.9   65.0    327  4.05  4.07  2.31
3   0.29  Premium     I     VS2   62.4   58.0    334  4.20  4.23  2.63
4   0.31     Good     J     SI2   63.3   58.0    335  4.34  4.35  2.75


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 53940 entries, 0 to 53939
Data columns (total 10 columns):
 #   Column   Non-Null Count  Dtype   
---  ------   --------------  -----   
 0   carat    53940 non-null  float64 
 1   cut      53940 non-null  category
 2   color    53940 non-null  category
 3   clarity  53940 non-null  category
 4   depth    53940 non-null  float64 
 5   table    53940 non-null  float64 
 6   price    53940 non-null  int64   
 7   x        53940 non-null  float64 
 8   y        53940 non-null  float64 
 9   z        53940 non-null  float64 
dtypes: category(3), float64(6), int64(1)
memory usage: 3.0 MB


In [16]:
# Check for missing value in our dataset
print(df.isnull().sum().sort_values(ascending=False))


carat      0
cut        0
color      0
clarity    0
depth      0
table      0
price      0
x          0
y          0
z          0
dtype: int64


In [17]:
print(df.head())

   carat      cut color clarity  depth  table  price     x     y     z
0   0.23    Ideal     E     SI2   61.5   55.0    326  3.95  3.98  2.43
1   0.21  Premium     E     SI1   59.8   61.0    326  3.89  3.84  2.31
2   0.23     Good     E     VS1   56.9   65.0    327  4.05  4.07  2.31
3   0.29  Premium     I     VS2   62.4   58.0    334  4.20  4.23  2.63
4   0.31     Good     J     SI2   63.3   58.0    335  4.34  4.35  2.75


In [32]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
from sklearn.metrics import r2_score
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression

categoric_columns = ["cut", "color", "clarity"]
preprocessor = ColumnTransformer([
    ("cat", OneHotEncoder(handle_unknown="ignore"), categoric_columns)
], remainder="passthrough")

regression_models = [
    ("Support Vector Machine", SVR()),
    ("RandomForestRegressor", RandomForestRegressor(random_state=42)),
    ("LinearRegression", LinearRegression()),
    ("Gradient Boosting Regressor", GradientBoostingRegressor(random_state=42))
]

best_model = None
best_score = float('-inf')

for model_name, model in regression_models:
    model_pipeline = Pipeline([
        ("preprocessor", preprocessor),
        ("model", model)
    ])
    scores = cross_val_score(model_pipeline, X_train, y_train, cv=2, scoring='r2')
    mean_score = scores.mean()
    model_pipeline.fit(X_train, y_train)
    y_pred = model_pipeline.predict(X_test)
    test_score = r2_score(y_test, y_pred)
    print("Model:", model_name)
    print("Cross-validation R2:", mean_score)
    print("Test R2:", test_score)
    print()
    if test_score > best_score:
        best_score = test_score
        best_model = model_pipeline

print("Best Model:", best_model)

Model: Support Vector Machine
Cross-validation R2: -0.13908713235557357
Test R2: -0.12522005263823188

Model: RandomForestRegressor
Cross-validation R2: 0.9791809466772817
Test R2: 0.9809753922281078

Model: LinearRegression
Cross-validation R2: 0.9197618238075258
Test R2: 0.9189331350419387

Model: Gradient Boosting Regressor
Cross-validation R2: 0.9654103280516799
Test R2: 0.9674473884071069

Best Model: Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('cat',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['cut', 'color',
                                                   'clarity'])])),
                ('model', RandomForestRegressor(random_state=42))])
