In [1]:
import pandas as pd 
import numpy as np
import plotly.express as px
import seaborn as sns
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder
from category_encoders import BinaryEncoder
from sklearn.preprocessing import RobustScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, AdaBoostRegressor, GradientBoostingRegressor
from sklearn.linear_model import LinearRegression,Lasso,Ridge
from sklearn.preprocessing import PolynomialFeatures
from xgboost import XGBRegressor
from sklearn.model_selection import cross_validate, train_test_split
from sklearn.metrics import r2_score,mean_squared_error,make_scorer
from sklearn.feature_selection import SequentialFeatureSelector, SelectKBest, f_regression,RFE
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
import streamlit as st
import joblib as jb

In [2]:
# Load the raw spreadsheet into a DataFrame
data_file = 'Fire Pump Flow.xlsx'
df = pd.read_excel(data_file)

# EDA

In [3]:
# Preview a random subset of rows; seeded so the preview is the same on every run
df.sample(20, random_state=42)

Unnamed: 0,Hazard,Building Height (m),Pipe Length (m),D For Longest Pipe (in),Sprinkler Orientaion,Sprinkler k-factor,Hose Cabinet,System Type,Pump Flow (GPM)
14,Light,20,50,4,Upright,5.6,No FHC,Wet,200
2,Ordinary 2,30,100,3,Pendent,5.6,inside >1,Wet,400
34,Ordinary 2,29,124,6,SideWall,11.2,inside >1,Wet,750
37,Extra 1,34,104,6,Upright,16.8,No FHC,Wet,750
12,Ordinary 1,26,99,4,Pendent,8.0,inside_and_outside,Wet,750
38,Extra 2,24,108,4,Upright,14.0,No FHC,Wet,1000
25,Ordinary 1,20,102,6,SideWall,14.0,inside_and_outside,Wet,1250
27,Extra 2,25,97,6,Upright,16.8,inside_and_outside,Wet,1500
29,Light,32,68,6,Pendent,5.6,inside_and_outside,Wet,300
16,Ordinary 1,25,80,4,Pendent,8.0,No FHC,Wet,450


In [4]:
# Inspect each column's dtype and non-null count (reveals text columns and missing values)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 9 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Hazard                   40 non-null     object 
 1   Building Height (m)      40 non-null     int64  
 2   Pipe Length (m)          40 non-null     int64  
 3   D For Longest Pipe (in)  40 non-null     int64  
 4   Sprinkler Orientaion     40 non-null     object 
 5   Sprinkler k-factor       40 non-null     float64
 6   Hose Cabinet             40 non-null     object 
 7   System Type              40 non-null     object 
 8   Pump Flow (GPM)          40 non-null     int64  
dtypes: float64(1), int64(4), object(4)
memory usage: 2.9+ KB


In [5]:
# List the column names exactly as read from the spreadsheet
df.columns

Index(['Hazard', 'Building Height (m)', 'Pipe Length (m)',
       'D For Longest Pipe (in)', 'Sprinkler Orientaion', 'Sprinkler k-factor',
       'Hose Cabinet', 'System Type', 'Pump Flow (GPM)'],
      dtype='object')

In [6]:
# Column names to inspect, taken from the frame itself so the list cannot drift
# from the data (a hard-coded copy breaks as soon as a column is dropped or renamed).
cols = df.columns.tolist()

In [7]:
# For every column: its name, the frequency of each distinct value, then a separator line
for column_name in cols:
    print(
        column_name,
        df[column_name].value_counts(),
        '*********************',
        sep='\n',
    )
    

Hazard
Ordinary 1    13
Light          9
Ordinary 2     7
Extra 2        6
Extra 1        5
Name: Hazard, dtype: int64
*********************
Building Height (m)
20    5
25    5
21    4
35    4
30    3
38    2
26    2
34    2
31    2
33    2
32    2
24    2
27    2
39    1
36    1
29    1
Name: Building Height (m), dtype: int64
*********************
Pipe Length (m)
100    3
150    3
80     2
96     2
74     2
68     2
102    2
50     2
99     2
136    2
88     2
95     2
64     1
108    1
104    1
125    1
147    1
124    1
60     1
79     1
93     1
97     1
132    1
160    1
142    1
113    1
Name: Pipe Length (m), dtype: int64
*********************
D For Longest Pipe (in)
4    14
6    12
3     8
8     5
2     1
Name: D For Longest Pipe (in), dtype: int64
*********************
Sprinkler Orientaion
Upright     15
Pendent     14
SideWall    11
Name: Sprinkler Orientaion, dtype: int64
*********************
Sprinkler k-factor
8.0     11
11.2    10
5.6      7
14.0     6
16.8     6
Name: Sp

## Cleaning Data

In [8]:
# Drop 'System Type' because it holds just one value, so it cannot help the model.
# A non-inplace drop with errors='ignore' keeps this cell safe to re-run after
# the column has already been removed.
df = df.drop(columns='System Type', errors='ignore')

## Feature Engineering

In [9]:
# Integer code for each hazard label (0-4, in the order listed)
Hazards_Categories = {
    label: code
    for code, label in enumerate(
        ["Light", "Ordinary 1", "Ordinary 2", "Extra 1", "Extra 2"]
    )
}

In [10]:
# Replace hazard labels with their integer codes.
# Guarded so that re-running the cell does not map the already-encoded integers
# to NaN, and so that a label missing from Hazards_Categories fails loudly
# instead of silently becoming NaN.
if not pd.api.types.is_numeric_dtype(df['Hazard']):
    hazard_codes = df['Hazard'].map(Hazards_Categories)
    if hazard_codes.isna().any():
        unknown_hazards = sorted(df.loc[hazard_codes.isna(), 'Hazard'].astype(str).unique())
        raise ValueError(f"Unmapped Hazard labels: {unknown_hazards}")
    df['Hazard'] = hazard_codes.astype('int64')

In [11]:
# Integer code for each 'Hose Cabinet' (FHC) label, numbered in the order listed
_fhc_labels = ("No FHC", "inside 1", "inside >1", "Only Outside", "inside_and_outside")
FHC_Categories = dict(zip(_fhc_labels, range(len(_fhc_labels))))

In [12]:
# Replace hose-cabinet labels with their integer codes.
# Guarded so that re-running the cell does not map the already-encoded integers
# to NaN, and so that a label missing from FHC_Categories fails loudly instead
# of silently becoming NaN.
if not pd.api.types.is_numeric_dtype(df['Hose Cabinet']):
    fhc_codes = df['Hose Cabinet'].map(FHC_Categories)
    if fhc_codes.isna().any():
        unknown_fhc = sorted(df.loc[fhc_codes.isna(), 'Hose Cabinet'].astype(str).unique())
        raise ValueError(f"Unmapped Hose Cabinet labels: {unknown_fhc}")
    df['Hose Cabinet'] = fhc_codes.astype('int64')

In [13]:
# Sanity check after encoding: how many rows fall under each hose-cabinet code
df['Hose Cabinet'].value_counts()

4    11
2     9
0     8
1     7
3     5
Name: Hose Cabinet, dtype: int64

In [14]:
# Re-check dtypes and non-null counts now that 'Hazard' and 'Hose Cabinet' are encoded
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40 entries, 0 to 39
Data columns (total 8 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Hazard                   40 non-null     int64  
 1   Building Height (m)      40 non-null     int64  
 2   Pipe Length (m)          40 non-null     int64  
 3   D For Longest Pipe (in)  40 non-null     int64  
 4   Sprinkler Orientaion     40 non-null     object 
 5   Sprinkler k-factor       40 non-null     float64
 6   Hose Cabinet             40 non-null     int64  
 7   Pump Flow (GPM)          40 non-null     int64  
dtypes: float64(1), int64(6), object(1)
memory usage: 2.6+ KB


# Modeling:

In [15]:
## Separate the predictors (X) from the target column (y)
target_col = "Pump Flow (GPM)"
y = df[target_col]
X = df.drop(columns=[target_col])

In [16]:
## Candidate regressors, each compared under the same pipeline.
# Estimators that use randomness internally get a fixed random_state so the
# cross-validation scores are reproducible after a kernel restart.
models = [
    ('Linear Regression', LinearRegression()),
    ('Ridge', Ridge()),
    ('Lasso', Lasso(alpha=0.001)),
    ('DTR', DecisionTreeRegressor(random_state=42)),
    ('RFR', RandomForestRegressor(random_state=42)),
    ('ADA', AdaBoostRegressor(random_state=42)),
    ('GBR', GradientBoostingRegressor(random_state=42)),
    ('XGBR', XGBRegressor(random_state=42)),
]

In [17]:
# Numeric feature columns; these are the ones passed through RobustScaler
numerical_cols = X.select_dtypes(include='number').columns

# Recursive feature elimination keeping 5 of the preprocessed features.
# The ranking estimator is seeded so the selected features are reproducible.
RFE_selector = RFE(XGBRegressor(random_state=42), n_features_to_select=5)

# Scale the numeric columns and binary-encode the remaining text column.
# The text column is selected with a list so the encoder receives a 2-D frame
# (a scalar selector would hand it a 1-D column instead).
preprocessor = ColumnTransformer(
    transformers=[
        ('num', RobustScaler(), numerical_cols),
        ('Cat', BinaryEncoder(), ['Sprinkler Orientaion']),
    ],
    remainder='passthrough',
)

# Evaluate every candidate with the same preprocessing and feature selection
for model_name, model in models:
    print(f"For {model_name}:")

    # preprocessing -> feature selection -> regressor
    pipeline = Pipeline(steps=[
        ('preprocessor', preprocessor),
        ('feature_selector', RFE_selector),
        (model_name, model),
    ])

    # 5-fold cross-validated R²; train scores are kept to expose overfitting
    cv_results = cross_validate(pipeline, X, y, scoring='r2', cv=5, return_train_score=True)

    print("test :", cv_results['test_score'].mean())
    print("train:", cv_results['train_score'].mean())
    print('*' * 50)

For Linear Regression:
test : 0.6502990878740785
train: 0.868829545675052
**************************************************
For Ridge:
test : 0.6381119015011241
train: 0.8648680118801831
**************************************************
For Lasso:
test : 0.650299923687674
train: 0.8688295455683275
**************************************************
For DTR:
test : 0.645660245325532
train: 1.0
**************************************************
For RFR:
test : 0.6791606205866051
train: 0.9657558940236424
**************************************************
For ADA:
test : 0.5721071586690702
train: 0.9591045517561272
**************************************************
For GBR:
test : 0.8690538795045128
train: 0.999689361906794
**************************************************
For XGBR:
test : 0.7718098148946823
train: 0.9999999999863534
**************************************************


> GradientBoostingRegressor is the best model: it has the highest mean cross-validated test R² (≈ 0.87) of the models compared above.

## HyperParameters Tuning

In [18]:
# Candidate values for each tuned hyper-parameter of the pipeline step named 'GBR'
_gbr_candidates = {
    'n_estimators': [50, 100, 200],       # number of trees
    'learning_rate': [0.01, 0.1, 0.5],    # shrinkage factor
    'max_depth': [3, 5, 7],               # maximum depth of trees
    'min_samples_split': [2, 5, 10],      # minimum samples required to split an internal node
    'min_samples_leaf': [1, 2, 4],        # minimum samples required to form a leaf node
}

# GridSearchCV addresses a pipeline step's parameters as '<step name>__<parameter>'
param_grid = {f'GBR__{name}': values for name, values in _gbr_candidates.items()}

In [19]:
# Pipeline to tune: the same preprocessing and feature selection used in the
# model comparison, ending in a gradient-boosting regressor. The step name
# 'GBR' must match the 'GBR__' prefix of the keys in param_grid.
# random_state is fixed so the grid-search result is reproducible.
steps = [
    ('preprocessor', preprocessor),
    ('feature_selector', RFE_selector),
    ('GBR', GradientBoostingRegressor(random_state=42)),
]
pipeline = Pipeline(steps=steps)

In [20]:
# Exhaustive search over param_grid: 5-fold CV, scored by R², using all CPU cores
grid_search = GridSearchCV(
    pipeline,
    param_grid,
    scoring='r2',
    cv=5,
    n_jobs=-1,
    return_train_score=True,
)
grid_search.fit(X, y)

# Winning parameter combination and its mean cross-validated score
best_params, best_score = grid_search.best_params_, grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)

Best Parameters: {'GBR__learning_rate': 0.1, 'GBR__max_depth': 5, 'GBR__min_samples_leaf': 4, 'GBR__min_samples_split': 5, 'GBR__n_estimators': 200}
Best Score: 0.887167394758921


In [21]:
# Mean test and train scores of the winning parameter combination
best_row = grid_search.best_index_
cv_table = grid_search.cv_results_

mean_test_score = cv_table['mean_test_score'][best_row]
mean_train_score = cv_table['mean_train_score'][best_row]

print("Mean Test Score:", mean_test_score)
print("Mean Train Score:", mean_train_score)

Mean Test Score: 0.887167394758921
Mean Train Score: 0.9992298762123457


In [22]:
# Keep the pipeline that GridSearchCV exposes as best_estimator_ (the one built
# with the best parameter combination) and display it
final_model=grid_search.best_estimator_
final_model

In [23]:
# Persist the tuned pipeline and the training column index as PKL files;
# the Streamlit script loads both by these file names
jb.dump(final_model,"PF_Model_Final.pkl")
jb.dump(X.columns,"PF_Inputs_Final.pkl")

['PF_Inputs_Final.pkl']

## Deployment

In [28]:
%%writefile streamlit_PF_final.py

import joblib
import streamlit as st
import pandas as pd
import sklearn

# Load the fitted pipeline and the training column index from the working directory.
# NOTE: joblib.load unpickles the file, so only load artifacts you produced yourself.
Model= joblib.load("PF_Model_Final.pkl")
Inputs= joblib.load("PF_Inputs_Final.pkl")

def Haz(Hazard):
    """Return the integer code (0-4) for a hazard label, or None for an unknown label."""
    hazard_codes = {
        'Light': 0,
        'Ordinary 1': 1,
        'Ordinary 2': 2,
        'Extra 1': 3,
        'Extra 2': 4,
    }
    return hazard_codes.get(Hazard)

def FHC(Hose_Cabinet):
    """Return the integer code (0-4) for a hose-cabinet label, or None for an unknown label."""
    cabinet_codes = {
        'No FHC': 0,
        'inside 1': 1,
        'inside >1': 2,
        'Only Outside': 3,
        'inside_and_outside': 4,
    }
    return cabinet_codes.get(Hose_Cabinet)

def prediction(Hazard, Building_Height, Pipe_Length,
       D_For_Longest_Pipe, Sprinkler_Orientaion, Sprinkler_k_factor,
       Hose_Cabinet, System_Type):
    """Predict the pump flow for one set of form inputs.

    The inputs are assembled into a single-row DataFrame whose columns follow
    ``Inputs`` (the column index loaded next to the model) and passed to
    ``Model.predict``. Building the row in one step lets pandas infer a proper
    dtype for each column instead of leaving every column as object dtype.

    Parameters
    ----------
    Hazard : str
        Hazard label; converted to its integer code with ``Haz``.
    Building_Height, Pipe_Length, D_For_Longest_Pipe, Sprinkler_k_factor : number
        Numeric inputs, stored under the matching training column names.
    Sprinkler_Orientaion : str
        Sprinkler orientation label, passed to the model unchanged.
    Hose_Cabinet : str
        Hose-cabinet label; converted to its integer code with ``FHC``.
    System_Type : str
        Accepted so existing callers keep working, but never given to the
        model: the 'System Type' column is excluded from the frame.

    Returns
    -------
    The first element of the array returned by ``Model.predict``.
    """
    row = {
        'Hazard': Haz(Hazard),
        'Building Height (m)': Building_Height,
        'Pipe Length (m)': Pipe_Length,
        'D For Longest Pipe (in)': D_For_Longest_Pipe,
        'Sprinkler Orientaion': Sprinkler_Orientaion,
        'Sprinkler k-factor': Sprinkler_k_factor,
        'Hose Cabinet': FHC(Hose_Cabinet),
    }
    # Keep the saved column order, but never include 'System Type'
    feature_cols = [col for col in Inputs if col != 'System Type']
    test_df = pd.DataFrame([row], columns=feature_cols)
    result = Model.predict(test_df)
    return result[0]

def main():
    """Streamlit entry point: draw the input form and, on request, show the prediction."""
    # Choices offered by the categorical widgets
    hazard_options = ['Light', 'Ordinary 1', 'Ordinary 2', 'Extra 1', 'Extra 2']
    orientation_options = ['Pendent', 'Upright', 'SideWall']
    k_factor_options = [5.6, 8, 11.2, 14, 16.8, 19.6, 22.4]
    fhc_options = ['No FHC', 'inside 1', 'inside >1', 'Only Outside', 'inside_and_outside']
    system_options = ['Wet']

    # Page title
    st.set_page_config(page_title='Fire Fighting Pump Flow Prediction')

    # Centered, underlined heading rendered through Markdown with inline CSS
    st.markdown(
        "<h1 style='text-align: center;text-decoration: underline;color:GoldenRod'>Fire Fighting Pump Flow Prediction</h1>",
        unsafe_allow_html=True,
    )

    # Widgets are created in the order they appear on the page
    hazard_choice = st.selectbox('Insert Hazard', hazard_options)
    height_m = st.number_input(
        'Insert Building Height (m)', min_value=2, max_value=120, value=30, step=5
    )
    pipe_length_m = st.number_input(
        'Insert Pipe Length (m)', min_value=2, max_value=200, value=80, step=10
    )
    pipe_diameter_in = st.slider(
        'Choose D For Longest Pipe (in)', min_value=2, max_value=10, value=4, step=1
    )
    orientation_choice = st.radio('Sprinkler Orientaion', orientation_options)
    k_factor_choice = st.selectbox('Select Sprinkler k-factor', k_factor_options)
    fhc_choice = st.selectbox('Select FHC', fhc_options)
    system_choice = st.radio('Select System Type', system_options)

    # Nothing more to do until the user presses the button
    if not st.button('predict'):
        return

    flow = prediction(
        hazard_choice,
        height_m,
        pipe_length_m,
        pipe_diameter_in,
        orientation_choice,
        k_factor_choice,
        fhc_choice,
        system_choice,
    )
    st.text(f"The Pump Flow is {int(flow)} GPM")


if __name__ == '__main__':
    main()

Overwriting streamlit_PF_final.py


In [29]:
# Launch the Streamlit app written by the previous cell (the recorded output shows the run was stopped with ^C)
!streamlit run streamlit_PF_final.py

^C
