<h1>Asan Dopollos Model</h1>

In [1]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier, RandomForestRegressor, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, SVR
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV


# Import datasets
network = pd.read_csv('network.csv')
inspections = pd.read_csv('inspections.csv')
pipes = pd.read_csv('sample_submission.csv')

## Data Visualization

In [2]:
network.head()

Unnamed: 0,PipeId,Province,Town,YearBuilt,Material,GasType,Diameter,Length,Pressure,NumConnections,NumConnectionsUnder,BoolBridle
0,489616,Ciudad Real,Torralba de Calatrava,2012,PE,Gas natural,63.0,7.792,0.4,0,0,False
1,489645,Ciudad Real,Torralba de Calatrava,2012,PE,Gas natural,90.0,2.08,0.4,0,0,False
2,489646,Ciudad Real,Torralba de Calatrava,2012,PE,Gas natural,90.0,111.173,0.4,3,0,False
3,519950,Ciudad Real,Torralba de Calatrava,2012,PE,Gas natural,200.0,14.304,0.4,0,0,False
4,1332439,Ciudad Real,Torralba de Calatrava,2012,PE,Gas natural,160.0,22.669,0.4,1,0,False


In [3]:
pipes.head()

Unnamed: 0,PipeId,Incidence
0,446859944,0
1,428124500,0
2,438428871,0
3,429034569,0
4,411184477,0


In [4]:
inspections.head()

Unnamed: 0,PipeId,MaintenanceId,InspectionYear,InspectionDate,MonthsLastRev,Severity,Incidence
0,189311802,ZRV-00001972,2010,2010-10-01,24,4,0
1,188889914,ZRV-00001406,2010,2010-10-01,22,4,0
2,188889907,ZRV-00001406,2010,2010-10-01,22,4,0
3,188673355,ZRV-00001406,2010,2010-10-01,22,4,0
4,188558922,ZRV-00001406,2010,2010-10-01,22,4,0


Merging the df with an inner join using PipeId

In [5]:
# Merge both dataframes
df = inspections.merge(
                        network,
                        how = 'inner',
                        on=['PipeId'] 
                        )

df

Unnamed: 0,PipeId,MaintenanceId,InspectionYear,InspectionDate,MonthsLastRev,Severity,Incidence,Province,Town,YearBuilt,Material,GasType,Diameter,Length,Pressure,NumConnections,NumConnectionsUnder,BoolBridle
0,189311802,ZRV-00001972,2010,2010-10-01,24,4,0,Barcelona,Sentmenat,2001,PE,Gas natural,160.0,117.831,4.000,0,0,False
1,189311802,ZRV-00001972,2012,2012-10-01,24,4,0,Barcelona,Sentmenat,2001,PE,Gas natural,160.0,117.831,4.000,0,0,False
2,189311802,ZRV-00001972,2014,2014-10-08,24,4,0,Barcelona,Sentmenat,2001,PE,Gas natural,160.0,117.831,4.000,0,0,False
3,189311802,ZRV-00001972,2016,2016-10-14,24,4,0,Barcelona,Sentmenat,2001,PE,Gas natural,160.0,117.831,4.000,0,0,False
4,189311802,ZRV-00001972,2018,2018-10-09,24,4,0,Barcelona,Sentmenat,2001,PE,Gas natural,160.0,117.831,4.000,0,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6345339,56922448,ZRV-00004581,2020,2020-12-31,24,4,0,Valencia,Betera,1993,PE,Gas natural,63.0,1.237,4.000,0,0,False
6345340,56907425,ZRV-00004581,2020,2020-12-31,24,4,0,Valencia,Betera,2000,PE,Gas natural,160.0,169.732,4.000,1,0,False
6345341,7843615,ZRV-00002112,2021,2020-12-31,24,4,0,Barcelona,Sabadell,1995,FD,Gas natural,150.0,0.361,0.025,0,0,True
6345342,56922456,ZRV-00004581,2020,2020-12-31,24,4,0,Valencia,Betera,1993,PE,Gas natural,63.0,23.306,4.000,0,0,False


## Data preparation

Dates will be converted to datetime.

In [6]:
df['InspectionDate'] = pd.to_datetime(df['InspectionDate'])

Column Dop: MaintenaceId, Town, NumConnectionsUnder

In [7]:
df = df.drop(['MaintenanceId','NumConnectionsUnder','Town'],axis=1)
df

Unnamed: 0,PipeId,InspectionYear,InspectionDate,MonthsLastRev,Severity,Incidence,Province,YearBuilt,Material,GasType,Diameter,Length,Pressure,NumConnections,BoolBridle
0,189311802,2010,2010-10-01,24,4,0,Barcelona,2001,PE,Gas natural,160.0,117.831,4.000,0,False
1,189311802,2012,2012-10-01,24,4,0,Barcelona,2001,PE,Gas natural,160.0,117.831,4.000,0,False
2,189311802,2014,2014-10-08,24,4,0,Barcelona,2001,PE,Gas natural,160.0,117.831,4.000,0,False
3,189311802,2016,2016-10-14,24,4,0,Barcelona,2001,PE,Gas natural,160.0,117.831,4.000,0,False
4,189311802,2018,2018-10-09,24,4,0,Barcelona,2001,PE,Gas natural,160.0,117.831,4.000,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6345339,56922448,2020,2020-12-31,24,4,0,Valencia,1993,PE,Gas natural,63.0,1.237,4.000,0,False
6345340,56907425,2020,2020-12-31,24,4,0,Valencia,2000,PE,Gas natural,160.0,169.732,4.000,1,False
6345341,7843615,2021,2020-12-31,24,4,0,Barcelona,1995,FD,Gas natural,150.0,0.361,0.025,0,True
6345342,56922456,2020,2020-12-31,24,4,0,Valencia,1993,PE,Gas natural,63.0,23.306,4.000,0,False


Severity values will be inverted, meaning that they will go from 0-3 going from no incidence to highest severity

In [8]:
inverse_sev = {4:0,3:1,2:2,1:3}
list = []
for sev in df["Severity"]:
    list.append(inverse_sev[sev]) 

df['Severity'] = list
df

Unnamed: 0,PipeId,InspectionYear,InspectionDate,MonthsLastRev,Severity,Incidence,Province,YearBuilt,Material,GasType,Diameter,Length,Pressure,NumConnections,BoolBridle
0,189311802,2010,2010-10-01,24,0,0,Barcelona,2001,PE,Gas natural,160.0,117.831,4.000,0,False
1,189311802,2012,2012-10-01,24,0,0,Barcelona,2001,PE,Gas natural,160.0,117.831,4.000,0,False
2,189311802,2014,2014-10-08,24,0,0,Barcelona,2001,PE,Gas natural,160.0,117.831,4.000,0,False
3,189311802,2016,2016-10-14,24,0,0,Barcelona,2001,PE,Gas natural,160.0,117.831,4.000,0,False
4,189311802,2018,2018-10-09,24,0,0,Barcelona,2001,PE,Gas natural,160.0,117.831,4.000,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6345339,56922448,2020,2020-12-31,24,0,0,Valencia,1993,PE,Gas natural,63.0,1.237,4.000,0,False
6345340,56907425,2020,2020-12-31,24,0,0,Valencia,2000,PE,Gas natural,160.0,169.732,4.000,1,False
6345341,7843615,2021,2020-12-31,24,0,0,Barcelona,1995,FD,Gas natural,150.0,0.361,0.025,0,True
6345342,56922456,2020,2020-12-31,24,0,0,Valencia,1993,PE,Gas natural,63.0,23.306,4.000,0,False


Normalization

In [None]:
for column in ['Diameter', 'Length', 'Pressure']:
   df[column] = (df[column] - df[column].mean()) / df[column].std()

## Feature Engineering

### Age of the pipe

In [9]:
df['Age'] = df["InspectionYear"] - df["YearBuilt"]

### Previous revision severity

In [10]:
df = df.sort_values(['PipeId','InspectionDate'])
df = df.reset_index(drop=True)

prev_sev = 0
previous_severity = []

for ind in df.index:
    previous_severity.append(prev_sev)
    prev_sev = df['Severity'][ind]

df['PrevSeverity'] = previous_severity
df[df.groupby('PipeId').cumcount() == 0]['PrevSeverity'] = 0

df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[df.groupby('PipeId').cumcount() == 0]['PrevSeverity'] = 0


Unnamed: 0,PipeId,InspectionYear,InspectionDate,MonthsLastRev,Severity,Incidence,Province,YearBuilt,Material,GasType,Diameter,Length,Pressure,NumConnections,BoolBridle,Age,PrevSeverity
0,489616,2013,2013-05-08,16,0,0,Ciudad Real,2012,PE,Gas natural,63.0,7.792,0.40,0,False,1,0
1,489616,2015,2015-03-04,22,0,0,Ciudad Real,2012,PE,Gas natural,63.0,7.792,0.40,0,False,3,0
2,489616,2016,2017-01-12,22,0,0,Ciudad Real,2012,PE,Gas natural,63.0,7.792,0.40,0,False,4,0
3,489616,2018,2018-12-19,23,0,0,Ciudad Real,2012,PE,Gas natural,63.0,7.792,0.40,0,False,6,0
4,489616,2020,2020-12-05,24,0,0,Ciudad Real,2012,PE,Gas natural,63.0,7.792,0.40,0,False,8,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6345339,451995260,2019,2019-10-23,24,0,0,Lleida,2003,PE,Gas natural,90.0,67.284,0.15,2,False,16,0
6345340,451995309,2013,2013-10-07,24,0,0,Lleida,1998,PE,Gas natural,200.0,92.220,0.15,0,False,15,0
6345341,451995309,2015,2015-10-05,24,0,0,Lleida,1998,PE,Gas natural,200.0,92.220,0.15,0,False,17,0
6345342,451995309,2017,2017-10-02,24,0,0,Lleida,1998,PE,Gas natural,200.0,92.220,0.15,0,False,19,0


### Accumulated number of incidence per pipe and accumulated number of revisions

In [11]:
accum_incidence_list = [0]
incidence_count = df['Incidence'][0]
revision_count = 1
revision_list = [0]

for ind in range(1, len(df.index)):
    if df['PipeId'][ind] != df['PipeId'][ind-1]:
        incidence_count = 0
        revision_count = 0
    accum_incidence_list.append(incidence_count)
    revision_list.append(revision_count)
    incidence_count = incidence_count + df['Incidence'][ind]
    revision_count += 1

df['AcumIncidence'] = accum_incidence_list
df['AcumRevisions'] = revision_list

In [12]:
df

Unnamed: 0,PipeId,InspectionYear,InspectionDate,MonthsLastRev,Severity,Incidence,Province,YearBuilt,Material,GasType,Diameter,Length,Pressure,NumConnections,BoolBridle,Age,PrevSeverity,AcumIncidence,AcumRevisions
0,489616,2013,2013-05-08,16,0,0,Ciudad Real,2012,PE,Gas natural,63.0,7.792,0.40,0,False,1,0,0,0
1,489616,2015,2015-03-04,22,0,0,Ciudad Real,2012,PE,Gas natural,63.0,7.792,0.40,0,False,3,0,0,1
2,489616,2016,2017-01-12,22,0,0,Ciudad Real,2012,PE,Gas natural,63.0,7.792,0.40,0,False,4,0,0,2
3,489616,2018,2018-12-19,23,0,0,Ciudad Real,2012,PE,Gas natural,63.0,7.792,0.40,0,False,6,0,0,3
4,489616,2020,2020-12-05,24,0,0,Ciudad Real,2012,PE,Gas natural,63.0,7.792,0.40,0,False,8,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6345339,451995260,2019,2019-10-23,24,0,0,Lleida,2003,PE,Gas natural,90.0,67.284,0.15,2,False,16,0,0,3
6345340,451995309,2013,2013-10-07,24,0,0,Lleida,1998,PE,Gas natural,200.0,92.220,0.15,0,False,15,0,0,0
6345341,451995309,2015,2015-10-05,24,0,0,Lleida,1998,PE,Gas natural,200.0,92.220,0.15,0,False,17,0,0,1
6345342,451995309,2017,2017-10-02,24,0,0,Lleida,1998,PE,Gas natural,200.0,92.220,0.15,0,False,19,0,0,2


One hot encoding the categorical values with pandas get dummies

In [13]:
df = pd.get_dummies(df)
df.head(4)

Unnamed: 0,PipeId,InspectionYear,InspectionDate,MonthsLastRev,Severity,Incidence,YearBuilt,Diameter,Length,Pressure,...,Material_FG,Material_FI,Material_FO,Material_PA,Material_PE,Material_PN,Material_PV,Material_ZD,GasType_Gas natural,GasType_Gas propano
0,489616,2013,2013-05-08,16,0,0,2012,63.0,7.792,0.4,...,0,0,0,0,1,0,0,0,1,0
1,489616,2015,2015-03-04,22,0,0,2012,63.0,7.792,0.4,...,0,0,0,0,1,0,0,0,1,0
2,489616,2016,2017-01-12,22,0,0,2012,63.0,7.792,0.4,...,0,0,0,0,1,0,0,0,1,0
3,489616,2018,2018-12-19,23,0,0,2012,63.0,7.792,0.4,...,0,0,0,0,1,0,0,0,1,0


The dataframe will be reindexed by means of the PipeId and Date

In [14]:
df = df.set_index(['PipeId', 'InspectionDate','Severity'])
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 6345344 entries, (489616, Timestamp('2013-05-08 00:00:00'), 0) to (451995309, Timestamp('2019-10-26 00:00:00'), 0)
Data columns (total 64 columns):
 #   Column                Dtype  
---  ------                -----  
 0   InspectionYear        int64  
 1   MonthsLastRev         int64  
 2   Incidence             int64  
 3   YearBuilt             int64  
 4   Diameter              float64
 5   Length                float64
 6   Pressure              float64
 7   NumConnections        int64  
 8   BoolBridle            bool   
 9   Age                   int64  
 10  PrevSeverity          int64  
 11  AcumIncidence         int64  
 12  AcumRevisions         int64  
 13  Province_Albacete     uint8  
 14  Province_Alicante     uint8  
 15  Province_Almeria      uint8  
 16  Province_Avila        uint8  
 17  Province_Barcelona    uint8  
 18  Province_Burgos       uint8  
 19  Province_Castellón    uint8  
 20  Province_Ciudad Real  uint8

Splitting in a train-test set

In [15]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(['Incidence'],axis=1),df['Incidence'])

Balancing the dataframe as it is quite unbalanced

In [16]:
X_train['Incidence'] = y_train

In [17]:
# First we check how balanced is the dataset
Incidence = X_train[X_train.Incidence == 1]
not_Incidence = X_train[X_train.Incidence == 0]

# index property contains the indexes of the not_Incidence dataset
not_Incidence_indexes = not_Incidence.index

# get as many not_incidence indexes (randomly) as incidence occurrences
np.random.seed(42)
random_not_Incidence_indexes = np.random.choice(not_Incidence_indexes, len(Incidence))

# Keep those entries in not_Incidence
not_Incidence = X_train.loc[random_not_Incidence_indexes]

# We will form the balanced dataset concatenating Incidence and not_Incidence
balanced_df = Incidence.append(not_Incidence)
balanced_df.sort_index()
X_train = balanced_df

y_train = X_train['Incidence']
X_train = X_train.drop(['Incidence'],axis=1)

  balanced_df = Incidence.append(not_Incidence)


## Training and testing the model

BayesianNB Classification

In [19]:
forecaster = GaussianNB()
forecaster.fit(X_train, y_train)
Incidence_pred = forecaster.predict(X_test)
print(metrics.roc_auc_score(y_test,Incidence_pred))

0.7181068613039446


Support vector polinomial kernel

In [None]:
# forecaster = SVC(kernel='poly')
# forecaster.fit(X_train, y_train)
# Incidence_pred = forecaster.predict(X_test)
# print(metrics.roc_auc_score(y_test,Incidence_pred))

Logisitc regression

In [20]:
forecaster = LogisticRegression()
forecaster.fit(X_train, y_train)
Incidence_pred = forecaster.predict(X_test)
print(metrics.roc_auc_score(y_test,Incidence_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7544718425122657


Neural Network

In [21]:
forecaster_nn = MLPClassifier()
forecaster_nn.fit(X_train, y_train)
Incidence_pred = forecaster_nn.predict(X_test)
print(metrics.roc_auc_score(y_test,Incidence_pred))

0.6806249943131876


Random Forest classifier

In [29]:
forecaster_rf = RandomForestClassifier()
forecaster_rf.fit(X_train, y_train)
Incidence_pred = forecaster_rf.predict(X_test)
print(metrics.roc_auc_score(y_test,Incidence_pred))

0.8403161440958493


Gradient boosting classifier

In [23]:
forecaster_gb = GradientBoostingClassifier()
forecaster_gb.fit(X_train, y_train)
Incidence_pred = forecaster_gb.predict(X_test)
print(metrics.roc_auc_score(y_test,Incidence_pred))

0.8394083529869475


### Hyperparameter tuning

#### Random forest classifier

In [28]:
# Define parameters to test
grid_param_rf = {
    'n_estimators' : range(115, 125, 5),
    'criterion' : ['gini', 'entropy', 'log_loss'], 
    'max_depth' : range(10, 30, 5),
    'min_samples_leaf' : [1],
    'min_samples_split' : range(2,5,1),
    'max_features' : ['sqrt', 'log2', 'auto']
}

In [30]:
grid_search_rf = GridSearchCV(estimator=forecaster_rf, param_grid=grid_param_rf, scoring='roc_auc', cv=5, n_jobs=-1, verbose=3)
grid_search_rf.fit(X_train, y_train)

Fitting 5 folds for each of 216 candidates, totalling 1080 fits
[CV 2/5] END criterion=gini, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=115;, score=0.905 total time=   1.1s
[CV 1/5] END criterion=gini, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=115;, score=0.912 total time=   1.2s
[CV 3/5] END criterion=gini, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=115;, score=0.906 total time=   1.1s
[CV 4/5] END criterion=gini, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=115;, score=0.906 total time=   1.2s
[CV 5/5] END criterion=gini, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=115;, score=0.912 total time=   1.2s
[CV 1/5] END criterion=gini, max_depth=10, max_features=sqrt, min_samples_leaf=1, min_samples_split=2, n_estimators=120;, score=0.912 total time=   1.2s
[CV 3/5] END crite

360 fits failed out of a total of 1080.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
360 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/carlospiqueras73/opt/anaconda3/lib/python3.9/site-packages/sklearn/model_selection/_validation.py", line 680, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/carlospiqueras73/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_forest.py", line 450, in fit
    trees = Parallel(
  File "/Users/carlospiqueras73/opt/anaconda3/lib/python3.9/site-packages/joblib/parallel.py", line 1043, in __call__
    if self.dispatch_one_batch(iterator):
  File "/Users/carlospiqueras73/opt/anaconda3/lib/python3.9/site-packages/joblib/paral

GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy', 'log_loss'],
                         'max_depth': range(10, 30, 5),
                         'max_features': ['sqrt', 'log2', 'auto'],
                         'min_samples_leaf': [1],
                         'min_samples_split': range(2, 5),
                         'n_estimators': range(115, 125, 5)},
             scoring='roc_auc', verbose=3)

In [31]:
# Get the best parameters from the cross validation test
grid_search_rf.best_params_

{'criterion': 'entropy',
 'max_depth': 20,
 'max_features': 'log2',
 'min_samples_leaf': 1,
 'min_samples_split': 4,
 'n_estimators': 120}

Re-test the model with the new parameters

In [34]:
forecaster_rf = RandomForestClassifier(criterion= 'entropy',
                                       max_depth= 20,
                                       max_features= 'log2',
                                       min_samples_leaf= 1,
                                       min_samples_split= 4,
                                       n_estimators= 120)
forecaster_rf.fit(X_train, y_train)
Incidence_pred = forecaster_rf.predict(X_test)
print(metrics.roc_auc_score(y_test,Incidence_pred))

0.8424070813888745


#### Gradient boosting classifier

In [None]:
# Define parameters to test
grid_param_gb = {
    'learning_rate' : [0.1],
    'n_estimators' : range(50, 100, 10),
    'criterion' : ['squared_error'], 
    'max_depth' : range(1, 11, 2),
    'min_samples_leaf' : [2],
    'min_samples_split' : [4],
    'max_features' : ['sqrt', 'log2']
}

In [None]:
grid_search_gb = GridSearchCV(estimator=forecaster_gb, param_grid=grid_param_gb, scoring='roc_auc', cv=5, n_jobs=-1)
grid_search_gb.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV 3/5] END criterion=squared_error, learning_rate=0.1, max_depth=1, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=50;, score=0.780 total time=   0.2s
[CV 1/5] END criterion=squared_error, learning_rate=0.1, max_depth=1, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=50;, score=0.793 total time=   0.2s
[CV 2/5] END criterion=squared_error, learning_rate=0.1, max_depth=1, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=50;, score=0.788 total time=   0.2s
[CV 4/5] END criterion=squared_error, learning_rate=0.1, max_depth=1, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=50;, score=0.803 total time=   0.3s
[CV 5/5] END criterion=squared_error, learning_rate=0.1, max_depth=1, max_features=sqrt, min_samples_leaf=2, min_samples_split=4, n_estimators=50;, score=0.805 total time=   0.3s
[CV 2/5] END criterion=squared_error, learn

GridSearchCV(cv=5, estimator=GradientBoostingClassifier(), n_jobs=-1,
             param_grid={'criterion': ['squared_error'], 'learning_rate': [0.1],
                         'max_depth': range(1, 11, 2),
                         'max_features': ['sqrt', 'log2'],
                         'min_samples_leaf': [2], 'min_samples_split': [4],
                         'n_estimators': range(50, 100, 10)},
             verbose=3)

In [None]:
# Get the best parameters from the cross validation test
grid_search_gb.best_params_

{'criterion': 'squared_error',
 'learning_rate': 0.1,
 'max_depth': 9,
 'max_features': 'sqrt',
 'min_samples_leaf': 2,
 'min_samples_split': 4,
 'n_estimators': 70}

Re-test the model with the new parameters

In [None]:
forecaster_gb = GradientBoostingClassifier(learning_rate= 0.1,
                                           n_estimators= 70,
                                           criterion= 'squared_error',
                                           max_depth= 9,
                                           min_samples_leaf= 2,
                                           min_samples_split= 4,
                                           max_features= 'sqrt')
forecaster_gb.fit(X_train, y_train)
Incidence_pred = forecaster_gb.predict(X_test)
print(metrics.roc_auc_score(y_test,Incidence_pred))

0.8463023025451637


#### Neural Network

In [None]:
# Define parameters to test
grid_param_nn = {
    'activation' : ['identity', 'logistic', 'tanh', 'relu'],
    'solver' : ['lbfgs', 'sgd', 'adam'], 
    'learning_rate' : ['constant', 'invscaling', 'adaptive']
}

In [None]:
grid_search_nn = GridSearchCV(estimator=forecaster_nn, param_grid=grid_param_nn, scoring='roc_auc', cv=5, n_jobs=-1, verbose=3)
grid_search_nn.fit(X_train, y_train)

  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
  ret = a @ b
ABNORMAL_TERMINATION_IN_LNSRCH.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
  ret = a @ b
  ret = a @ b
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
  ret = a @ b
  ret = a @ b
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale

GridSearchCV(cv=5, estimator=MLPClassifier(), n_jobs=-1,
             param_grid={'activation': ['identity', 'logistic', 'tanh', 'relu'],
                         'learning_rate': ['constant', 'invscaling',
                                           'adaptive'],
                         'solver': ['lbfgs', 'sgd', 'adam']},
             scoring='roc_auc')

In [None]:
# Get the best parameters from the cross validation test
grid_search_nn.best_params_

{'activation': 'relu', 'learning_rate': 'invscaling', 'solver': 'adam'}

Re-test the model with the new parameters

In [None]:
forecaster_nn = MLPClassifier(activation='relu',
                              learning_rate='invscaling',
                              solver='adam')

forecaster_nn.fit(X_train, y_train)
Incidence_pred = forecaster_nn.predict(X_test)
print(metrics.roc_auc_score(y_test,Incidence_pred))

0.7770800050390587


## Creating prediction dataset

Merging with the PipeId from the Submission dataset

In [59]:
pipes = pipes.drop(['Incidence'],axis=1)
pred_X = network.merge(
                pipes,
                how='inner',
                on=['PipeId']
)
pred_X

Unnamed: 0,PipeId,Province,Town,YearBuilt,Material,GasType,Diameter,Length,Pressure,NumConnections,NumConnectionsUnder,BoolBridle
0,489780,Toledo,Añover de Tajo,2009,PE,Gas natural,110.0,7.485,5.0,0,0,False
1,489790,Toledo,Añover de Tajo,2012,PE,Gas natural,110.0,4.436,4.0,0,0,False
2,489792,Toledo,Añover de Tajo,2012,PE,Gas natural,110.0,11.769,4.0,1,0,False
3,489793,Toledo,Añover de Tajo,2012,PE,Gas natural,110.0,6.172,4.0,1,0,False
4,489981,Toledo,Añover de Tajo,2012,PE,Gas natural,110.0,49.031,4.0,0,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...
909728,416223449,La Rioja,Cirueña,2011,AO,Gas natural,50.8,0.460,45.0,0,0,False
909729,416223471,La Rioja,Cirueña,2011,AO,Gas natural,50.8,0.230,45.0,0,0,False
909730,416425646,La Rioja,Nájera,2011,AO,Gas natural,254.0,1.999,45.0,0,0,False
909731,416425691,La Rioja,Nájera,2011,AO,Gas natural,254.0,3.000,45.0,0,0,False


Drop columns

In [60]:
pred_X = pred_X.drop(['Town', 'NumConnectionsUnder'],axis=1)
pred_X

Unnamed: 0,PipeId,Province,YearBuilt,Material,GasType,Diameter,Length,Pressure,NumConnections,BoolBridle
0,489780,Toledo,2009,PE,Gas natural,110.0,7.485,5.0,0,False
1,489790,Toledo,2012,PE,Gas natural,110.0,4.436,4.0,0,False
2,489792,Toledo,2012,PE,Gas natural,110.0,11.769,4.0,1,False
3,489793,Toledo,2012,PE,Gas natural,110.0,6.172,4.0,1,False
4,489981,Toledo,2012,PE,Gas natural,110.0,49.031,4.0,0,False
...,...,...,...,...,...,...,...,...,...,...
909728,416223449,La Rioja,2011,AO,Gas natural,50.8,0.460,45.0,0,False
909729,416223471,La Rioja,2011,AO,Gas natural,50.8,0.230,45.0,0,False
909730,416425646,La Rioja,2011,AO,Gas natural,254.0,1.999,45.0,0,False
909731,416425691,La Rioja,2011,AO,Gas natural,254.0,3.000,45.0,0,False


Normalization

In [None]:
for column in ['Diameter', 'Length', 'Pressure']:
   pred_X[column] = (pred_X[column] - pred_X[column].mean()) / pred_X[column].std()

Add Previous severity variable

In [61]:
last_rev_df = df
last_rev_df.reset_index(inplace=True)
last_rev_df = last_rev_df.drop_duplicates(subset=['PipeId'], keep='last')
pred_X = pred_X.merge(last_rev_df[['PipeId', 'Severity', 'AcumRevisions', 'AcumIncidence','Incidence']], on = 'PipeId', how = 'left')
pred_X.rename(columns = {'Severity':'PrevSeverity'}, inplace = True)

Changing the NaN in Severity to 0, as some of them have never been revised. Same will happen with the accumulated values

In [62]:
pred_X['PrevSeverity'] = pred_X['PrevSeverity'].replace(np.nan, 0)
pred_X['AcumRevisions'] = pred_X['AcumRevisions'].replace(np.nan, 0)
pred_X['AcumIncidence'] = pred_X['AcumIncidence'].replace(np.nan, 0)
pred_X['Incidence'] = pred_X['Incidence'].replace(np.nan, 0)

pred_X['PrevSeverity'] = pred_X['PrevSeverity'].astype('int')
pred_X['AcumIncidence'] = pred_X['AcumIncidence'].astype('int')
pred_X['AcumRevisions'] = pred_X['AcumRevisions'].astype('int')
pred_X['Incidence'] = pred_X['Incidence'].astype('int')

Accumulated Incidences

In [66]:
pred_X['AcumIncidence'] = pred_X['AcumIncidence'] + pred_X['Incidence']
pred_X = pred_X.drop(['Incidence'],axis=1)

Accumulated Revisions

In [65]:
pred_X['AcumRevisions'] = pred_X['AcumRevisions'] + 1

Encoding

In [69]:
pred_X = pd.get_dummies(pred_X)
pred_X

Unnamed: 0,PipeId,YearBuilt,Diameter,Length,Pressure,NumConnections,BoolBridle,PrevSeverity,AcumRevisions,AcumIncidence,...,Material_CU,Material_FD,Material_FG,Material_FI,Material_PA,Material_PE,Material_PN,Material_ZD,GasType_Gas natural,GasType_Gas propano
0,489780,2009,110.0,7.485,5.0,0,False,0.0,5.0,0.0,...,0,0,0,0,0,1,0,0,1,0
1,489790,2012,110.0,4.436,4.0,0,False,0.0,5.0,0.0,...,0,0,0,0,0,1,0,0,1,0
2,489792,2012,110.0,11.769,4.0,1,False,0.0,5.0,0.0,...,0,0,0,0,0,1,0,0,1,0
3,489793,2012,110.0,6.172,4.0,1,False,0.0,5.0,0.0,...,0,0,0,0,0,1,0,0,1,0
4,489981,2012,110.0,49.031,4.0,0,False,0.0,5.0,0.0,...,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
909728,416223449,2011,50.8,0.460,45.0,0,False,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,1,0
909729,416223471,2011,50.8,0.230,45.0,0,False,0.0,1.0,0.0,...,0,0,0,0,0,0,0,0,1,0
909730,416425646,2011,254.0,1.999,45.0,0,False,0.0,2.0,0.0,...,0,0,0,0,0,0,0,0,1,0
909731,416425691,2011,254.0,3.000,45.0,0,False,0.0,2.0,0.0,...,0,0,0,0,0,0,0,0,1,0


Obtaining the MonthLastRev column will be done through a prediction

prepair training dataset

In [70]:
df_regressor = df
df_regressor = df_regressor.drop(["Material_FO", "Material_PV", "Incidence", "InspectionDate", "Severity", "InspectionYear", "Age"], axis = 1)
df_regressor = df_regressor.set_index(['PipeId'])
df_regressor

Unnamed: 0_level_0,MonthsLastRev,YearBuilt,Diameter,Length,Pressure,NumConnections,BoolBridle,PrevSeverity,AcumIncidence,AcumRevisions,...,Material_CU,Material_FD,Material_FG,Material_FI,Material_PA,Material_PE,Material_PN,Material_ZD,GasType_Gas natural,GasType_Gas propano
PipeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
489616,16,2012,63.0,7.792,0.40,0,False,0,0,0,...,0,0,0,0,0,1,0,0,1,0
489616,22,2012,63.0,7.792,0.40,0,False,0,0,1,...,0,0,0,0,0,1,0,0,1,0
489616,22,2012,63.0,7.792,0.40,0,False,0,0,2,...,0,0,0,0,0,1,0,0,1,0
489616,23,2012,63.0,7.792,0.40,0,False,0,0,3,...,0,0,0,0,0,1,0,0,1,0
489616,24,2012,63.0,7.792,0.40,0,False,0,0,4,...,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
451995260,24,2003,90.0,67.284,0.15,2,False,0,0,3,...,0,0,0,0,0,1,0,0,1,0
451995309,24,1998,200.0,92.220,0.15,0,False,0,0,0,...,0,0,0,0,0,1,0,0,1,0
451995309,24,1998,200.0,92.220,0.15,0,False,0,0,1,...,0,0,0,0,0,1,0,0,1,0
451995309,24,1998,200.0,92.220,0.15,0,False,0,0,2,...,0,0,0,0,0,1,0,0,1,0


In [71]:
df_regressor_y = df_regressor["MonthsLastRev"]
df_regressor = df_regressor.drop(["MonthsLastRev"], axis = 1)

set pipe id in the index for the predictor

In [72]:
pred_X = pred_X.set_index(['PipeId'])
pred_X

Unnamed: 0_level_0,YearBuilt,Diameter,Length,Pressure,NumConnections,BoolBridle,PrevSeverity,AcumRevisions,AcumIncidence,Province_Albacete,...,Material_CU,Material_FD,Material_FG,Material_FI,Material_PA,Material_PE,Material_PN,Material_ZD,GasType_Gas natural,GasType_Gas propano
PipeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
489780,2009,110.0,7.485,5.0,0,False,0.0,5.0,0.0,0,...,0,0,0,0,0,1,0,0,1,0
489790,2012,110.0,4.436,4.0,0,False,0.0,5.0,0.0,0,...,0,0,0,0,0,1,0,0,1,0
489792,2012,110.0,11.769,4.0,1,False,0.0,5.0,0.0,0,...,0,0,0,0,0,1,0,0,1,0
489793,2012,110.0,6.172,4.0,1,False,0.0,5.0,0.0,0,...,0,0,0,0,0,1,0,0,1,0
489981,2012,110.0,49.031,4.0,0,False,0.0,5.0,0.0,0,...,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
416223449,2011,50.8,0.460,45.0,0,False,0.0,1.0,0.0,0,...,0,0,0,0,0,0,0,0,1,0
416223471,2011,50.8,0.230,45.0,0,False,0.0,1.0,0.0,0,...,0,0,0,0,0,0,0,0,1,0
416425646,2011,254.0,1.999,45.0,0,False,0.0,2.0,0.0,0,...,0,0,0,0,0,0,0,0,1,0
416425691,2011,254.0,3.000,45.0,0,False,0.0,2.0,0.0,0,...,0,0,0,0,0,0,0,0,1,0


order columns

In [74]:
pred_X = pred_X[df_regressor.columns]
pred_X

Unnamed: 0_level_0,YearBuilt,Diameter,Length,Pressure,NumConnections,BoolBridle,PrevSeverity,AcumIncidence,AcumRevisions,Province_Albacete,...,Material_CU,Material_FD,Material_FG,Material_FI,Material_PA,Material_PE,Material_PN,Material_ZD,GasType_Gas natural,GasType_Gas propano
PipeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
489780,2009,110.0,7.485,5.0,0,False,0.0,0.0,5.0,0,...,0,0,0,0,0,1,0,0,1,0
489790,2012,110.0,4.436,4.0,0,False,0.0,0.0,5.0,0,...,0,0,0,0,0,1,0,0,1,0
489792,2012,110.0,11.769,4.0,1,False,0.0,0.0,5.0,0,...,0,0,0,0,0,1,0,0,1,0
489793,2012,110.0,6.172,4.0,1,False,0.0,0.0,5.0,0,...,0,0,0,0,0,1,0,0,1,0
489981,2012,110.0,49.031,4.0,0,False,0.0,0.0,5.0,0,...,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
416223449,2011,50.8,0.460,45.0,0,False,0.0,0.0,1.0,0,...,0,0,0,0,0,0,0,0,1,0
416223471,2011,50.8,0.230,45.0,0,False,0.0,0.0,1.0,0,...,0,0,0,0,0,0,0,0,1,0
416425646,2011,254.0,1.999,45.0,0,False,0.0,0.0,2.0,0,...,0,0,0,0,0,0,0,0,1,0
416425691,2011,254.0,3.000,45.0,0,False,0.0,0.0,2.0,0,...,0,0,0,0,0,0,0,0,1,0


Predict months

In [75]:
regressor = GradientBoostingRegressor()
regressor.fit(df_regressor, df_regressor_y)
Months_pred = regressor.predict(pred_X)

pred_X['MonthsLastRev'] = Months_pred
pred_X['MonthsLastRev'] = pred_X['MonthsLastRev'].astype(int)
pred_X

Unnamed: 0_level_0,YearBuilt,Diameter,Length,Pressure,NumConnections,BoolBridle,PrevSeverity,AcumIncidence,AcumRevisions,Province_Albacete,...,Material_FD,Material_FG,Material_FI,Material_PA,Material_PE,Material_PN,Material_ZD,GasType_Gas natural,GasType_Gas propano,MonthsLastRev
PipeId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
489780,2009,110.0,7.485,5.0,0,False,0.0,0.0,5.0,0,...,0,0,0,0,1,0,0,1,0,23.657574
489790,2012,110.0,4.436,4.0,0,False,0.0,0.0,5.0,0,...,0,0,0,0,1,0,0,1,0,23.499360
489792,2012,110.0,11.769,4.0,1,False,0.0,0.0,5.0,0,...,0,0,0,0,1,0,0,1,0,23.499360
489793,2012,110.0,6.172,4.0,1,False,0.0,0.0,5.0,0,...,0,0,0,0,1,0,0,1,0,23.499360
489981,2012,110.0,49.031,4.0,0,False,0.0,0.0,5.0,0,...,0,0,0,0,1,0,0,1,0,23.499360
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
416223449,2011,50.8,0.460,45.0,0,False,0.0,0.0,1.0,0,...,0,0,0,0,0,0,0,1,0,32.497586
416223471,2011,50.8,0.230,45.0,0,False,0.0,0.0,1.0,0,...,0,0,0,0,0,0,0,1,0,32.949888
416425646,2011,254.0,1.999,45.0,0,False,0.0,0.0,2.0,0,...,0,0,0,0,0,0,0,1,0,30.027411
416425691,2011,254.0,3.000,45.0,0,False,0.0,0.0,2.0,0,...,0,0,0,0,0,0,0,1,0,30.027411


Restore PipeId after prediction

In [76]:
pred_X.reset_index(inplace=True)
pred_X

With this MonthLastRev we estimate the inspection date

In [83]:
pred_X = pred_X.merge(last_rev_df[['PipeId', 'InspectionDate']], on = 'PipeId', how = 'left')

new_dates = []

for ind in pred_X.index:
    new_date = pred_X['InspectionDate'][ind]+pd.DateOffset(months=pred_X['MonthsLastRev'][ind])
    new_dates.append(new_date)

pred_X['InspectionDate'] = new_dates
pred_X['InspectionDate'] = pred_X['InspectionDate'].replace(pd.NaT, pd.to_datetime('2022-12-31'))

Age of the pipe

In [84]:
age_list = []

for ind in pred_X.index:
    age = int(pred_X['InspectionDate'][ind].year - pred_X['YearBuilt'][ind])
    age_list.append(age)

pred_X['Age'] = age_list

Inspection Year

In [85]:
pred_X['InspectionYear'] = pred_X['InspectionDate'].dt.year


Indexing some columns

In [86]:
df

Unnamed: 0,PipeId,InspectionDate,Severity,InspectionYear,MonthsLastRev,Incidence,YearBuilt,Diameter,Length,Pressure,...,Material_FG,Material_FI,Material_FO,Material_PA,Material_PE,Material_PN,Material_PV,Material_ZD,GasType_Gas natural,GasType_Gas propano
0,489616,2013-05-08,0,2013,16,0,2012,63.0,7.792,0.40,...,0,0,0,0,1,0,0,0,1,0
1,489616,2015-03-04,0,2015,22,0,2012,63.0,7.792,0.40,...,0,0,0,0,1,0,0,0,1,0
2,489616,2017-01-12,0,2016,22,0,2012,63.0,7.792,0.40,...,0,0,0,0,1,0,0,0,1,0
3,489616,2018-12-19,0,2018,23,0,2012,63.0,7.792,0.40,...,0,0,0,0,1,0,0,0,1,0
4,489616,2020-12-05,0,2020,24,0,2012,63.0,7.792,0.40,...,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6345339,451995260,2019-10-23,0,2019,24,0,2003,90.0,67.284,0.15,...,0,0,0,0,1,0,0,0,1,0
6345340,451995309,2013-10-07,0,2013,24,0,1998,200.0,92.220,0.15,...,0,0,0,0,1,0,0,0,1,0
6345341,451995309,2015-10-05,0,2015,24,0,1998,200.0,92.220,0.15,...,0,0,0,0,1,0,0,0,1,0
6345342,451995309,2017-10-02,0,2017,24,0,1998,200.0,92.220,0.15,...,0,0,0,0,1,0,0,0,1,0


In [87]:
df = df.set_index(['PipeId', 'InspectionDate','Severity'])
pred_X = pred_X.set_index(['PipeId', 'InspectionDate'])


Balance training dataframe

In [88]:
# First we check how balanced is the dataset
Incidence = df[df.Incidence == 1]
not_Incidence = df[df.Incidence == 0]

# index property contains the indexes of the not_Incidence dataset
not_Incidence_indexes = not_Incidence.index

# get as many not_incidence indexes (randomly) as incidence occurrences
np.random.seed(42)
random_not_Incidence_indexes = np.random.choice(not_Incidence_indexes, len(Incidence))

# Keep those entries in not_Incidence
not_Incidence = df.loc[random_not_Incidence_indexes]

# We will form the balanced dataset concatenating Incidence and not_Incidence
balanced_df = Incidence.append(not_Incidence)
balanced_df.sort_index()

  balanced_df = Incidence.append(not_Incidence)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,InspectionYear,MonthsLastRev,Incidence,YearBuilt,Diameter,Length,Pressure,NumConnections,BoolBridle,Age,...,Material_FG,Material_FI,Material_FO,Material_PA,Material_PE,Material_PN,Material_PV,Material_ZD,GasType_Gas natural,GasType_Gas propano
PipeId,InspectionDate,Severity,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
489981,2016-04-28,0,2016,22,0,2012,110.0,49.031,4.00,0,False,4,...,0,0,0,0,1,0,0,0,1,0
490620,2020-05-18,0,2020,24,0,2012,160.0,6.800,10.00,0,False,8,...,0,0,0,0,1,0,0,0,1,0
490635,2017-08-31,0,2017,24,0,2012,160.0,0.403,0.40,0,False,5,...,0,0,0,0,1,0,0,0,1,0
497136,2019-01-25,1,2019,24,1,2008,160.0,319.164,0.15,11,False,11,...,0,0,0,0,1,0,0,0,1,0
497253,2017-01-10,0,2017,24,0,2005,110.0,71.260,0.15,1,False,12,...,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
450861286,2020-03-12,0,2020,24,0,2012,160.0,1.627,0.15,0,False,8,...,0,0,0,0,1,0,0,0,1,0
450872858,2011-12-21,1,2011,21,1,1991,152.4,68.071,4.00,1,True,20,...,0,0,0,0,0,0,0,0,1,0
450873154,2012-03-05,1,2012,2,1,2012,50.8,1.354,4.00,1,True,0,...,0,0,0,0,0,0,0,0,1,0
450873682,2011-10-13,1,2011,21,1,1998,90.0,1.252,4.00,0,False,13,...,0,0,0,0,1,0,0,0,1,0


split train X and Y

In [89]:
train_X = balanced_df.drop(["Incidence"], axis = 1)
train_Y = balanced_df["Incidence"]

train_X = train_X.drop(["Material_FO", "Material_PV"], axis = 1)

In [90]:
train_X

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,InspectionYear,MonthsLastRev,YearBuilt,Diameter,Length,Pressure,NumConnections,BoolBridle,Age,PrevSeverity,...,Material_CU,Material_FD,Material_FG,Material_FI,Material_PA,Material_PE,Material_PN,Material_ZD,GasType_Gas natural,GasType_Gas propano
PipeId,InspectionDate,Severity,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
497136,2019-01-25,1,2019,24,2008,160.0,319.164000,0.150,11,False,11,0,...,0,0,0,0,0,1,0,0,1,0
510311,2012-06-21,1,2012,17,2011,40.0,93.568000,0.150,6,False,1,0,...,0,0,0,0,0,1,0,0,1,0
586278,2012-05-01,3,2012,19,1970,50.8,25.163277,4.000,0,True,42,0,...,0,0,0,0,0,0,0,0,1,0
634981,2011-02-16,1,2011,21,1999,50.8,76.544000,16.000,0,False,12,0,...,0,0,0,0,0,0,0,0,1,0
644491,2013-07-30,1,2013,24,1989,110.0,39.471000,0.025,2,False,24,0,...,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133067019,2016-04-08,0,2016,49,2001,200.0,0.999000,0.150,1,False,15,0,...,0,0,0,0,0,1,0,0,1,0
423217978,2015-09-24,0,2015,23,2011,110.0,0.976000,0.150,0,False,4,0,...,0,0,0,0,0,1,0,0,1,0
300463684,2011-01-28,0,2011,24,2006,200.0,72.486000,0.150,0,False,5,0,...,0,0,0,0,0,1,0,0,1,0
24303171,2015-05-28,0,2015,22,1996,110.0,73.486000,4.000,0,False,19,0,...,0,0,0,0,0,1,0,0,1,0


Reorder prediction columns

In [91]:
pred_X = pred_X[train_X.columns]
pred_X

Unnamed: 0_level_0,Unnamed: 1_level_0,InspectionYear,MonthsLastRev,YearBuilt,Diameter,Length,Pressure,NumConnections,BoolBridle,Age,PrevSeverity,...,Material_CU,Material_FD,Material_FG,Material_FI,Material_PA,Material_PE,Material_PN,Material_ZD,GasType_Gas natural,GasType_Gas propano
PipeId,InspectionDate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
489780,2022-03-18,2022,23,2009,110.0,7.485,5.0,0,False,13,0,...,0,0,0,0,0,1,0,0,1,0
489790,2022-03-18,2022,23,2012,110.0,4.436,4.0,0,False,10,0,...,0,0,0,0,0,1,0,0,1,0
489792,2022-03-18,2022,23,2012,110.0,11.769,4.0,1,False,10,0,...,0,0,0,0,0,1,0,0,1,0
489793,2022-03-18,2022,23,2012,110.0,6.172,4.0,1,False,10,0,...,0,0,0,0,0,1,0,0,1,0
489981,2022-03-18,2022,23,2012,110.0,49.031,4.0,0,False,10,0,...,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
416223449,2023-08-31,2023,32,2011,50.8,0.460,45.0,0,False,12,0,...,0,0,0,0,0,0,0,0,1,0
416223471,2023-08-31,2023,32,2011,50.8,0.230,45.0,0,False,12,0,...,0,0,0,0,0,0,0,0,1,0
416425646,2020-02-10,2020,30,2011,254.0,1.999,45.0,0,False,9,0,...,0,0,0,0,0,0,0,0,1,0
416425691,2020-02-09,2020,30,2011,254.0,3.000,45.0,0,False,9,0,...,0,0,0,0,0,0,0,0,1,0


## Incidence prediction

In [92]:
# Random Forest model
forecaster2 = RandomForestClassifier(criterion= 'entropy',
                                     max_depth= 20,
                                     max_features= 'log2',
                                     min_samples_leaf= 1,
                                     min_samples_split= 4,
                                     n_estimators= 120)

# Random Forest model
# forecaster2 = GradientBoostingClassifier(learning_rate= 0.1,
#                                          n_estimators= 70,
#                                          criterion= 'squared_error',
#                                          max_depth= 9,
#                                          min_samples_leaf= 2,
#                                          min_samples_split= 4,
#                                          max_features= 'sqrt')

forecaster2.fit(train_X, train_Y)
Incidence_pred2 = forecaster2.predict(pred_X)

## Export

In [93]:
pred_X.reset_index(inplace=True)

In [94]:
export = pd.DataFrame()
export['PipeId'] = pred_X["PipeId"]
export['Incidence'] = Incidence_pred2

# set indexes again to be able to re-run the model straight away
pred_X = pred_X.set_index(['PipeId', 'InspectionDate'])

export

Unnamed: 0,PipeId,Incidence
0,489780,0
1,489790,0
2,489792,0
3,489793,0
4,489981,0
...,...,...
909728,416223449,0
909729,416223471,0
909730,416425646,0
909731,416425691,0


In [95]:
print(len(export[export.Incidence == 1]))
print(len(export[export.Incidence == 0]))

135039
774694


In [96]:
submission_set = pipes.merge(export[['PipeId', 'Incidence']], on = 'PipeId', how = 'left')
submission_set

Unnamed: 0,PipeId,Incidence
0,446859944,0
1,428124500,0
2,438428871,0
3,429034569,0
4,411184477,0
...,...,...
909728,235426649,0
909729,235426673,0
909730,235426707,0
909731,190956601,0


In [97]:
submission_set.to_csv('Asan-Dopollos.csv', index=False)