<h1>Asan Dopollos Model</h1>

In [55]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingRegressor, GradientBoostingClassifier, RandomForestRegressor, RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, SVR
from sklearn.neural_network import MLPClassifier
from sklearn import metrics
from sklearn.preprocessing import OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV


# Import datasets
network = pd.read_csv('network.csv')
inspections = pd.read_csv('inspections.csv')
pipes = pd.read_csv('sample_submission.csv')

## Data Visualization

In [56]:
network.head()

Unnamed: 0,PipeId,Province,Town,YearBuilt,Material,GasType,Diameter,Length,Pressure,NumConnections,NumConnectionsUnder,BoolBridle
0,489616,Ciudad Real,Torralba de Calatrava,2012,PE,Gas natural,63.0,7.792,0.4,0,0,False
1,489645,Ciudad Real,Torralba de Calatrava,2012,PE,Gas natural,90.0,2.08,0.4,0,0,False
2,489646,Ciudad Real,Torralba de Calatrava,2012,PE,Gas natural,90.0,111.173,0.4,3,0,False
3,519950,Ciudad Real,Torralba de Calatrava,2012,PE,Gas natural,200.0,14.304,0.4,0,0,False
4,1332439,Ciudad Real,Torralba de Calatrava,2012,PE,Gas natural,160.0,22.669,0.4,1,0,False


In [57]:
pipes.head()

Unnamed: 0,PipeId,Incidence
0,446859944,0
1,428124500,0
2,438428871,0
3,429034569,0
4,411184477,0


In [58]:
inspections.head()

Unnamed: 0,PipeId,MaintenanceId,InspectionYear,InspectionDate,MonthsLastRev,Severity,Incidence
0,189311802,ZRV-00001972,2010,2010-10-01,24,4,0
1,188889914,ZRV-00001406,2010,2010-10-01,22,4,0
2,188889907,ZRV-00001406,2010,2010-10-01,22,4,0
3,188673355,ZRV-00001406,2010,2010-10-01,22,4,0
4,188558922,ZRV-00001406,2010,2010-10-01,22,4,0


Merging the df with an inner join using PipeId

In [59]:
# Merge both dataframes
df = inspections.merge(
                        network,
                        how = 'inner',
                        on=['PipeId'] 
                        )

df

Unnamed: 0,PipeId,MaintenanceId,InspectionYear,InspectionDate,MonthsLastRev,Severity,Incidence,Province,Town,YearBuilt,Material,GasType,Diameter,Length,Pressure,NumConnections,NumConnectionsUnder,BoolBridle
0,189311802,ZRV-00001972,2010,2010-10-01,24,4,0,Barcelona,Sentmenat,2001,PE,Gas natural,160.0,117.831,4.000,0,0,False
1,189311802,ZRV-00001972,2012,2012-10-01,24,4,0,Barcelona,Sentmenat,2001,PE,Gas natural,160.0,117.831,4.000,0,0,False
2,189311802,ZRV-00001972,2014,2014-10-08,24,4,0,Barcelona,Sentmenat,2001,PE,Gas natural,160.0,117.831,4.000,0,0,False
3,189311802,ZRV-00001972,2016,2016-10-14,24,4,0,Barcelona,Sentmenat,2001,PE,Gas natural,160.0,117.831,4.000,0,0,False
4,189311802,ZRV-00001972,2018,2018-10-09,24,4,0,Barcelona,Sentmenat,2001,PE,Gas natural,160.0,117.831,4.000,0,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6345339,56922448,ZRV-00004581,2020,2020-12-31,24,4,0,Valencia,Betera,1993,PE,Gas natural,63.0,1.237,4.000,0,0,False
6345340,56907425,ZRV-00004581,2020,2020-12-31,24,4,0,Valencia,Betera,2000,PE,Gas natural,160.0,169.732,4.000,1,0,False
6345341,7843615,ZRV-00002112,2021,2020-12-31,24,4,0,Barcelona,Sabadell,1995,FD,Gas natural,150.0,0.361,0.025,0,0,True
6345342,56922456,ZRV-00004581,2020,2020-12-31,24,4,0,Valencia,Betera,1993,PE,Gas natural,63.0,23.306,4.000,0,0,False


## Data preparation

Dates will be converted to datetime.

In [60]:
df['InspectionDate'] = pd.to_datetime(df['InspectionDate'])

Column Dop: MaintenaceId, Town, NumConnectionsUnder

In [61]:
df = df.drop(['MaintenanceId','NumConnectionsUnder','Town'],axis=1)
df

Unnamed: 0,PipeId,InspectionYear,InspectionDate,MonthsLastRev,Severity,Incidence,Province,YearBuilt,Material,GasType,Diameter,Length,Pressure,NumConnections,BoolBridle
0,189311802,2010,2010-10-01,24,4,0,Barcelona,2001,PE,Gas natural,160.0,117.831,4.000,0,False
1,189311802,2012,2012-10-01,24,4,0,Barcelona,2001,PE,Gas natural,160.0,117.831,4.000,0,False
2,189311802,2014,2014-10-08,24,4,0,Barcelona,2001,PE,Gas natural,160.0,117.831,4.000,0,False
3,189311802,2016,2016-10-14,24,4,0,Barcelona,2001,PE,Gas natural,160.0,117.831,4.000,0,False
4,189311802,2018,2018-10-09,24,4,0,Barcelona,2001,PE,Gas natural,160.0,117.831,4.000,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6345339,56922448,2020,2020-12-31,24,4,0,Valencia,1993,PE,Gas natural,63.0,1.237,4.000,0,False
6345340,56907425,2020,2020-12-31,24,4,0,Valencia,2000,PE,Gas natural,160.0,169.732,4.000,1,False
6345341,7843615,2021,2020-12-31,24,4,0,Barcelona,1995,FD,Gas natural,150.0,0.361,0.025,0,True
6345342,56922456,2020,2020-12-31,24,4,0,Valencia,1993,PE,Gas natural,63.0,23.306,4.000,0,False


Value standarization: Diameter, Length, Pressure

In [62]:
for column in ['Diameter', 'Length', 'Pressure']:
   df[column] = (df[column] - df[column].mean()) / df[column].std()

df.head()

Unnamed: 0,PipeId,InspectionYear,InspectionDate,MonthsLastRev,Severity,Incidence,Province,YearBuilt,Material,GasType,Diameter,Length,Pressure,NumConnections,BoolBridle
0,189311802,2010,2010-10-01,24,4,0,Barcelona,2001,PE,Gas natural,0.707025,1.12697,0.166398,0,False
1,189311802,2012,2012-10-01,24,4,0,Barcelona,2001,PE,Gas natural,0.707025,1.12697,0.166398,0,False
2,189311802,2014,2014-10-08,24,4,0,Barcelona,2001,PE,Gas natural,0.707025,1.12697,0.166398,0,False
3,189311802,2016,2016-10-14,24,4,0,Barcelona,2001,PE,Gas natural,0.707025,1.12697,0.166398,0,False
4,189311802,2018,2018-10-09,24,4,0,Barcelona,2001,PE,Gas natural,0.707025,1.12697,0.166398,0,False


Severity values will be inverted, meaning that they will go from 0-3 going from no incidence to highest severity

In [63]:
inverse_sev = {4:0,3:1,2:2,1:3}
list = []
for sev in df["Severity"]:
    list.append(inverse_sev[sev]) 

df['Severity'] = list
df

Unnamed: 0,PipeId,InspectionYear,InspectionDate,MonthsLastRev,Severity,Incidence,Province,YearBuilt,Material,GasType,Diameter,Length,Pressure,NumConnections,BoolBridle
0,189311802,2010,2010-10-01,24,0,0,Barcelona,2001,PE,Gas natural,0.707025,1.126970,0.166398,0,False
1,189311802,2012,2012-10-01,24,0,0,Barcelona,2001,PE,Gas natural,0.707025,1.126970,0.166398,0,False
2,189311802,2014,2014-10-08,24,0,0,Barcelona,2001,PE,Gas natural,0.707025,1.126970,0.166398,0,False
3,189311802,2016,2016-10-14,24,0,0,Barcelona,2001,PE,Gas natural,0.707025,1.126970,0.166398,0,False
4,189311802,2018,2018-10-09,24,0,0,Barcelona,2001,PE,Gas natural,0.707025,1.126970,0.166398,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6345339,56922448,2020,2020-12-31,24,0,0,Valencia,1993,PE,Gas natural,-0.988359,-0.489225,0.166398,0,False
6345340,56907425,2020,2020-12-31,24,0,0,Valencia,2000,PE,Gas natural,0.707025,1.846408,0.166398,1,False
6345341,7843615,2021,2020-12-31,24,0,0,Barcelona,1995,FD,Gas natural,0.532243,-0.501367,-0.438394,0,True
6345342,56922456,2020,2020-12-31,24,0,0,Valencia,1993,PE,Gas natural,-0.988359,-0.183310,0.166398,0,False


## Feature Engineering

### Age of the pipe

In [64]:
df['Age'] = df["InspectionYear"] - df["YearBuilt"]

### Previous revision severity

In [65]:
df = df.sort_values(['PipeId','InspectionDate'])
df = df.reset_index(drop=True)

prev_sev = 0
previous_severity = []

for ind in df.index:
    previous_severity.append(prev_sev)
    prev_sev = df['Severity'][ind]

df['PrevSeverity'] = previous_severity
df[df.groupby('PipeId').cumcount() == 0]['PrevSeverity'] = 0

df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[df.groupby('PipeId').cumcount() == 0]['PrevSeverity'] = 0


Unnamed: 0,PipeId,InspectionYear,InspectionDate,MonthsLastRev,Severity,Incidence,Province,YearBuilt,Material,GasType,Diameter,Length,Pressure,NumConnections,BoolBridle,Age,PrevSeverity
0,489616,2013,2013-05-08,16,0,0,Ciudad Real,2012,PE,Gas natural,-0.988359,-0.398361,-0.381338,0,False,1,0
1,489616,2015,2015-03-04,22,0,0,Ciudad Real,2012,PE,Gas natural,-0.988359,-0.398361,-0.381338,0,False,3,0
2,489616,2016,2017-01-12,22,0,0,Ciudad Real,2012,PE,Gas natural,-0.988359,-0.398361,-0.381338,0,False,4,0
3,489616,2018,2018-12-19,23,0,0,Ciudad Real,2012,PE,Gas natural,-0.988359,-0.398361,-0.381338,0,False,6,0
4,489616,2020,2020-12-05,24,0,0,Ciudad Real,2012,PE,Gas natural,-0.988359,-0.398361,-0.381338,0,False,8,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6345339,451995260,2019,2019-10-23,24,0,0,Lleida,2003,PE,Gas natural,-0.516448,0.426301,-0.419375,2,False,16,0
6345340,451995309,2013,2013-10-07,24,0,0,Lleida,1998,PE,Gas natural,1.406152,0.771957,-0.419375,0,False,15,0
6345341,451995309,2015,2015-10-05,24,0,0,Lleida,1998,PE,Gas natural,1.406152,0.771957,-0.419375,0,False,17,0
6345342,451995309,2017,2017-10-02,24,0,0,Lleida,1998,PE,Gas natural,1.406152,0.771957,-0.419375,0,False,19,0


### Accumulated number of incidence per pipe and accumulated number of revisions

In [66]:
accum_incidence_list = [0]
incidence_count = df['Incidence'][0]
revision_count = 1
revision_list = [0]

for ind in range(1, len(df.index)):
    if df['PipeId'][ind] != df['PipeId'][ind-1]:
        incidence_count = 0
        revision_count = 0
    accum_incidence_list.append(incidence_count)
    revision_list.append(revision_count)
    incidence_count = incidence_count + df['Incidence'][ind]
    revision_count += 1

df['AcumIncidence'] = accum_incidence_list
df['AcumRevisions'] = revision_list

In [67]:
df

Unnamed: 0,PipeId,InspectionYear,InspectionDate,MonthsLastRev,Severity,Incidence,Province,YearBuilt,Material,GasType,Diameter,Length,Pressure,NumConnections,BoolBridle,Age,PrevSeverity,AcumIncidence,AcumRevisions
0,489616,2013,2013-05-08,16,0,0,Ciudad Real,2012,PE,Gas natural,-0.988359,-0.398361,-0.381338,0,False,1,0,0,0
1,489616,2015,2015-03-04,22,0,0,Ciudad Real,2012,PE,Gas natural,-0.988359,-0.398361,-0.381338,0,False,3,0,0,1
2,489616,2016,2017-01-12,22,0,0,Ciudad Real,2012,PE,Gas natural,-0.988359,-0.398361,-0.381338,0,False,4,0,0,2
3,489616,2018,2018-12-19,23,0,0,Ciudad Real,2012,PE,Gas natural,-0.988359,-0.398361,-0.381338,0,False,6,0,0,3
4,489616,2020,2020-12-05,24,0,0,Ciudad Real,2012,PE,Gas natural,-0.988359,-0.398361,-0.381338,0,False,8,0,0,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6345339,451995260,2019,2019-10-23,24,0,0,Lleida,2003,PE,Gas natural,-0.516448,0.426301,-0.419375,2,False,16,0,0,3
6345340,451995309,2013,2013-10-07,24,0,0,Lleida,1998,PE,Gas natural,1.406152,0.771957,-0.419375,0,False,15,0,0,0
6345341,451995309,2015,2015-10-05,24,0,0,Lleida,1998,PE,Gas natural,1.406152,0.771957,-0.419375,0,False,17,0,0,1
6345342,451995309,2017,2017-10-02,24,0,0,Lleida,1998,PE,Gas natural,1.406152,0.771957,-0.419375,0,False,19,0,0,2


One hot encoding the categorical values with pandas get dummies

In [68]:
df = pd.get_dummies(df)
df.head(4)

Unnamed: 0,PipeId,InspectionYear,InspectionDate,MonthsLastRev,Severity,Incidence,YearBuilt,Diameter,Length,Pressure,...,Material_FG,Material_FI,Material_FO,Material_PA,Material_PE,Material_PN,Material_PV,Material_ZD,GasType_Gas natural,GasType_Gas propano
0,489616,2013,2013-05-08,16,0,0,2012,-0.988359,-0.398361,-0.381338,...,0,0,0,0,1,0,0,0,1,0
1,489616,2015,2015-03-04,22,0,0,2012,-0.988359,-0.398361,-0.381338,...,0,0,0,0,1,0,0,0,1,0
2,489616,2016,2017-01-12,22,0,0,2012,-0.988359,-0.398361,-0.381338,...,0,0,0,0,1,0,0,0,1,0
3,489616,2018,2018-12-19,23,0,0,2012,-0.988359,-0.398361,-0.381338,...,0,0,0,0,1,0,0,0,1,0


The dataframe will be reindexed by means of the PipeId and Date

In [69]:
df = df.set_index(['PipeId', 'InspectionDate','Severity'])
df.info()

<class 'pandas.core.frame.DataFrame'>
MultiIndex: 6345344 entries, (489616, Timestamp('2013-05-08 00:00:00'), 0) to (451995309, Timestamp('2019-10-26 00:00:00'), 0)
Data columns (total 64 columns):
 #   Column                Dtype  
---  ------                -----  
 0   InspectionYear        int64  
 1   MonthsLastRev         int64  
 2   Incidence             int64  
 3   YearBuilt             int64  
 4   Diameter              float64
 5   Length                float64
 6   Pressure              float64
 7   NumConnections        int64  
 8   BoolBridle            bool   
 9   Age                   int64  
 10  PrevSeverity          int64  
 11  AcumIncidence         int64  
 12  AcumRevisions         int64  
 13  Province_Albacete     uint8  
 14  Province_Alicante     uint8  
 15  Province_Almeria      uint8  
 16  Province_Avila        uint8  
 17  Province_Barcelona    uint8  
 18  Province_Burgos       uint8  
 19  Province_Castellón    uint8  
 20  Province_Ciudad Real  uint8

Splitting in a train-test set

In [70]:
X_train, X_test, y_train, y_test = train_test_split(df.drop(['Incidence'],axis=1),df['Incidence'])

Balancing the dataframe as it is quite unbalanced

In [71]:
X_train['Incidence'] = y_train

In [72]:
# First we check how balanced is the dataset
Incidence = X_train[X_train.Incidence == 1]
not_Incidence = X_train[X_train.Incidence == 0]

# index property contains the indexes of the not_Incidence dataset
not_Incidence_indexes = not_Incidence.index

# get as many not_incidence indexes (randomly) as incidence occurrences
np.random.seed(42)
random_not_Incidence_indexes = np.random.choice(not_Incidence_indexes, len(Incidence))

# Keep those entries in not_Incidence
not_Incidence = X_train.loc[random_not_Incidence_indexes]

# We will form the balanced dataset concatenating Incidence and not_Incidence
balanced_df = Incidence.append(not_Incidence)
balanced_df.sort_index()
X_train = balanced_df

y_train = X_train['Incidence']
X_train = X_train.drop(['Incidence'],axis=1)

  balanced_df = Incidence.append(not_Incidence)


## Training and testing the model

BayesianNB Classification

In [74]:
forecaster = GaussianNB()
forecaster.fit(X_train, y_train)
Incidence_pred = forecaster.predict(X_test)
print(metrics.roc_auc_score(y_test,Incidence_pred))

0.6775409012278238


Support vector polinomial kernel

In [None]:
forecaster = SVC(kernel='poly')
forecaster.fit(X_train, y_train)
Incidence_pred = forecaster.predict(X_test)
print(metrics.roc_auc_score(y_test,Incidence_pred))

Logisitc regression

In [75]:
forecaster = LogisticRegression()
forecaster.fit(X_train, y_train)
Incidence_pred = forecaster.predict(X_test)
print(metrics.roc_auc_score(y_test,Incidence_pred))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7809286467766544


Random Forest classifier

In [80]:
forecaster_rf = RandomForestClassifier()
forecaster_rf.fit(X_train, y_train)
Incidence_pred = forecaster_rf.predict(X_test)
print(metrics.roc_auc_score(y_test,Incidence_pred))

0.8408659398519952


Gradient boosting classifier

In [81]:
forecaster_gb = GradientBoostingClassifier()
forecaster_gb.fit(X_train, y_train)
Incidence_pred = forecaster_gb.predict(X_test)
print(metrics.roc_auc_score(y_test,Incidence_pred))

0.8409459391909979


### Hyperparameter tuning

#### Random forest classifier

In [84]:
# Define parameters to test
grid_param = {
    'n_estimators' : [90, 100, 115, 130],
    'criterion' : ['gini', 'entropy'], 
    'max_depth' : range(2, 20, 5),
    'min_samples_leaf' : range(1,10,2),
    'min_samples_split' : range(2, 10, 2),
    'max_features' : ['auto', 'log2']
}

In [85]:
grid_search = GridSearchCV(estimator=forecaster_rf, param_grid=grid_param, cv=5, n_jobs=-1, verbose=3)
grid_search.fit(X_train, y_train)

# This takes around 11-12 mins to run...

Fitting 5 folds for each of 1280 candidates, totalling 6400 fits
[CV 5/5] END criterion=gini, max_depth=2, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=90;, score=0.806 total time=   0.3s
[CV 3/5] END criterion=gini, max_depth=2, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=90;, score=0.778 total time=   0.4s
[CV 2/5] END criterion=gini, max_depth=2, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=90;, score=0.793 total time=   0.4s
[CV 1/5] END criterion=gini, max_depth=2, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=90;, score=0.795 total time=   0.4s
[CV 2/5] END criterion=gini, max_depth=2, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.799 total time=   0.4s
[CV 4/5] END criterion=gini, max_depth=2, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=90;, score=0.782 total time=   0.4s
[CV 3/5] END criterion=gini,

GridSearchCV(cv=5, estimator=RandomForestClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': range(2, 20, 5),
                         'max_features': ['auto', 'log2'],
                         'min_samples_leaf': range(1, 10, 2),
                         'min_samples_split': range(2, 10, 2),
                         'n_estimators': [90, 100, 115, 130]},
             verbose=3)

In [86]:
# Get the best parameters from the cross validation test
grid_search.best_params_

{'criterion': 'gini',
 'max_depth': 17,
 'max_features': 'auto',
 'min_samples_leaf': 1,
 'min_samples_split': 4,
 'n_estimators': 115}

Re-test the model with the new parameters

In [89]:
forecaster_rf = RandomForestClassifier(criterion= 'gini',
                                       max_depth= 17,
                                       max_features= 'auto',
                                       min_samples_leaf= 1,
                                       min_samples_split= 4,
                                       n_estimators= 115)
forecaster_rf.fit(X_train, y_train)
Incidence_pred = forecaster_rf.predict(X_test)
print(metrics.roc_auc_score(y_test,Incidence_pred))

0.8462022214576173


#### Gradient boosting classifier

In [None]:
# Define parameters to test
grid_param = {

}

In [None]:
grid_search = GridSearchCV(estimator=forecaster_gb, param_grid=grid_param, cv=5, n_jobs=-1, verbose=3)
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 1280 candidates, totalling 6400 fits
[CV 5/5] END criterion=gini, max_depth=2, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=90;, score=0.806 total time=   0.3s
[CV 3/5] END criterion=gini, max_depth=2, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=90;, score=0.778 total time=   0.4s
[CV 2/5] END criterion=gini, max_depth=2, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=90;, score=0.793 total time=   0.4s
[CV 1/5] END criterion=gini, max_depth=2, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=90;, score=0.795 total time=   0.4s
[CV 2/5] END criterion=gini, max_depth=2, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=100;, score=0.799 total time=   0.4s
[CV 4/5] END criterion=gini, max_depth=2, max_features=auto, min_samples_leaf=1, min_samples_split=2, n_estimators=90;, score=0.782 total time=   0.4s
[CV 3/5] END criterion=gini,

In [None]:
# Get the best parameters from the cross validation test
grid_search.best_params_

Re-test the model with the new parameters

In [None]:
forecaster_gb = GradientBoostingClassifier()
forecaster_gb.fit(X_train, y_train)
Incidence_pred = forecaster_gb.predict(X_test)
print(metrics.roc_auc_score(y_test,Incidence_pred))

0.8409459391909979


## Creating prediction dataset

Merging with the PipeId from the Submission dataset

In [90]:
pipes = pipes.drop(['Incidence'],axis=1)
pred_X = network.merge(
                pipes,
                how='inner',
                on=['PipeId']
)
pred_X

Unnamed: 0,PipeId,Province,Town,YearBuilt,Material,GasType,Diameter,Length,Pressure,NumConnections,NumConnectionsUnder,BoolBridle
0,489780,Toledo,Añover de Tajo,2009,PE,Gas natural,110.0,7.485,5.0,0,0,False
1,489790,Toledo,Añover de Tajo,2012,PE,Gas natural,110.0,4.436,4.0,0,0,False
2,489792,Toledo,Añover de Tajo,2012,PE,Gas natural,110.0,11.769,4.0,1,0,False
3,489793,Toledo,Añover de Tajo,2012,PE,Gas natural,110.0,6.172,4.0,1,0,False
4,489981,Toledo,Añover de Tajo,2012,PE,Gas natural,110.0,49.031,4.0,0,0,False
...,...,...,...,...,...,...,...,...,...,...,...,...
909728,416223449,La Rioja,Cirueña,2011,AO,Gas natural,50.8,0.460,45.0,0,0,False
909729,416223471,La Rioja,Cirueña,2011,AO,Gas natural,50.8,0.230,45.0,0,0,False
909730,416425646,La Rioja,Nájera,2011,AO,Gas natural,254.0,1.999,45.0,0,0,False
909731,416425691,La Rioja,Nájera,2011,AO,Gas natural,254.0,3.000,45.0,0,0,False


Drop columns

In [91]:
pred_X = pred_X.drop(['Town', 'NumConnectionsUnder'],axis=1)

Standarization

In [92]:
for column in ['Diameter', 'Length', 'Pressure']:
   pred_X[column] = (pred_X[column] - pred_X[column].mean()) / pred_X[column].std()

pred_X.head()

Unnamed: 0,PipeId,Province,YearBuilt,Material,GasType,Diameter,Length,Pressure,NumConnections,BoolBridle
0,489780,Toledo,2009,PE,Gas natural,-0.111998,-0.421161,0.387956,0,False
1,489790,Toledo,2012,PE,Gas natural,-0.111998,-0.462561,0.216777,0,False
2,489792,Toledo,2012,PE,Gas natural,-0.111998,-0.362993,0.216777,1,False
3,489793,Toledo,2012,PE,Gas natural,-0.111998,-0.438989,0.216777,1,False
4,489981,Toledo,2012,PE,Gas natural,-0.111998,0.142952,0.216777,0,False


Obtaining the MonthLastRev column (we assume the next revision will be in 24 months from the last)

In [93]:
pred_X['MonthsLastRev'] = 24

With this MonthLastRev we estimate the inspection date

In [94]:
last_rev_df = df
last_rev_df.reset_index(inplace=True)
last_rev_df = last_rev_df.drop_duplicates(subset=['PipeId'], keep='last')
pred_X = pred_X.merge(last_rev_df[['PipeId', 'Severity', 'AcumRevisions', 'AcumIncidence','InspectionDate','Incidence']], on = 'PipeId', how = 'left')
pred_X.rename(columns = {'Severity':'PrevSeverity'}, inplace = True)

Inspection Date (estimated)

In [95]:
new_dates = []

for ind in pred_X.index:
    new_date = pred_X['InspectionDate'][ind]+pd.DateOffset(months=pred_X['MonthsLastRev'][ind])
    new_dates.append(new_date)

pred_X['InspectionDate'] = new_dates

Changing the NaN in Severity to 0, as some of them have never been revised. Same will happen with the accumulated values

In [96]:
pred_X['InspectionDate'] = pred_X['InspectionDate'].replace(pd.NaT, pd.to_datetime('2022-12-31'))
pred_X['PrevSeverity'] = pred_X['PrevSeverity'].replace(np.nan, 0)
pred_X['AcumRevisions'] = pred_X['AcumRevisions'].replace(np.nan, 0)
pred_X['AcumIncidence'] = pred_X['AcumIncidence'].replace(np.nan, 0)
pred_X['Incidence'] = pred_X['Incidence'].replace(np.nan, 0)

Age of the pipe

In [97]:
age_list = []

for ind in pred_X.index:
    age = int(pred_X['InspectionDate'][ind].year - pred_X['YearBuilt'][ind])
    age_list.append(age)

pred_X['Age'] = age_list

Accumulated Incidences

In [98]:
pred_X['AcumIncidence'] = pred_X['AcumIncidence'] + pred_X['Incidence']
pred_X = pred_X.drop(['Incidence'],axis=1)

Accumulated Revisions

In [99]:
pred_X['AcumRevisions'] = pred_X['AcumRevisions'] + 1

Inspection Year

In [100]:
pred_X['InspectionYear'] = pred_X['InspectionDate'].dt.year
pred_X['PrevSeverity'] = pred_X['PrevSeverity'].astype('int')
pred_X['AcumIncidence'] = pred_X['AcumIncidence'].astype('int')
pred_X['AcumRevisions'] = pred_X['AcumRevisions'].astype('int')


### Encoding

In [101]:
pred_X = pd.get_dummies(pred_X)
pred_X

Unnamed: 0,PipeId,YearBuilt,Diameter,Length,Pressure,NumConnections,BoolBridle,MonthsLastRev,PrevSeverity,AcumRevisions,...,Material_CU,Material_FD,Material_FG,Material_FI,Material_PA,Material_PE,Material_PN,Material_ZD,GasType_Gas natural,GasType_Gas propano
0,489780,2009,-0.111998,-0.421161,0.387956,0,False,24,0,5,...,0,0,0,0,0,1,0,0,1,0
1,489790,2012,-0.111998,-0.462561,0.216777,0,False,24,0,5,...,0,0,0,0,0,1,0,0,1,0
2,489792,2012,-0.111998,-0.362993,0.216777,1,False,24,0,5,...,0,0,0,0,0,1,0,0,1,0
3,489793,2012,-0.111998,-0.438989,0.216777,1,False,24,0,5,...,0,0,0,0,0,1,0,0,1,0
4,489981,2012,-0.111998,0.142952,0.216777,0,False,24,0,5,...,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
909728,416223449,2011,-1.138929,-0.516547,7.235145,0,False,24,0,1,...,0,0,0,0,0,0,0,0,1,0
909729,416223471,2011,-1.138929,-0.519670,7.235145,0,False,24,0,1,...,0,0,0,0,0,0,0,0,1,0
909730,416425646,2011,2.385941,-0.495651,7.235145,0,False,24,0,2,...,0,0,0,0,0,0,0,0,1,0
909731,416425691,2011,2.385941,-0.482059,7.235145,0,False,24,0,2,...,0,0,0,0,0,0,0,0,1,0


Indexing some columns

In [102]:
df

Unnamed: 0,PipeId,InspectionDate,Severity,InspectionYear,MonthsLastRev,Incidence,YearBuilt,Diameter,Length,Pressure,...,Material_FG,Material_FI,Material_FO,Material_PA,Material_PE,Material_PN,Material_PV,Material_ZD,GasType_Gas natural,GasType_Gas propano
0,489616,2013-05-08,0,2013,16,0,2012,-0.988359,-0.398361,-0.381338,...,0,0,0,0,1,0,0,0,1,0
1,489616,2015-03-04,0,2015,22,0,2012,-0.988359,-0.398361,-0.381338,...,0,0,0,0,1,0,0,0,1,0
2,489616,2017-01-12,0,2016,22,0,2012,-0.988359,-0.398361,-0.381338,...,0,0,0,0,1,0,0,0,1,0
3,489616,2018-12-19,0,2018,23,0,2012,-0.988359,-0.398361,-0.381338,...,0,0,0,0,1,0,0,0,1,0
4,489616,2020-12-05,0,2020,24,0,2012,-0.988359,-0.398361,-0.381338,...,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6345339,451995260,2019-10-23,0,2019,24,0,2003,-0.516448,0.426301,-0.419375,...,0,0,0,0,1,0,0,0,1,0
6345340,451995309,2013-10-07,0,2013,24,0,1998,1.406152,0.771957,-0.419375,...,0,0,0,0,1,0,0,0,1,0
6345341,451995309,2015-10-05,0,2015,24,0,1998,1.406152,0.771957,-0.419375,...,0,0,0,0,1,0,0,0,1,0
6345342,451995309,2017-10-02,0,2017,24,0,1998,1.406152,0.771957,-0.419375,...,0,0,0,0,1,0,0,0,1,0


In [103]:
df = df.set_index(['PipeId', 'InspectionDate','Severity'])
pred_X = pred_X.set_index(['PipeId', 'InspectionDate'])


Balance training dataframe

In [104]:
# First we check how balanced is the dataset
Incidence = df[df.Incidence == 1]
not_Incidence = df[df.Incidence == 0]

# index property contains the indexes of the not_Incidence dataset
not_Incidence_indexes = not_Incidence.index

# get as many not_incidence indexes (randomly) as incidence occurrences
np.random.seed(42)
random_not_Incidence_indexes = np.random.choice(not_Incidence_indexes, len(Incidence))

# Keep those entries in not_Incidence
not_Incidence = df.loc[random_not_Incidence_indexes]

# We will form the balanced dataset concatenating Incidence and not_Incidence
balanced_df = Incidence.append(not_Incidence)
balanced_df.sort_index()

  balanced_df = Incidence.append(not_Incidence)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,InspectionYear,MonthsLastRev,Incidence,YearBuilt,Diameter,Length,Pressure,NumConnections,BoolBridle,Age,...,Material_FG,Material_FI,Material_FO,Material_PA,Material_PE,Material_PN,Material_PV,Material_ZD,GasType_Gas natural,GasType_Gas propano
PipeId,InspectionDate,Severity,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
489981,2016-04-28,0,2016,22,0,2012,-0.166884,0.173283,0.166398,0,False,4,...,0,0,0,0,1,0,0,0,1,0
490620,2020-05-18,0,2020,24,0,2012,0.707025,-0.412112,1.079290,0,False,8,...,0,0,0,0,1,0,0,0,1,0
490635,2017-08-31,0,2017,24,0,2012,0.707025,-0.500785,-0.381338,0,False,5,...,0,0,0,0,1,0,0,0,1,0
497136,2019-01-25,1,2019,24,1,2008,0.707025,3.917794,-0.419375,11,False,11,...,0,0,0,0,1,0,0,0,1,0
497253,2017-01-10,0,2017,24,0,2005,-0.166884,0.481415,-0.419375,1,False,12,...,0,0,0,0,1,0,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
450861286,2020-03-12,0,2020,24,0,2012,0.707025,-0.483819,-0.419375,0,False,8,...,0,0,0,0,1,0,0,0,1,0
450872858,2011-12-21,1,2011,21,1,1991,0.574190,0.437210,0.166398,1,True,20,...,0,0,0,0,0,0,0,0,1,0
450873154,2012-03-05,1,2012,2,1,2012,-1.201592,-0.487603,0.166398,1,True,0,...,0,0,0,0,0,0,0,0,1,0
450873682,2011-10-13,1,2011,21,1,1998,-0.516448,-0.489017,0.166398,0,False,13,...,0,0,0,0,1,0,0,0,1,0


split train X and Y

In [105]:
train_X = balanced_df.drop(["Incidence"], axis = 1)
train_Y = balanced_df["Incidence"]

train_X = train_X.drop(["Material_FO", "Material_PV"], axis = 1)

In [106]:
train_X

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,InspectionYear,MonthsLastRev,YearBuilt,Diameter,Length,Pressure,NumConnections,BoolBridle,Age,PrevSeverity,...,Material_CU,Material_FD,Material_FG,Material_FI,Material_PA,Material_PE,Material_PN,Material_ZD,GasType_Gas natural,GasType_Gas propano
PipeId,InspectionDate,Severity,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1
497136,2019-01-25,1,2019,24,2008,0.707025,3.917794,-0.419375,11,False,11,0,...,0,0,0,0,0,1,0,0,1,0
510311,2012-06-21,1,2012,17,2011,-1.390357,0.790643,-0.419375,6,False,1,0,...,0,0,0,0,0,1,0,0,1,0
586278,2012-05-01,3,2012,19,1970,-1.201592,-0.157565,0.166398,0,True,42,0,...,0,0,0,0,0,0,0,0,1,0
634981,2011-02-16,1,2011,21,1999,-1.201592,0.554661,1.992183,0,False,12,0,...,0,0,0,0,0,0,0,0,1,0
644491,2013-07-30,1,2013,24,1989,-0.166884,0.040765,-0.438394,2,False,24,0,...,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
133067019,2016-04-08,0,2016,49,2001,1.406152,-0.492524,-0.419375,1,False,15,0,...,0,0,0,0,0,1,0,0,1,0
423217978,2015-09-24,0,2015,23,2011,-0.166884,-0.492842,-0.419375,0,False,4,0,...,0,0,0,0,0,1,0,0,1,0
300463684,2011-01-28,0,2011,24,2006,1.406152,0.498410,-0.419375,0,False,5,0,...,0,0,0,0,0,1,0,0,1,0
24303171,2015-05-28,0,2015,22,1996,-0.166884,0.512272,0.166398,0,False,19,0,...,0,0,0,0,0,1,0,0,1,0


Reorder prediction columns

In [107]:
pred_X = pred_X[train_X.columns]
pred_X

Unnamed: 0_level_0,Unnamed: 1_level_0,InspectionYear,MonthsLastRev,YearBuilt,Diameter,Length,Pressure,NumConnections,BoolBridle,Age,PrevSeverity,...,Material_CU,Material_FD,Material_FG,Material_FI,Material_PA,Material_PE,Material_PN,Material_ZD,GasType_Gas natural,GasType_Gas propano
PipeId,InspectionDate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
489780,2022-04-18,2022,24,2009,-0.111998,-0.421161,0.387956,0,False,13,0,...,0,0,0,0,0,1,0,0,1,0
489790,2022-04-18,2022,24,2012,-0.111998,-0.462561,0.216777,0,False,10,0,...,0,0,0,0,0,1,0,0,1,0
489792,2022-04-18,2022,24,2012,-0.111998,-0.362993,0.216777,1,False,10,0,...,0,0,0,0,0,1,0,0,1,0
489793,2022-04-18,2022,24,2012,-0.111998,-0.438989,0.216777,1,False,10,0,...,0,0,0,0,0,1,0,0,1,0
489981,2022-04-18,2022,24,2012,-0.111998,0.142952,0.216777,0,False,10,0,...,0,0,0,0,0,1,0,0,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
416223449,2022-12-31,2022,24,2011,-1.138929,-0.516547,7.235145,0,False,11,0,...,0,0,0,0,0,0,0,0,1,0
416223471,2022-12-31,2022,24,2011,-1.138929,-0.519670,7.235145,0,False,11,0,...,0,0,0,0,0,0,0,0,1,0
416425646,2019-08-10,2019,24,2011,2.385941,-0.495651,7.235145,0,False,8,0,...,0,0,0,0,0,0,0,0,1,0
416425691,2019-08-09,2019,24,2011,2.385941,-0.482059,7.235145,0,False,8,0,...,0,0,0,0,0,0,0,0,1,0


## Model again


In [108]:
forecaster2 = RandomForestClassifier(criterion= 'gini',
                                     max_depth= 17,
                                     max_features= 'auto',
                                     min_samples_leaf= 1,
                                     min_samples_split= 4,
                                     n_estimators= 115)
forecaster2.fit(train_X, train_Y)
Incidence_pred2 = forecaster2.predict(pred_X)

## Export

In [109]:
pred_X.reset_index(inplace=True)

In [110]:
export = pd.DataFrame()
export['PipeId'] = pred_X["PipeId"]
export['Incidence'] = Incidence_pred2
export

Unnamed: 0,PipeId,Incidence
0,489780,0
1,489790,0
2,489792,0
3,489793,0
4,489981,0
...,...,...
909728,416223449,0
909729,416223471,0
909730,416425646,0
909731,416425691,0


In [111]:
print(len(export[export.Incidence == 1]))
print(len(export[export.Incidence == 0]))

149913
759820


In [112]:
submission_set = pipes.merge(export[['PipeId', 'Incidence']], on = 'PipeId', how = 'left')
submission_set

Unnamed: 0,PipeId,Incidence
0,446859944,0
1,428124500,0
2,438428871,0
3,429034569,0
4,411184477,0
...,...,...
909728,235426649,0
909729,235426673,0
909730,235426707,0
909731,190956601,0


In [113]:
submission_set.to_csv('Asan-Dopollos.csv', index=False)