In [1]:
import pandas as pd
pd.set_option("display.max_columns", 100)
import numpy as np


import matplotlib.pyplot as plt 
%matplotlib inline

import seaborn as sns

from sklearn.linear_model import LinearRegression, ElasticNet, Ridge, Lasso, LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier
from sklearn.metrics import r2_score, mean_squared_error, mean_absolute_error
from sklearn.tree import ExtraTreeRegressor, DecisionTreeClassifier

import warnings 
warnings.filterwarnings("ignore")

from sklearn.naive_bayes import GaussianNB
from sklearn.naive_bayes import BernoulliNB
from sklearn.neighbors import KNeighborsClassifier

from sklearn.metrics import accuracy_score,precision_score,recall_score,f1_score
from sklearn.metrics import confusion_matrix, classification_report

In [2]:
# Both ensembles draw random numbers while fitting; a fixed seed makes a
# "Restart & Run All" reproduce the same models. 42 is an arbitrary seed,
# not a tuned value.
gbc = GradientBoostingClassifier(random_state=42)
r = RandomForestClassifier(random_state=42)

In [3]:
# Keep each report label next to its estimator, then derive the two parallel
# lists: names[i] is the label for algorithms[i].
_labelled_models = [("GradientBoosting", gbc), ("RandomForest", r)]
names = [label for label, _ in _labelled_models]
algorithms = [model for _, model in _labelled_models]

In [4]:
def algo_test(X, y, algorithms=algorithms, names=names, average="auto"):
    """Fit each classifier on ``(X, y)`` and tabulate its scores on that same data.

    Parameters
    ----------
    X : array-like
        Feature matrix passed to every estimator's ``fit`` and ``predict``.
    y : array-like
        Target labels passed to ``fit`` and used as the reference for scoring.
    algorithms : sequence
        Estimators exposing ``fit(X, y)`` (returning the fitted estimator) and
        ``predict(X)``. They are fitted in place.
    names : sequence of str
        Row labels for the report; ``names[i]`` labels ``algorithms[i]``.
    average : str or None, default "auto"
        Averaging mode forwarded to ``precision_score``, ``recall_score`` and
        ``f1_score``. ``"auto"`` picks ``"binary"`` when ``y`` holds at most two
        distinct labels (the metric functions' own default) and ``"macro"``
        otherwise, because ``"binary"`` raises on a multiclass target.

    Returns
    -------
    pandas.DataFrame
        One row per entry in ``names`` with columns ``Accuracy``,
        ``Precision``, ``Recall`` and ``F1``, sorted by ``F1`` descending.

    Raises
    ------
    ValueError
        If ``algorithms`` and ``names`` differ in length.

    Notes
    -----
    Predictions are made on the data the models were fitted on, so the scores
    describe training fit, not performance on unseen data.
    """
    if len(algorithms) != len(names):
        # Fail before any (potentially slow) fitting instead of at table assembly.
        raise ValueError(
            "algorithms and names must have the same length "
            f"(got {len(algorithms)} and {len(names)})"
        )

    if average == "auto":
        n_classes = np.unique(np.ravel(y)).size
        average = "binary" if n_classes <= 2 else "macro"

    rows = []
    for model in algorithms:
        # Predict once per model and reuse the result for all four metrics.
        predictions = model.fit(X, y).predict(X)
        rows.append({
            "Accuracy": accuracy_score(y, predictions),
            "Precision": precision_score(y, predictions, average=average),
            "Recall": recall_score(y, predictions, average=average),
            "F1": f1_score(y, predictions, average=average),
        })

    metrics = pd.DataFrame(
        rows,
        columns=["Accuracy", "Precision", "Recall", "F1"],
        index=names,
    )
    return metrics.sort_values("F1", ascending=False)

In [5]:
# Load the five raw tables in one step (order matters: it fixes df1..df5).
df1, df2, df3, df4, df5 = (
    pd.read_csv(csv_name)
    for csv_name in (
        "event_type.csv",
        "log_feature.csv",
        "resource_type.csv",
        "severity_type.csv",
        "train.csv",
    )
)
Frames = [df1, df2, df3, df4, df5]

# Row-wise stack of all tables. Their columns differ, so every row is NaN in
# the columns that belong to the other tables, and the original row labels
# are kept (they repeat across tables).
df = pd.concat(Frames)

# Join all tables on "id" with merge's default (inner) join. An id that has
# several rows in one of the tables yields one output row per combination.
dff = (
    df1
    .merge(df2, on="id")
    .merge(df3, on="id")
    .merge(df4, on="id")
    .merge(df5, on="id")
)

In [13]:
dff

Unnamed: 0,id,event_type,log_feature,volume,resource_type,severity_type,location,fault_severity
0,8011,event_type 15,feature 68,7,resource_type 8,severity_type 2,location 1,0
1,2588,event_type 15,feature 82,9,resource_type 8,severity_type 1,location 1,0
2,2588,event_type 15,feature 201,5,resource_type 8,severity_type 1,location 1,0
3,2588,event_type 15,feature 80,15,resource_type 8,severity_type 1,location 1,0
4,2588,event_type 15,feature 203,5,resource_type 8,severity_type 1,location 1,0
...,...,...,...,...,...,...,...,...
61834,8114,event_type 11,feature 54,1,resource_type 8,severity_type 2,location 999,0
61835,8114,event_type 11,feature 87,3,resource_type 8,severity_type 2,location 999,0
61836,878,event_type 11,feature 62,1,resource_type 8,severity_type 2,location 999,0
61837,4464,event_type 11,feature 209,1,resource_type 8,severity_type 1,location 999,0


In [14]:
df

Unnamed: 0,id,event_type,log_feature,volume,resource_type,severity_type,location,fault_severity
0,6597,event_type 11,,,,,,
1,8011,event_type 15,,,,,,
2,2597,event_type 15,,,,,,
3,5022,event_type 15,,,,,,
4,5022,event_type 11,,,,,,
...,...,...,...,...,...,...,...,...
7376,870,,,,,,location 167,0.0
7377,18068,,,,,,location 106,0.0
7378,14111,,,,,,location 1086,2.0
7379,15189,,,,,,location 7,0.0


In [15]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 136850 entries, 0 to 7380
Data columns (total 8 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   id              136850 non-null  int64  
 1   event_type      31170 non-null   object 
 2   log_feature     58671 non-null   object 
 3   volume          58671 non-null   float64
 4   resource_type   21076 non-null   object 
 5   severity_type   18552 non-null   object 
 6   location        7381 non-null    object 
 7   fault_severity  7381 non-null    float64
dtypes: float64(2), int64(1), object(5)
memory usage: 9.4+ MB


In [16]:
df1

Unnamed: 0,id,event_type
0,6597,event_type 11
1,8011,event_type 15
2,2597,event_type 15
3,5022,event_type 15
4,5022,event_type 11
...,...,...
31165,3761,event_type 11
31166,8720,event_type 11
31167,6488,event_type 11
31168,878,event_type 11


In [17]:
df1.nunique()

id            18552
event_type       53
dtype: int64

In [18]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31170 entries, 0 to 31169
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   id          31170 non-null  int64 
 1   event_type  31170 non-null  object
dtypes: int64(1), object(1)
memory usage: 487.2+ KB


In [19]:
# Keep only the numeric code from labels such as "event_type 11".
# The pattern is a raw string because "\d" inside a normal literal is an
# invalid escape sequence; expand=False returns a Series, which is the shape
# a single-column assignment expects.
df["event_type"] = df["event_type"].str.extract(r'(\d+)', expand=False)

In [20]:
df2

Unnamed: 0,id,log_feature,volume
0,6597,feature 68,6
1,8011,feature 68,7
2,2597,feature 68,1
3,5022,feature 172,2
4,5022,feature 56,1
...,...,...,...
58666,8720,feature 209,1
58667,6488,feature 54,3
58668,878,feature 62,1
58669,4464,feature 209,1


In [21]:
df2.nunique()

id             18552
log_feature      386
volume           341
dtype: int64

In [22]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 58671 entries, 0 to 58670
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype 
---  ------       --------------  ----- 
 0   id           58671 non-null  int64 
 1   log_feature  58671 non-null  object
 2   volume       58671 non-null  int64 
dtypes: int64(2), object(1)
memory usage: 1.3+ MB


In [23]:
df3

Unnamed: 0,id,resource_type
0,6597,resource_type 8
1,8011,resource_type 8
2,2597,resource_type 8
3,5022,resource_type 8
4,6852,resource_type 8
...,...,...
21071,3761,resource_type 8
21072,8720,resource_type 8
21073,6488,resource_type 8
21074,878,resource_type 8


In [24]:
df3.nunique()

id               18552
resource_type       10
dtype: int64

In [25]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 21076 entries, 0 to 21075
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             21076 non-null  int64 
 1   resource_type  21076 non-null  object
dtypes: int64(1), object(1)
memory usage: 329.4+ KB


In [26]:
df4

Unnamed: 0,id,severity_type
0,6597,severity_type 2
1,8011,severity_type 2
2,2597,severity_type 2
3,5022,severity_type 1
4,6852,severity_type 1
...,...,...
18547,3761,severity_type 1
18548,8720,severity_type 1
18549,6488,severity_type 2
18550,878,severity_type 2


In [27]:
df4.nunique()

id               18552
severity_type        5
dtype: int64

In [28]:
df4.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 18552 entries, 0 to 18551
Data columns (total 2 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   id             18552 non-null  int64 
 1   severity_type  18552 non-null  object
dtypes: int64(1), object(1)
memory usage: 290.0+ KB


In [29]:
df5

Unnamed: 0,id,location,fault_severity
0,14121,location 118,1
1,9320,location 91,0
2,14394,location 152,1
3,8218,location 931,1
4,14804,location 120,0
...,...,...,...
7376,870,location 167,0
7377,18068,location 106,0
7378,14111,location 1086,2
7379,15189,location 7,0


In [30]:
df5.nunique()

id                7381
location           929
fault_severity       3
dtype: int64

In [31]:
df5.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7381 entries, 0 to 7380
Data columns (total 3 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   id              7381 non-null   int64 
 1   location        7381 non-null   object
 2   fault_severity  7381 non-null   int64 
dtypes: int64(2), object(1)
memory usage: 173.1+ KB


In [32]:
dff

Unnamed: 0,id,event_type,log_feature,volume,resource_type,severity_type,location,fault_severity
0,8011,event_type 15,feature 68,7,resource_type 8,severity_type 2,location 1,0
1,2588,event_type 15,feature 82,9,resource_type 8,severity_type 1,location 1,0
2,2588,event_type 15,feature 201,5,resource_type 8,severity_type 1,location 1,0
3,2588,event_type 15,feature 80,15,resource_type 8,severity_type 1,location 1,0
4,2588,event_type 15,feature 203,5,resource_type 8,severity_type 1,location 1,0
...,...,...,...,...,...,...,...,...
61834,8114,event_type 11,feature 54,1,resource_type 8,severity_type 2,location 999,0
61835,8114,event_type 11,feature 87,3,resource_type 8,severity_type 2,location 999,0
61836,878,event_type 11,feature 62,1,resource_type 8,severity_type 2,location 999,0
61837,4464,event_type 11,feature 209,1,resource_type 8,severity_type 1,location 999,0


In [33]:
# These columns hold labels such as "event_type 15" or "location 1"; keep the
# number and store it as an integer.
# - astype(str) makes the step safe to re-run once the columns are already
#   integers (the .str accessor is not available on an integer column).
# - The pattern is a raw string because "\d" inside a normal literal is an
#   invalid escape sequence.
# - expand=False returns a Series instead of a one-column DataFrame.
for _column in ["event_type", "log_feature", "resource_type", "severity_type", "location"]:
    dff[_column] = (
        dff[_column]
        .astype(str)
        .str.extract(r"(\d+)", expand=False)
        .astype(int)
    )

In [44]:
dff.head()

Unnamed: 0,id,event_type,log_feature,volume,resource_type,severity_type,location,fault_severity
0,8011,15,68,7,8,2,1,0
1,2588,15,82,9,8,1,1,0
2,2588,15,201,5,8,1,1,0
3,2588,15,80,15,8,1,1,0
4,2588,15,203,5,8,1,1,0


In [45]:
# Rank every column by the magnitude of its correlation with fault_severity.
dff.corr()["fault_severity"].abs().sort_values(ascending=False)

fault_severity    1.000000
severity_type     0.325157
resource_type     0.285475
location          0.270390
event_type        0.262184
log_feature       0.208400
id                0.045528
volume            0.027196
Name: fault_severity, dtype: float64

In [46]:
# Feature frame: every column of dff except the target.
x = dff.drop(columns=["fault_severity"])

In [47]:
x.head()

Unnamed: 0,id,event_type,log_feature,volume,resource_type,severity_type,location
0,8011,15,68,7,8,2,1
1,2588,15,82,9,8,1,1
2,2588,15,201,5,8,1,1
3,2588,15,80,15,8,1,1
4,2588,15,203,5,8,1,1


In [48]:
# Target kept as a one-column DataFrame (not a Series).
y = dff.loc[:, ["fault_severity"]]

In [49]:
# Cast "id" to a categorical so get_dummies expands it into id_* indicator
# columns (the first category is dropped); the numeric columns pass through.
# NOTE(review): this creates one column per distinct id and removes the plain
# "id" column from x — confirm that encoding a row identifier as features is
# intended.
x = pd.get_dummies(x.astype({"id": "category"}), drop_first = True)

In [51]:
x.head()

Unnamed: 0,event_type,log_feature,volume,resource_type,severity_type,location,id_5,id_6,id_8,id_13,id_19,id_20,id_23,id_24,id_26,id_27,id_28,id_29,id_31,id_33,id_38,id_43,id_44,id_49,id_53,id_54,id_55,id_57,id_59,id_60,id_61,id_64,id_67,id_68,id_69,id_75,id_79,id_80,id_81,id_83,id_84,id_88,id_89,id_90,id_91,id_92,id_93,id_94,id_99,id_104,...,id_18414,id_18415,id_18416,id_18419,id_18426,id_18428,id_18440,id_18441,id_18442,id_18443,id_18444,id_18448,id_18449,id_18451,id_18452,id_18462,id_18463,id_18466,id_18467,id_18472,id_18473,id_18474,id_18480,id_18482,id_18484,id_18490,id_18492,id_18493,id_18498,id_18503,id_18506,id_18507,id_18508,id_18511,id_18520,id_18524,id_18525,id_18527,id_18528,id_18530,id_18533,id_18535,id_18536,id_18537,id_18538,id_18539,id_18542,id_18543,id_18548,id_18550
0,15,68,7,8,2,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,15,82,9,8,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,15,201,5,8,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,15,80,15,8,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,15,203,5,8,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [None]:
# y is a one-column DataFrame; flatten it to the 1-D target scikit-learn
# expects instead of relying on an implicit conversion (which emits a
# DataConversionWarning).
gbcmodel = gbc.fit(x, np.ravel(y))

In [None]:
# Class predictions for the rows of `x` — the same frame the model was fitted
# on, so these are in-sample predictions.
y_pred = gbcmodel.predict(x)

In [None]:
# Class-probability estimates for the same rows of `x`, one column per class.
# NOTE(review): column j is presumably the probability of gbcmodel.classes_[j]
# — confirm against the scikit-learn documentation.
y_pred_gbc = gbcmodel.predict_proba(x)

In [None]:
# One row per input row: the id, the predicted class and the three class
# probabilities.
# The id comes from `dff`, not `x`: pd.get_dummies replaced x's "id" column
# with id_* indicator columns, so `x.id` raises AttributeError. `x` was built
# from `dff` without dropping or reordering rows, so the rows line up.
result = pd.DataFrame(
    {
        "id": dff["id"],
        "Predicted fault_severity": y_pred,
        "prediction_probability_0": y_pred_gbc[:, 0],
        "prediction_probability_1": y_pred_gbc[:, 1],
        "prediction_probability_2": y_pred_gbc[:, 2],
    },
    columns=[
        "id",
        "Predicted fault_severity",
        "prediction_probability_0",
        "prediction_probability_1",
        "prediction_probability_2",
    ],
)

In [None]:
result.head()