# Vcituvanje biblioteki


In [None]:
import numpy as np 
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt
import plotly.graph_objects as go
import plotly.express as px
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from imblearn.metrics import classification_report_imbalanced
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier

# Vcituvanje podatoci


In [None]:
# Read the five predictive-maintenance CSV tables from the Kaggle input folder.
telemetry, failures, errors, maint, machines = (
    pd.read_csv(csv_path)
    for csv_path in (
        '/kaggle/input/microsoft-azure-predictive-maintenance/PdM_telemetry.csv',
        '/kaggle/input/microsoft-azure-predictive-maintenance/PdM_failures.csv',
        '/kaggle/input/microsoft-azure-predictive-maintenance/PdM_errors.csv',
        '/kaggle/input/microsoft-azure-predictive-maintenance/PdM_maint.csv',
        '/kaggle/input/microsoft-azure-predictive-maintenance/PdM_machines.csv',
    )
)

In [None]:
# Print the first rows of the telemetry, failures, errors and machines tables.
for preview_frame in (telemetry, failures, errors, machines):
    print(preview_frame.head())


**Razgleduvame podatoci za 100 razlicni masini. Istite se smesteni vo podatocnite mnozestva PdM_telemetry, PdM_errors, PdM_failures, PdM_machines i PdM_maint.**

* Vo mnozestvoto PdM_telemetry imame podatoci za vibracii, pritisok, brzina na vrtenje i napon za sekoja od masinite. Podatocite se prosecni vrednosti na merenja na senzorite na eden cas vremenski period.

* Vo mnozestvoto PdM_failures imame podatok za sekoj defekt koj se slucil na masinite, koga se slucil i koja od komponentite otkazala.

* Vo mnozestvoto PdM_machines imame podatoci za model na masinite i za starost.

* Vo mnozestvoto PdM_errors ima podatoci za koga sekoja od masinite javila greska i za koja greska stanuva zbor.

* Vo mnozestvoto PdM_maint ima podatok za koga bil izvrsuvan servis na sekoja od masinite i koj del bil zamenet.

# Konverzija na kolonata 'datetime' od datatype 'string' vo datatype 'datetime'

In [None]:
# Tables that carry a 'datetime' column: parse it and order rows by time, then machine.
table = [telemetry, maint, failures, errors]
for frame in table:
    parsed_times = pd.to_datetime(frame["datetime"])
    frame["datetime"] = parsed_times
    frame.sort_values(by=["datetime", "machineID"], ignore_index=True, inplace=True)

Za sekoja tabela posebno kolonata 'datetime' ja konvertirame od string vo datetime format i gi sortirame redovite po 'datetime' i 'machineID'.

# EDA Telemetry

In [None]:
# Summary statistics for the columns of the telemetry table.
telemetry.describe()

Moze da zabelezime deka podatocite od site senzori se dvizat vo odredeni granici i nema nekoi znacitelni otstapuvanja, so sto moze da zaklucime deka podatocite se filtrirani.

In [None]:
# sns.distplot is deprecated (seaborn >= 0.11) and removed in later releases;
# a density-scaled histogram with a KDE overlay is its documented replacement.
sns.histplot(telemetry['volt'], kde=True, stat='density')

In [None]:
# One panel per sensor column. sns.distplot is deprecated (seaborn >= 0.11),
# so each panel uses histplot with a KDE overlay on a density scale instead.
fig, axes = plt.subplots(1, 4, figsize=(20, 5), sharey=False)
for ax, sensor in zip(axes, ['volt', 'rotate', 'pressure', 'vibration']):
    sns.histplot(telemetry[sensor], kde=True, stat='density', ax=ax)

Distribucijata na podatocite e spored gausova kriva -> nema potreba za obrabotka, podatocite se pogodni za treniranje na model.

In [None]:
# Line plot of the four sensor readings of machine 1 between 2015-01-01 and 2016-01-01 (both exclusive).
plot = go.Figure()
is_machine_1 = telemetry['machineID'] == 1
after_start = telemetry['datetime'] > pd.to_datetime('2015-01-01')
before_end = telemetry['datetime'] < pd.to_datetime('2016-01-01')
plot_telemetry = telemetry.loc[is_machine_1 & after_start & before_end,
                               ['datetime', 'volt', 'rotate', 'pressure', 'vibration']]

# One trace per sensor column, all sharing the same time axis.
for sensor_column, trace_label in (('volt', 'Voltage'),
                                   ('rotate', 'Rotation'),
                                   ('pressure', 'Pressure'),
                                   ('vibration', 'Vibration')):
    plot.add_trace(go.Scatter(x=plot_telemetry['datetime'].values,
                              y=plot_telemetry[sensor_column].values,
                              name=trace_label))

plot.update_layout()
plot.show(renderer='iframe')

# EDA errors


In [None]:
# Bar chart of the error IDs, coloured by error ID.
error_ids = errors['errorID'].values
plot = px.bar(x=error_ids, color=error_ids, template='none')
plot.show()

# EDA failure

In [None]:
# Bar chart of the failed components, coloured by component.
failed_components = failures['failure'].values
plot = px.bar(x=failed_components, color=failed_components, template='none')
plot.show()

# EDA Maint

In [None]:

# Bar chart of the serviced components, coloured by component.
serviced_components = maint['comp'].values
fig = px.bar(
    x=serviced_components,
    color=serviced_components,
    template='none',
)
fig.show()

# EDA machines

In [None]:
# Bar chart of the machine models, coloured by model.
machine_models = machines['model'].values
fig = px.bar(
    x=machine_models,
    color=machine_models,
    template='none',
)
fig.show()

In [None]:
# Count the errors per (machine, error type) and show them as stacked bars per machine.
error_cnt = (
    errors.groupby(["machineID", "errorID"])
    .size()
    .reset_index(name="error_val")
)
error_cnt_pivot = error_cnt.pivot(index="machineID", columns="errorID", values="error_val")
error_cnt_pivot.plot.bar(title="Count of Errors for All Machines", figsize=(20, 6), stacked=True)

In [None]:
def resample_feat_rule(df, features, rule, aggrs):
    """Aggregate feature columns per machine over fixed time bins.

    Every feature is pivoted into a ``datetime`` x ``machineID`` table
    (duplicate cells are averaged by ``pivot_table``'s default), grouped into
    ``rule``-sized bins that are closed on the left and labelled with their
    right edge, and reduced with each requested aggregation.

    Args:
        df: pandas.DataFrame holding ``datetime``, ``machineID`` and the
            feature columns.
        features: list of column names to aggregate.
        rule: resampling frequency string handed to ``resample`` (e.g. '3h').
        aggrs: list of aggregation names, each one of 'min', 'max', 'mean'
            or 'std'.

    Returns:
        pandas.DataFrame with the columns ``machineID`` and ``datetime``
        followed by one column per aggregation/feature pair, named
        ``<feature>_<aggr>_<rule in lower case>`` and ordered
        aggregation-major (all features for the first aggregation first).

    Raises:
        ValueError: if ``aggrs`` or ``features`` is empty, or if ``aggrs``
            contains an unsupported aggregation name.
    """
    allowed = ('min', 'max', 'mean', 'std')
    aggrs = list(aggrs)
    features = list(features)
    if any(aggr not in allowed for aggr in aggrs):
        raise ValueError("Invalid method name. Use 'min', 'max', 'mean', or 'std'.")
    if not aggrs or not features:
        raise ValueError("'features' and 'aggrs' must each contain at least one entry.")

    # Pivot and bin every feature once; the resampler is reused for each aggregation.
    resamplers = {
        col: pd.pivot_table(df,
                            index='datetime',
                            columns='machineID',
                            values=col).resample(rule, closed='left', label='right')
        for col in features
    }

    pieces = []
    columns = []
    for aggr in aggrs:
        for col in features:
            # unstack() turns the wide table into a (machineID, datetime)-indexed Series.
            pieces.append(getattr(resamplers[col], aggr)().unstack())
            columns.append(col + '_' + aggr + '_' + rule.lower())

    df_temp = pd.concat(pieces, axis=1)
    df_temp.columns = columns
    df_temp.reset_index(inplace=True)
    return df_temp

In [None]:
def roll_resample_feat_rule(df, features, window, rule, aggrs):
    """Compute rolling-window statistics per machine and sample them per time bin.

    Every feature is pivoted into a ``datetime`` x ``machineID`` table
    (duplicate cells are averaged by ``pivot_table``'s default), a trailing
    rolling aggregation over ``window`` is applied, and the result is grouped
    into ``rule``-sized bins (closed on the left, labelled with the right
    edge) from which ``.first()`` picks one value per bin.

    Args:
        df: pandas.DataFrame holding ``datetime``, ``machineID`` and the
            feature columns.
        features: list of column names to aggregate.
        window: int, window size handed to ``rolling``.
        rule: resampling frequency string handed to ``resample`` (e.g. '3h').
        aggrs: list of aggregation names, each one of 'min', 'max', 'mean',
            'std' or 'sum'.

    Returns:
        pandas.DataFrame with the columns ``machineID`` and ``datetime``
        followed by one column per aggregation/feature pair, named
        ``<feature>_<aggr>_<window>`` and ordered aggregation-major. Rows
        containing NaN are dropped; the index is NOT renumbered afterwards, so
        the frame can still be aligned by index with other results built from
        the same bins.

    Raises:
        ValueError: if ``aggrs`` or ``features`` is empty, or if ``aggrs``
            contains an unsupported aggregation name.
    """
    allowed = ('min', 'max', 'mean', 'std', 'sum')
    aggrs = list(aggrs)
    features = list(features)
    if any(aggr not in allowed for aggr in aggrs):
        raise ValueError("Invalid method name. Use 'min', 'max', 'mean', 'std', or 'sum'.")
    if not aggrs or not features:
        raise ValueError("'features' and 'aggrs' must each contain at least one entry.")

    # Pivot every feature once; the rolling object is reused for each aggregation.
    rollers = {
        col: pd.pivot_table(df,
                            index='datetime',
                            columns='machineID',
                            values=col).rolling(window=window, center=False)
        for col in features
    }

    pieces = []
    columns = []
    for aggr in aggrs:
        for col in features:
            rolled = getattr(rollers[col], aggr)()
            sampled = rolled.resample(rule, closed='left', label='right').first()
            # unstack() turns the wide table into a (machineID, datetime)-indexed Series.
            pieces.append(sampled.unstack())
            columns.append(col + '_' + aggr + '_' + str(window))

    df_temp = pd.concat(pieces, axis=1)
    df_temp.columns = columns
    df_temp.reset_index(inplace=True)
    # Bins without a complete rolling window hold NaN and are removed.
    df_temp.dropna(inplace=True)
    return df_temp

In [None]:
# Telemetry features: plain 3h-bin statistics next to rolling-window (24) statistics.
aggrs = ['min', 'max', 'std', 'mean']
features = ['volt', 'rotate', 'pressure', 'vibration']
rule = '3h'
window = 24

binned_stats = resample_feat_rule(telemetry, features, rule, aggrs)
# Skip the first two columns of the rolling result (machineID, datetime) - they are already in binned_stats.
rolling_stats = roll_resample_feat_rule(telemetry, features, window, rule, aggrs).iloc[:, 2:]
telemetry_lagfeat = pd.concat([binned_stats, rolling_stats], axis=1).dropna()

In [None]:
# Inspect the dtype of every column in the engineered telemetry feature table.
telemetry_lagfeat.dtypes

In [None]:
# Distributions of the rolling-window (24) volt features.
# Original author's note: volt distribution looks OK; rotate, vibration, pressure at 3h - TODO confirm intent.
# sns.distplot is deprecated (seaborn >= 0.11), so histplot with a KDE overlay
# on a density scale is used instead.
fig, axes = plt.subplots(1, 4, figsize=(20, 5), sharey=False)
for ax, feature in zip(axes, ['volt_min_24', 'volt_mean_24', 'volt_max_24', 'volt_std_24']):
    sns.histplot(telemetry_lagfeat[feature], kde=True, stat='density', ax=ax)

## Errors sreduvanje


In [None]:
# One-hot encode the error IDs, then add them up per machine and timestamp.
error_dummies = pd.get_dummies(errors.set_index('datetime')).reset_index()
error_dummies.columns = ['datetime', 'machineID'] + ['error%d' % n for n in range(1, 6)]
Error2 = (
    error_dummies
    .groupby(['machineID', 'datetime'])
    .sum()
    .reset_index()
)
Error2.head()

In [None]:
# Align the error counts with every telemetry (datetime, machineID) row; rows without errors become 0.
telemetry_keys = telemetry[['datetime', 'machineID']]
Error2 = pd.merge(telemetry_keys, Error2, how='left', on=['machineID', 'datetime'])
Error2 = Error2.fillna(0)
Error2.head()

In [None]:
# Rolling-window (24) sum of every error type, sampled on the 3h grid.
aggrs = ['sum']
features = [f'error{n}' for n in range(1, 6)]
rule = '3h'
window = 24
Error224 = roll_resample_feat_rule(Error2, features, window, rule, aggrs)

## Service

In [None]:
# One-hot encode the serviced component and add the flags up per machine and timestamp.
maint_dummies = pd.get_dummies(maint.set_index('datetime'), dtype=int)
maint_per_stamp = maint_dummies.groupby(['machineID', 'datetime']).sum().reset_index()

# Outer merge keeps both the telemetry timestamps and the maintenance records; missing flags become 0.
telemetry_keys = telemetry[['datetime', 'machineID']]
comp_dum = pd.merge(telemetry_keys, maint_per_stamp, how='outer', on=['datetime', 'machineID'])
comp_dum = comp_dum.fillna(0).sort_values(by=['machineID', 'datetime'])
comp_dum.columns = ['datetime', 'machineID', 'comp1', 'comp2', 'comp3', 'comp4']
comp_dum.head()

## Krajna tabela

In [None]:
# Join telemetry features with error counts, maintenance flags and machine metadata.
final = (
    telemetry_lagfeat
    .merge(Error224, how='left', on=['datetime', 'machineID'])
    .merge(comp_dum, how='left', on=['datetime', 'machineID'])
    .merge(machines, how='left', on=['machineID'])
)
final.describe()

In [None]:
# Attach the failure records; rows without a recorded failure get NaN in 'failure'.
final2 = final.merge(failures, on=['datetime', 'machineID'], how='left')
# Propagate each failure label back to the (up to) 7 rows preceding it.
# The back-fill is done per machine and only on the label column: the previous
# frame-wide fillna(method='bfill') (a deprecated call) also copied values from
# the next machine's first rows into the current machine's last rows.
final2['failure'] = final2.groupby('machineID')['failure'].bfill(limit=7)
# Rows that are still unlabelled mean "no failure".
final2['failure'] = final2['failure'].fillna('none').astype('str')
# Drop rows with missing feature values and renumber, so the index has no gaps
# for the cells below that address rows by position.
final2.dropna(inplace=True)
final2.reset_index(drop=True, inplace=True)

In [None]:
# Show the first 50 labelled rows of machine 1.
final2[final2['machineID']==1].head(50)

In [None]:
# failures.head(50)


## Train test Split

In [None]:
# Features: everything except the keys and the label; categorical columns are one-hot encoded.
X = final2.drop(labels=['datetime', 'machineID', 'failure'], axis=1)
X = pd.get_dummies(X, dtype=int)
# Target: integer class codes. An explicit map returns a new integer Series
# instead of mutating a column of final2 in place, and does not depend on
# replace() silently downcasting an object column (deprecated in pandas).
# astype(int) raises if a label outside the mapping is present.
failure_codes = {'none': 0, 'comp1': 1, 'comp2': 2, 'comp3': 3, 'comp4': 4}
y = final2['failure'].map(failure_codes).astype(int)

In [None]:
# X_val = X.tail(14000)
# X.drop(X.tail(14000).index,
#         inplace = True)
# y_val = y.tail(14000)
# y.drop(y.tail(14000).index,
#         inplace = True)
# # vo slucaj da treba da vadime podatoci za validacija 

In [None]:
# Random 55/45 train/test split with a fixed seed.
split_parts = train_test_split(X, y, test_size=0.45, random_state=200)
X_train, X_test, y_train, y_test = split_parts

# Model training XGBoost

In [None]:
# Gradient-boosted tree classifier: 50 shallow trees with a large learning rate.
xgb_params = {
    'n_estimators': 50,
    'learning_rate': 0.5,
    'max_depth': 3,
}
model = XGBClassifier(**xgb_params)

In [None]:
# Fit the XGBoost classifier on the training split.
model.fit(X_train,y_train)

In [None]:
# y_train.replace(['none', 'comp1', 'comp2', 'comp3', 'comp4'],[int(0), int(1), int(2), int(3), int(4)],inplace = True)
# y_test.replace(['none', 'comp1', 'comp2', 'comp3', 'comp4'],[int(0), int(1), int(2), int(3), int(4)],inplace = True)


In [None]:
# Predict the test split and print the imbalance-aware classification report.
yn_pred = model.predict(X_test)
xgb_report = classification_report_imbalanced(y_test, yn_pred)
print(xgb_report)

In [None]:
# Display the most recent array of predictions stored in yn_pred.
yn_pred

In [None]:
# X_val / y_val are only created by the optional hold-out cell, which is
# commented out in this notebook. Evaluate on them only when they exist, so
# that running all cells no longer stops here with a NameError.
if 'X_val' in globals() and 'y_val' in globals():
    yn_pred = model.predict(X_val)
    print(classification_report_imbalanced(y_val, yn_pred))
else:
    print('Validation split (X_val, y_val) is not defined - skipping the validation report.')

## Model Training Random Forest

In [None]:
# for i in range(len(y)):
#     if y[i]==0:
#         continue
#     else:
#         y[i]=1
# # vo slucaj da se raboti so dve klasi        

In [None]:
# Random forest: 150 entropy-split trees, depth-limited, with a fixed seed.
clf = RandomForestClassifier(
    criterion='entropy',
    n_estimators=150,
    max_depth=5,
    min_samples_split=5,
    min_samples_leaf=5,
    random_state=100,
)

In [None]:
# Fit the current clf estimator on the training split.
clf.fit(X_train,y_train)

In [None]:
# Predict the test split and print the imbalance-aware classification report.
y_pred = clf.predict(X_test)
forest_report = classification_report_imbalanced(y_test, y_pred)
print(forest_report)

Za klasite 1 i 3 modelot ne predviduva nitu edna vrednost -> ke probame so dve kategorii: dali ke ima otkaz na nekoja komponenta ili ne.

In [None]:
# y_pred = clf.predict(X_val)
# print(classification_report_imbalanced(y_val,y_pred))

## Model Training SVM

Linear-SVC

In [None]:
# Standardise the features, then train a linear SVM (original note: tol = 1e-5 worked best).
linear_svm = LinearSVC(random_state=0, tol=1e-7, max_iter=30000)
clf = make_pipeline(StandardScaler(), linear_svm)
clf.fit(X_train, y_train)

In [None]:
# Predict the labels of the test split with the current clf pipeline.
y_pred = clf.predict(X_test)

In [None]:
# Print the imbalance-aware classification report for the predictions in y_pred.
print(classification_report_imbalanced(y_test,y_pred))

In [None]:
# clf = 1

C-SVC

In [None]:
# Standardise the features, then train a kernel SVM with default settings and a fixed seed.
kernel_svm = SVC(random_state=0)
clf = make_pipeline(StandardScaler(), kernel_svm)
clf.fit(X_train, y_train)

In [None]:
# Predict the test split and print the imbalance-aware classification report.
y_pred = clf.predict(X_test)
svc_report = classification_report_imbalanced(y_test, y_pred)
print(svc_report)

## Two classes

In [None]:
# Two-class setup: 0 stays "no failure", every component code becomes 1.
# Vectorised on purpose: the previous loop used y[i] with i from range(len(y)),
# which is label-based on a Series and raises KeyError (or skips rows) as soon
# as the index has gaps; it was also a slow per-element Python loop.
y = (y != 0).astype(int)

In [None]:
# Random 55/45 train/test split of the two-class target with a fixed seed.
split_parts = train_test_split(X, y, test_size=0.45, random_state=200)
X_train, X_test, y_train, y_test = split_parts

## XGB Classifier

In [None]:
# Same XGBoost configuration as before, trained on the two-class target.
xgb_params = {
    'n_estimators': 50,
    'learning_rate': 0.5,
    'max_depth': 3,
}
model = XGBClassifier(**xgb_params)
model.fit(X_train, y_train)

In [None]:
# Predict the test split and print the imbalance-aware classification report.
y_pred = model.predict(X_test)
xgb_binary_report = classification_report_imbalanced(y_test, y_pred)
print(xgb_binary_report)


## Linear SVC

In [None]:
# Standardise the features, then train a linear SVM on the two-class target
# (original note: tol = 1e-5 worked best).
linear_svm = LinearSVC(random_state=0, tol=1e-7, max_iter=30000)
clf = make_pipeline(StandardScaler(), linear_svm)
clf.fit(X_train, y_train)

In [None]:
# Predict the test split and print the imbalance-aware classification report.
y_pred = clf.predict(X_test)
linear_svc_report = classification_report_imbalanced(y_test, y_pred)
print(linear_svc_report)

## C SVC

In [None]:
# Standardise the features, then train a kernel SVM on the two-class target
# (original note: tol = 1e-5 worked best).
kernel_svm = SVC(random_state=0, tol=1e-7, max_iter=30000)
clf = make_pipeline(StandardScaler(), kernel_svm)
clf.fit(X_train, y_train)

In [None]:
# Predict the test split and print the imbalance-aware classification report.
y_pred = clf.predict(X_test)
svc_binary_report = classification_report_imbalanced(y_test, y_pred)
print(svc_binary_report)

# Zaklucok

Site algoritmi dobro go predviduvaat otkazot na sekoja od masinite. Najdobri rezultati dava XGBClassifier, i vo slucaj koga razgleduvame koja komponenta ke otkaze i vo slucaj koga predviduvame samo dali ke ima otkaz ili ne.


Idejata e da go predvidime otkazot na nekoja od masinite 24 casa pred navistina da nastane istiot, so cel da moze da se intervenira navremeno, da se planira zamenata i da se izbegne nenadeen zastoj.