## LIBRARIES

In [1]:
import pandas as pd
import os
from os import listdir
from os.path import join
import numpy as np
import plotly.graph_objects as go
import plotly.offline as poff
import scipy.io
import scipy.signal
import scipy.io.wavfile
import math
import matplotlib.pyplot as plt
from  matplotlib import pyplot
import datetime
from dateutil.relativedelta import *
from sklearn.model_selection import train_test_split
import seaborn as sns 
from sklearn.linear_model import LogisticRegression
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from scipy.spatial import distance_matrix
from past.builtins import xrange
from sklearn import preprocessing
from scipy.stats import wasserstein_distance
import warnings
from matplotlib.backends.backend_pdf import PdfPages

## DEFINE 2 DATASETS WITH DATA BEFORE AND DURING BEARING DAMAGE

In [2]:
# Load every per-wheel condition export and slice it into two windows measured
# back from that wheel's most recent timestamp:
#   df_dic_a - the final 30 days
#   df_dic_b - everything older than 90 days before the most recent timestamp
# Samples between 30 and 90 days before the most recent timestamp end up in
# neither window.
df_dic_a ={}
df_dic_b ={}
df_dic ={}
data_dir = r'S:\Analytics\TemporaryThings\Bearing_lenght_of_damage\condition_data'
sensor_columns = ['UTC Time Date','BHI','WHI','Speed','RMS X','RMS Y','RMS Z','Peak X','Peak Y','Peak Z']

# sorted(): os.listdir returns entries in arbitrary order, which would make the
# row order of everything built from these dicts differ between runs.
for file_name in sorted(os.listdir(data_dir)):
    # Skip anything that is not a CSV export (sub-folders, Thumbs.db, ...);
    # pd.read_csv would otherwise fail on it.
    if not file_name.lower().endswith('.csv'):
        continue
    # Dictionary key: the part of the file name between 'Export_' and '.csv'.
    wheel_id = file_name.split('Export_')[-1].split('.csv')[0]
    wheel_data = pd.read_csv(join(data_dir, file_name), index_col=False,
                             parse_dates=['UTC Time Date'], usecols=sensor_columns)
    last_seen = wheel_data['UTC Time Date'].max()
    df_dic[wheel_id] = wheel_data
    df_dic_a[wheel_id] = wheel_data.loc[wheel_data['UTC Time Date'] >= last_seen - datetime.timedelta(days=30)]
    df_dic_b[wheel_id] = wheel_data.loc[wheel_data['UTC Time Date'] < last_seen - datetime.timedelta(days=90)]
    
    


In [3]:
# Stack the per-wheel frames into one table per window and keep only samples
# recorded above speed 60 with strictly positive peak and RMS readings.
positive_cols = ['Peak X', 'Peak Y', 'Peak Z', 'RMS X', 'RMS Y', 'RMS Z']

filtered_windows = []
for per_wheel_frames in (df_dic_a, df_dic_b):
    # The dictionary key becomes index level 0 after concat; expose it as the
    # 'Wheel' column and discard the per-file row number (level 1).
    stacked = pd.concat(per_wheel_frames).reset_index()
    stacked = stacked.rename(columns={'level_0': 'Wheel'}).drop(['level_1'], axis=1)
    keep = (stacked['Speed'] > 60) & (stacked[positive_cols] > 0).all(axis=1)
    filtered_windows.append(stacked.loc[keep])

df_a, df_b = filtered_windows

## DEFINE TRAIN AND TEST SET FOR DATAFRAME WITH DATA BEFORE AND DURING BEARING DAMAGE

In [4]:
# Define train and test set for the df_a window.
# The split is done on wheel ids rather than on rows, so all samples of one
# wheel stay on the same side. The ids are sorted and random_state is fixed so
# that a fresh Restart & Run All reproduces the same partition.
train_a, test_a = train_test_split(np.sort(df_a['Wheel'].unique()), test_size=0.2, random_state=42)

# Group once instead of scanning df_a with a boolean mask for every wheel.
rows_by_wheel_a = dict(tuple(df_a.groupby('Wheel')))
df_train_l_a = {wheel: rows_by_wheel_a[wheel] for wheel in train_a}
df_test_l_a = {wheel: rows_by_wheel_a[wheel] for wheel in test_a}

# The wheel id is '_'-separated; expand it into its four components and put
# them in front of the measurement columns (the combined 'Wheel' id is dropped).
wheel_key_parts = ['Customer ID','Unit','Vehicle','Wheel']

df_train_s_a = pd.concat(list(df_train_l_a.values()), ignore_index=True)
df_train_s_a = pd.concat(
    [pd.DataFrame(df_train_s_a['Wheel'].str.split('_').tolist(), columns=wheel_key_parts),
     df_train_s_a.drop(columns='Wheel')],
    axis=1, sort=False)

df_test_s_a = pd.concat(list(df_test_l_a.values()), ignore_index=True)
df_test_s_a = pd.concat(
    [pd.DataFrame(df_test_s_a['Wheel'].str.split('_').tolist(), columns=wheel_key_parts),
     df_test_s_a.drop(columns='Wheel')],
    axis=1, sort=False)

In [5]:
# Define train and test set for the df_b window.
# The split is done on wheel ids rather than on rows, so all samples of one
# wheel stay on the same side. The ids are sorted and random_state is fixed so
# that a fresh Restart & Run All reproduces the same partition.
# NOTE(review): this split is drawn independently of the df_a split, so a wheel
# can be a training wheel in one window and a test wheel in the other - confirm
# that this is intended.
train_b, test_b = train_test_split(np.sort(df_b['Wheel'].unique()), test_size=0.2, random_state=42)

# Group once instead of scanning df_b with a boolean mask for every wheel.
rows_by_wheel_b = dict(tuple(df_b.groupby('Wheel')))
df_train_l_b = {wheel: rows_by_wheel_b[wheel] for wheel in train_b}
df_test_l_b = {wheel: rows_by_wheel_b[wheel] for wheel in test_b}

# The wheel id is '_'-separated; expand it into its four components and put
# them in front of the measurement columns (the combined 'Wheel' id is dropped).
wheel_key_parts = ['Customer ID','Unit','Vehicle','Wheel']

df_train_s_b = pd.concat(list(df_train_l_b.values()), ignore_index=True)
df_train_s_b = pd.concat(
    [pd.DataFrame(df_train_s_b['Wheel'].str.split('_').tolist(), columns=wheel_key_parts),
     df_train_s_b.drop(columns='Wheel')],
    axis=1, sort=False)

df_test_s_b = pd.concat(list(df_test_l_b.values()), ignore_index=True)
df_test_s_b = pd.concat(
    [pd.DataFrame(df_test_s_b['Wheel'].str.split('_').tolist(), columns=wheel_key_parts),
     df_test_s_b.drop(columns='Wheel')],
    axis=1, sort=False)

## ADD A COLUMN WITH LEVEL OF DAMAGE (CATEGORY)

In [6]:
# Measured damage lengths; rows with any missing value are discarded.
# 'Wheel' is read as text like the other identifier columns: it is used as a
# merge key against wheel labels parsed from file names (always strings), and a
# numeric-looking label would otherwise be parsed as a number and fail to match.
df_len = pd.read_csv(
    r'S:\Analytics\TemporaryThings\Bearing_lenght_of_damage\length_of_bearing_damage\length_of_damage.csv',
    dtype={'Fleet ID': str, 'Unit': str, 'Vehicle': str, 'Wheel': str},
    usecols=['Fleet ID','Confirmation date','Unit','Vehicle','Wheel','Damage length(mm)'],
    parse_dates=['Confirmation date'],
).dropna()

# Damage category from the damage length:
#   0: length <= 30,  1: 30 < length <= 80,  2: length > 80
# Stored as float, which is what assigning the codes through .loc produced.
df_len['Damage Cat'] = pd.cut(
    df_len['Damage length(mm)'],
    bins=[-np.inf, 30, 80, np.inf],
    labels=False,
).astype(float)

In [7]:
# Attach the damage information to the df_a train/test samples.
# The join uses Unit and Wheel only; the vehicle label parsed from the file
# name ('Vehicle_x') is discarded in favour of the one in df_len ('Vehicle_y').
# NOTE(review): neither the fleet nor the vehicle is part of the join key, so
# units sharing a label across fleets would be cross-matched - confirm that
# Unit + Wheel is unique in df_len.
id_parts = ['Customer ID', 'Unit', 'Vehicle', 'Wheel']

labelled_a = []
for sensor_rows in (df_train_s_a, df_test_s_a):
    joined = pd.merge(sensor_rows, df_len, on=['Unit', 'Wheel'])
    joined = joined.drop(['Vehicle_x'], axis=1).rename(columns={"Vehicle_y": "Vehicle"}).dropna()
    # Rebuild a single '_'-joined identifier from the four id components.
    joined['ID'] = joined[id_parts].apply('_'.join, axis=1)
    labelled_a.append(joined)

df_train_a, df_test_a = labelled_a

In [None]:
# Attach the damage information to the df_b train/test samples.
# The join uses Unit and Wheel only; the vehicle label parsed from the file
# name ('Vehicle_x') is discarded in favour of the one in df_len ('Vehicle_y').
# NOTE(review): neither the fleet nor the vehicle is part of the join key, so
# units sharing a label across fleets would be cross-matched - confirm that
# Unit + Wheel is unique in df_len.
id_parts = ['Customer ID', 'Unit', 'Vehicle', 'Wheel']

labelled_b = []
for sensor_rows in (df_train_s_b, df_test_s_b):
    joined = pd.merge(sensor_rows, df_len, on=['Unit', 'Wheel'])
    joined = joined.drop(['Vehicle_x'], axis=1).rename(columns={"Vehicle_y": "Vehicle"}).dropna()
    # Rebuild a single '_'-joined identifier from the four id components.
    joined['ID'] = joined[id_parts].apply('_'.join, axis=1)
    labelled_b.append(joined)

df_train_b, df_test_b = labelled_b

## COUNT, FOR EACH FLEET, THE NUMBER OF DAMAGES IN EACH CATEGORY

In [None]:
# Number of distinct 'Wheel' labels per (Fleet ID, Damage Cat).
# A single groupby produces the same table as filtering and re-grouping the
# frame once per group, and it also copes with an empty df_train_a (pd.concat
# of an empty list raises).
# NOTE(review): 'Wheel' is only the last component of the wheel identifier; if
# the same label occurs on several units this counts labels, not damaged
# wheels. The 'ID' column may be the intended one - confirm.
count = df_train_a.groupby(['Fleet ID','Damage Cat'])[['Wheel']].nunique()

## CHANGE OF VIBRATION DISTRIBUTIONS AS THE DAMAGE GETS WORSE


In [None]:
# For each fleet: distribution of the daily 85th percentile of every vibration
# parameter, one curve per damage category, one subplot per parameter.

# Silences all warnings for the rest of the session (this cell always did so;
# it is now done once, before plotting, instead of inside the loop).
warnings.filterwarnings("ignore")

vibration_params = ['Peak X','Peak Y','Peak Z','RMS X','RMS Y','RMS Z']
damage_labels = {0: 'light', 1: 'middle', 2: 'high'}
output_dir = 'C:\\Users\\ElenaR\\Documents\\Bearing_lenght_of_damage\\output\\'

for fleet in ['170','171','375','376','377','465']:
    fleet_rows = df_train_a.loc[(df_train_a['Fleet ID'] == fleet)]
    # The percentile is taken per damage category AND per day. Taking it per
    # day over every column also averaged 'Damage Cat' itself, so the
    # `== 0/1/2` filters below only matched days on which the interpolated
    # category happened to be an exact integer; it also fed non-numeric
    # columns to quantile(), which current pandas rejects.
    df_q = (fleet_rows
            .groupby(['Damage Cat', pd.Grouper(key='UTC Time Date', freq='D')])[vibration_params]
            .quantile(0.85)
            .dropna(how='all'))
    category_of_row = df_q.index.get_level_values(0)

    fig, ax = plt.subplots(1, 6, figsize=(20,5))
    for num, p in enumerate(vibration_params):
        for category, label in damage_labels.items():
            values = df_q.loc[category_of_row == category, p]
            # A density estimate needs at least two distinct values; skip
            # categories this fleet has no (or constant) data for.
            if values.nunique() > 1:
                # kdeplot is the documented replacement for the deprecated
                # distplot(hist=False, rug=False).
                sns.kdeplot(values, label=label, ax=ax[num])
        ax[num].legend(fontsize = 15)
        ax[num].tick_params(axis="x", labelsize=20)
        ax[num].tick_params(axis="y", labelsize=20)
        ax[num].set_xlabel(p, fontsize=20)

    # Figure-level title: plt.title only labelled the last subplot.
    fig.suptitle(''.join([fleet,' 85th Percentile']), fontsize = 20)
    # Save once per fleet, after all six panels are drawn.
    fig.savefig(''.join([output_dir, fleet, ' 85th Percentile.png']))

## DISTANCE MATRIX OF AVERAGED, STANDARDIZED VIBRATION INDICES

In [None]:
# Pairwise distance matrices between the standardized rows of the transposed
# per-category tables, relabelled with fleet ids.
# NOTE(review): df_0_m_a, df_1_m_a and df_2_m_a are not defined in any cell
# visible above this one; on a fresh kernel this cell raises NameError unless
# the cell that builds them is restored.
# NOTE(review): the relabelling assumes row n of each matrix corresponds to the
# n-th entry of df_train_a['Fleet ID'].unique() - confirm against how the
# df_*_m_a tables are built.
fleet_labels = dict(enumerate(df_train_a['Fleet ID'].unique()))

distance_by_level = {}
for level, averaged in (('low', df_0_m_a), ('middle', df_1_m_a), ('high', df_2_m_a)):
    # Standardize once and reuse it for both arguments of distance_matrix.
    scaled = preprocessing.scale(pd.DataFrame(averaged).T)
    # rename() ignores positions that have no fleet label and leaves unlabeled
    # positions as integers, exactly like renaming one position at a time.
    distance_by_level[level] = (pd.DataFrame(distance_matrix(scaled, scaled))
                                .rename(columns=fleet_labels, index=fleet_labels))

distance_low = distance_by_level['low']
distance_middle = distance_by_level['middle']
distance_high = distance_by_level['high']

In [None]:
# Three side-by-side annotated heatmaps, one per damage level, saved as a
# single image.
fig, (ax1,ax2,ax3) = plt.subplots(1,3, figsize=(50,50))

heatmap_panels = (
    (ax1, distance_low, 'Distance between fleets (low damage)'),
    (ax2, distance_middle, 'Distance between fleets (middle damage)'),
    (ax3, distance_high, 'Distance between fleets (high damage)'),
)
for panel, distances, panel_title in heatmap_panels:
    sns.heatmap(distances.T, square=True, annot=True, annot_kws={"size": 30},
                cmap='coolwarm', cbar=False, ax=panel)
    panel.set_title(panel_title, fontsize=30)
    panel.tick_params(axis="x", labelsize=30)
    panel.tick_params(axis="y", labelsize=30)

fig.savefig('C:\\Users\\ElenaR\\Documents\\Bearing_lenght_of_damage\\output\\distance.png')
    
    

## COMPARE VIBRATIONS BEFORE AND AFTER DAMAGE

In [None]:
# For every fleet and vibration parameter: one figure with three panels (one
# per damage category) comparing the distribution of daily means in
# df_train_a (red) with the one in df_train_b (blue).
vibration_params = ['Peak X','Peak Y','Peak Z','RMS X','RMS Y','RMS Z']
daily_mean_columns = vibration_params + ['Damage length(mm)']
damage_levels = [(0, 'light'), (1, 'middle'), (2, 'high')]
output_dir = 'C:\\Users\\ElenaR\\Documents\\Bearing_lenght_of_damage\\output\\'

for fleet in ['170','171','375','376','377','465']:
    # Daily mean per damage category; the result is indexed by (Damage Cat, day).
    df1 = df_train_a.loc[df_train_a['Fleet ID']==fleet].groupby(['Damage Cat',pd.Grouper(key='UTC Time Date', freq='D')])[daily_mean_columns].mean()
    df2 = df_train_b.loc[df_train_b['Fleet ID']==fleet].groupby(['Damage Cat',pd.Grouper(key='UTC Time Date', freq='D')])[daily_mean_columns].mean()

    for p in vibration_params:
        fig, ax = plt.subplots(1, 3, figsize=(40,10))
        for axs, (category, level) in zip(ax, damage_levels):
            curves = (
                (df1, level + ' damage', 'red'),
                (df2, 'No ' + level + ' damage', 'blue'),
            )
            for daily_means, label, color in curves:
                values = daily_means.loc[daily_means.index.get_level_values(0) == category, p]
                # A density estimate needs at least two distinct values; skip
                # categories this fleet has no (or constant) data for.
                if values.nunique() > 1:
                    # kdeplot is the documented replacement for the deprecated
                    # distplot(hist=False).
                    sns.kdeplot(values, label=label, color=color, ax=axs)
            axs.legend(fontsize = 30)
            axs.set_title(' '.join(['Vibrations distribution ',fleet]),fontweight = 'bold',fontsize = 20)
            axs.tick_params(axis="x", labelsize=30)
            axs.tick_params(axis="y", labelsize=30)
            axs.set_xlabel(p, fontsize=30)
        fig.savefig(''.join([output_dir,fleet,'_',p,'.png']))

    # Render this fleet's six figures before moving on to the next fleet.
    plt.show()


## DISTRIBUTION DISTANCE BETWEEN THE "BEFORE" AND "DURING DAMAGE" DATA

In [None]:
# Wasserstein distance between the df_train_b and df_train_a distributions of
# each vibration parameter, per fleet, together with the fleet's mean damage
# length in categories 0 and 1.
vibration_params = ['Peak X','Peak Y','Peak Z','RMS X','RMS Y','RMS Z']

distance_rows = []
for fleet in ['170','171','375','376','377','465']:
    da = df_train_a.loc[(df_train_a['Fleet ID'] == fleet)]
    db = df_train_b.loc[(df_train_b['Fleet ID'] == fleet)]
    # These averages depend on the fleet only, so compute them once per fleet
    # rather than once per parameter.
    light_average = da.loc[da['Damage Cat'] == 0, 'Damage length(mm)'].mean()
    middle_average = da.loc[da['Damage Cat'] == 1, 'Damage length(mm)'].mean()
    for p in vibration_params:
        # wasserstein_distance raises ValueError on an empty sample; record NaN
        # so a fleet missing from one of the frames does not abort the table.
        if da.empty or db.empty:
            distribution_distance = float('nan')
        else:
            distribution_distance = wasserstein_distance(db[p], da[p])
        distance_rows.append({
            'Distribution distance': distribution_distance,
            'Parameter': p,
            'Fleet': fleet,
            'Light damage average': light_average,
            'Middle damage average': middle_average,
        })

w = pd.DataFrame(distance_rows)

## SCATTER PLOT: LENGTH OF DAMAGE AGAINST MAX BHI

In [None]:
# One point per (Unit, Wheel): maximum damage length on x against the maximum
# of the selected health index on y; one colour per fleet.
f = 20          # font size used for every text element of the figure
index = 'BHI'   # column whose per-wheel maximum goes on the y axis

fig, ax = plt.subplots(figsize=(20, 14))
fleet_ids = df_train_a['Fleet ID'].unique()
for fleet in fleet_ids:
    per_wheel = df_train_a.loc[df_train_a['Fleet ID'] == fleet].groupby(['Unit', 'Wheel'])
    ax.scatter(per_wheel['Damage length(mm)'].max(), per_wheel[index].max(), s=60)

# Legend entries follow the order in which the fleets were plotted.
ax.legend(fleet_ids, fontsize=f)
ax.set_title('max BHI vs lenght of damage', fontsize=f)
plt.xticks(fontsize=f)
plt.yticks(fontsize=f)
ax.set_xlabel('length of damage', fontsize=f)
ax.set_ylabel('max BHI', fontsize=f)
fig.savefig(r'C:\Users\ElenaR\Documents\Bearing_lenght_of_damage\output\max BHI vs lenght of damage')