# Sandvik

## Import packages and load files

### Import packages

In [2]:
# Data manipulation
import numpy as np
import pandas as pd
import glob, os
import time

# Data visualization
import matplotlib.pyplot as plt
import seaborn as sns

# Modelling
from sklearn.ensemble import IsolationForest 
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error

### Load files

In [None]:
# arkiv_B: load every matching export file into a single long-format frame.
# The files are read as headerless, ';'-separated text, so columns are
# addressed by integer position below.
path_arkivb = '/Users/Emil/Data/Exjobbsdata/Sandvik/Data/CSV_2018_B_Arkiv_Sekund'

# glob returns matches in arbitrary, OS-dependent order; sort them so the row
# order of the concatenated frame is reproducible between runs and machines.
all_arkivb = sorted(glob.glob(os.path.join(path_arkivb, "B1_2018_V2*.TXT")))
if not all_arkivb:
    # Without this check pd.concat fails with "No objects to concatenate",
    # which does not say which directory or pattern came up empty.
    raise FileNotFoundError(
        "No files matching 'B1_2018_V2*.TXT' found in {}".format(path_arkivb)
    )

# Lazily read each file; pd.concat consumes the generator.
df_each_arkivb = (pd.read_csv(f, sep=';', header = None, encoding = 'unicode_escape') for f in all_arkivb)
arkivb_df = pd.concat(df_each_arkivb, ignore_index = True)

# Columns 3 and 4 are joined with a space and parsed as one timestamp
# (presumably a date string and a time string -- confirm against the raw files).
# NOTE(review): no explicit format is given, so pandas infers it; verify that
# day/month order is interpreted as intended.
arkivb_df['Datetime'] = pd.to_datetime(arkivb_df[3] + ' ' + arkivb_df[4])
arkivb_df = arkivb_df.drop([3, 4], axis = 1)

In [None]:
# handelser (Swedish: "events") files; the original note gives the covered
# period as "sep 17 - jun 18".
path_handelser = '/Users/Emil/Data/Exjobbsdata/Sandvik/Data/Handelser'

# glob returns matches in arbitrary, OS-dependent order; sort them so the row
# order of the concatenated frame is reproducible between runs and machines.
all_handelser = sorted(glob.glob(os.path.join(path_handelser, "*.TXT")))
if not all_handelser:
    # Without this check pd.concat fails with "No objects to concatenate",
    # which does not say which directory came up empty.
    raise FileNotFoundError(
        "No '*.TXT' files found in {}".format(path_handelser)
    )

# Headerless, ';'-separated files; column 1 is parsed as dates while reading.
df_each_handelser = (pd.read_csv(f, sep=';', header = None, encoding = 'unicode_escape', parse_dates = [1]) for f in all_handelser)
handelser_df1 = pd.concat(df_each_handelser, ignore_index = True)

# Keep the unmodified concatenation in handelser_df1 and work on a copy
# without columns 3 and 4.
handelser_df = handelser_df1.drop([3, 4], axis=1)

In [None]:
# larm
#path_larm = '/Users/Emil/Data/Exjobbsdata/Sandvik/Data/Larm'
#all_larm = glob.glob(os.path.join(path_larm, "*.TXT"))

#df_each_larm = (pd.read_csv(f, sep=';', header = None, encoding = 'unicode_escape') for f in all_larm)
#larm_df = pd.concat(df_each_larm, ignore_index = True)

## Preprocessing

In [None]:
# Reshape from long to wide: one row per timestamp, one column per distinct
# value of column 2, cells filled from column 0. Duplicate (timestamp, column)
# pairs are combined with pivot_table's default aggregation (mean).
arkivb_pivot_df = arkivb_df.pivot_table(
    values=0,
    index='Datetime',
    columns=2,
)

In [None]:
# Downsample to 3-minute bins, averaging all samples that fall in each bin.
# "3min" is used instead of the "3T" alias: "T" is deprecated as an offset
# alias in recent pandas releases, while "min" is accepted by old and new
# versions alike and means exactly the same thing.
arkivb_grouped_df = arkivb_pivot_df.resample("3min").mean()

# Resulting shape (rows = 3-minute bins, columns = pivoted signals)
arkivb_grouped_df.shape

### Dropping columns with zero variance

In [None]:
# Collect the names of constant columns, i.e. those whose sample variance is
# exactly zero. Columns whose variance is NaN (e.g. no valid values) do not
# compare equal to zero and are therefore kept here.
cols_without_variance = [
    name
    for name, series in arkivb_grouped_df.items()
    if series.var() == 0
]

# Remove the constant columns
arkivb_variance_df = arkivb_grouped_df.drop(columns=cols_without_variance)

# Shape after removing constant columns
arkivb_variance_df.shape

### Handling missing values and imputing

In [None]:
# Minimum fraction of NON-missing values a column must have to be kept.
# With 0.75, columns with more than 25% missing values are dropped.
cut_off = 0.75

# dropna's `thresh` is the required number of non-NA values and is documented
# as an integer. Rounding up keeps the criterion identical to comparing the
# (integer) non-NA count against the fractional product.
min_non_missing = int(np.ceil(cut_off * len(arkivb_variance_df.index)))

# Drop columns that have too few non-missing values
arkivb_missing_df = arkivb_variance_df.dropna(thresh = min_non_missing, axis = 1)

# Impute missing values by linear interpolation along the time axis.
# limit_direction='both' also fills NaNs that precede a column's first valid
# value (with that first value); the default leaves those leading NaNs in
# place, and they would then be passed on to the model fit.
arkivb_imputed_df = arkivb_missing_df.interpolate(axis=0, limit_direction='both')

# Resulting shape (interpolation does not change the shape)
arkivb_missing_df.shape

### Resulting dataframe

In [None]:
# From here on `arkivb_df` refers to the preprocessed (pivoted, resampled,
# filtered, imputed) frame. This is a rebinding, not a copy, and it replaces
# the raw long-format frame that the pivot cell reads -- reload the data
# before re-running the preprocessing cells.
arkivb_df = arkivb_imputed_df

# Preview the first five rows
arkivb_df.head(n=5)

## Visualization

In [None]:
# Plot a single signal from the preprocessed frame over time.
flow_series = arkivb_df['GIvare: Valsoljeflöde, skalat värde']

plt.figure(figsize=(14, 6))
sns.lineplot(data=flow_series)

## Model

In [None]:
# Fit an Isolation Forest on the preprocessed frame and plot its decision
# function (anomaly score) against the time index.
X = arkivb_df

# The `behaviour` argument has been dropped: it was deprecated in
# scikit-learn 0.22 and removed in 0.24, so passing it raises a TypeError on
# current versions. The former "new" behaviour is now the only behaviour.
# NOTE(review): no random_state is set, so scores differ from run to run.
if_model = IsolationForest(contamination = 'auto')
if_model.fit(X)

# One score per row of X; lower values indicate more anomalous rows.
if_score = if_model.decision_function(X)

plt.figure(figsize=(14,6))
# The score is multiplied by 1000 for plotting (presumably so it is visible
# on the same axes as the commented-out signal below -- confirm).
sns.lineplot(x = X.index, y = if_score * 1000)
#sns.lineplot(data = X['GIvare: Valsoljeflöde, skalat värde'])