# Import the required modules

In [3]:
!pip install git+https://github.com/goolig/dsClass.git

In [3]:

%matplotlib inline

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectPercentile, f_classif, chi2, SelectKBest, SelectFpr
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.metrics import confusion_matrix, accuracy_score, precision_recall_fscore_support

from dsClass.path_helper import *



In [4]:
# Resolve the location of the time-series CSV through the dsClass path helper
# (presumably a data file shipped with the package - confirm) and load it.
ts_file_path = get_file_path('ts_data.csv')
ts_data = pd.read_csv(ts_file_path)
print(ts_data.shape)  # (number of rows, number of columns)
ts_data

# Load the time-series data and look at the feature statistics

In [5]:
# Summary statistics of every column, computed separately for each value of
# the 'fail' label, so both classes can be compared side by side.
ts_data.groupby('fail').describe()

# Plot the time series data

In [6]:
# Measured columns that are plotted for a single drive.
original_features = [
    'bb_count', 'r-w_rate', 'reconsects_count',
    'recovbydrv_count', 'xfer_rate', 'bb_diffs',
]

# Number of distinct drive ids; it is also used as the id of the second drive shown.
num_drives = len(ts_data['d_id'].unique())

fig, axes = plt.subplots(nrows=1, ncols=2)

# One panel per drive: (drive id, label used in the panel title).
panels = [(1, 'fail'), (num_drives, 'non-fail')]
for ax, (drive_id, outcome) in zip(axes, panels):
    drive_rows = ts_data.loc[ts_data['d_id'] == drive_id, original_features]
    drive_rows.plot(figsize=(10, 12),
                    title='drive ' + str(drive_id) + ' ' + outcome,
                    ax=ax)
plt.legend()

# Engineer the time-series features

Create the aggregated features (mean, median, variance, minimum and maximum):
* for each sn
* for each feature
* for each aggregation type
        aggregate the 20 daily samples into one aggregated sample  

In [7]:
#Q1
# Aggregate all samples of a drive into a single row per (d_id, fail).
# 'time' is not one of the measured features, so it is excluded from every
# aggregation (previously it was dropped for max/min only, which left
# mean/median/variance-of-time columns in the feature table).
grouped_ts = ts_data.drop(columns='time').groupby(['d_id', 'fail'])

# Name every column after the statistic that produced it. Relying on merge
# suffixes left the max columns without any suffix, because suffixes are only
# applied to names that clash between the two merged frames.
median_ts = grouped_ts.median().add_suffix('-median')
mean_ts = grouped_ts.mean().add_suffix('-mean')
max_ts = grouped_ts.max().add_suffix('-max')
min_ts = grouped_ts.min().add_suffix('-min')
var_ts = grouped_ts.var().add_suffix('-var')

# All five frames come from the same groupby and therefore share the same
# (d_id, fail) index, so a column-wise concat lines them up row by row.
mean_median_max_min_var = pd.concat(
    [median_ts, mean_ts, max_ts, min_ts, var_ts], axis=1)

mean_median_max_min_var



In [2]:
#Q1
# Aggregate all samples of a drive into a single row per (d_id, fail).
# 'time' is not one of the measured features, so it is excluded from every
# aggregation (previously it was dropped for max/min only, which left
# mean/median/variance-of-time columns in the feature table).
grouped_ts = ts_data.drop(columns='time').groupby(['d_id', 'fail'])

# Name every column after the statistic that produced it. Relying on merge
# suffixes left the max columns without any suffix, because suffixes are only
# applied to names that clash between the two merged frames.
median_ts = grouped_ts.median().add_suffix('-median')
mean_ts = grouped_ts.mean().add_suffix('-mean')
max_ts = grouped_ts.max().add_suffix('-max')
min_ts = grouped_ts.min().add_suffix('-min')
var_ts = grouped_ts.var().add_suffix('-var')

# All five frames come from the same groupby and therefore share the same
# (d_id, fail) index, so a column-wise concat lines them up row by row.
mean_median_max_min_var = pd.concat(
    [median_ts, mean_ts, max_ts, min_ts, var_ts], axis=1)

mean_median_max_min_var


In [1]:
# Names of all aggregated feature columns, as a plain Python list.
mean_median_max_min_var.columns.tolist()

# Load the configuration data

In [9]:
# Resolve the location of the configuration CSV through the dsClass path
# helper and load it into a DataFrame.
conf_file_path = get_file_path('conf_data.csv')
conf_data = pd.read_csv(conf_file_path)
conf_data

# Look at the features\labels distribution

In [10]:
# Distribution of drive age for each value of the 'fail' label; sym='' hides
# the outlier markers. DataFrame.boxplot(by=...) creates its own figure, so no
# plt.figure() call is needed (it only produced an additional empty figure).
conf_data[['age', 'fail']].boxplot(by='fail', figsize=(10,12), sym='')

In [11]:
# Count the drives of every (model, fail) pair and move the fail values into
# columns, giving one row per model that can be drawn as stacked bars.
g = (
    conf_data
    .groupby(["model", "fail"])['d_id']
    .count()
    .unstack('fail')
)
g.plot.bar(stacked=True, grid=True, alpha=0.75, rot=45)

# Handle categorical data 

For the decision tree algorithm, map the categorical features to numeric values with:
* the `change_capacity` function for the "capacity" column
* the `pd.get_dummies` function for the "model" column


In [13]:
#Q2
def change_capacity(data):
    """Return a copy of ``data`` with the categorical ``capacity`` column made numeric.

    The distinct capacity labels of ``data`` are paired, in order of first
    appearance, with the values 1000, 600 and 300 and replaced accordingly.
    NOTE(review): the pairing depends on the row order of the input; confirm
    that the first three distinct labels really correspond to 1000/600/300.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame that contains a ``capacity`` column.

    Returns
    -------
    pandas.DataFrame
        A new frame; ``data`` itself is left unchanged.
    """
    # Build the mapping from the frame that was passed in rather than from the
    # notebook-level ``conf_data`` variable, so the function works on any frame.
    di = dict(zip(data["capacity"].unique(), [1000, 600, 300]))

    return data.replace({"capacity": di}, inplace=False)


In [14]:
# Numeric capacity first, then one indicator column per drive model.
new_conf_data = pd.get_dummies(change_capacity(conf_data), columns=['model'])
new_conf_data

# Merge the aggregated time-series and the configuration datasets

In [15]:
#Q3
# Attach the aggregated time-series features to the configuration rows by drive id.
merged_data_frame = pd.merge(new_conf_data, mean_median_max_min_var, on='d_id')

# Label vector, and the feature table without the id and the label.
onlyFails = merged_data_frame['fail']
merged_data_frame_withoutDid = merged_data_frame.drop(columns=['d_id', 'fail'])
merged_data_frame_withoutDid


In [16]:
# Names of all candidate feature columns, as a plain Python list.
merged_data_frame_withoutDid.columns.tolist()

# Select the 10 most informative features and transform the data

In [17]:
# Pairwise correlation between the feature columns, rendered as a table whose
# cells are shaded according to their value.
corr=merged_data_frame_withoutDid.corr()
corr.style.background_gradient()

In [18]:
#Q4
# Score every feature column against the 'fail' label with f_classif and keep
# the 10 best-scoring columns.
# NOTE(review): the selector is fitted on every row of
# merged_data_frame_withoutDid, i.e. it is not restricted to training rows.
selectedKBest=SelectKBest(f_classif,k=10).fit_transform(merged_data_frame_withoutDid,onlyFails)
selectedKBest

# Train a Decision tree classifier 

Since we are learning and testing on the same set we will limit the maximum depth parameter to 5 to prevent overfitting

In [19]:
#Q5
from sklearn.model_selection import train_test_split

# Split the complete feature table first, so the held-out rows take no part in
# any fitting step. With the same row count and random_state the rows end up
# in the same train/test partitions as before.
X_train_all, X_test_all, y_train, y_test = train_test_split(
    merged_data_frame_withoutDid, onlyFails, random_state=0, shuffle=True)

# Choose the 10 best features from the training rows only. Fitting the
# selector on all rows would let the test labels influence which features the
# model sees and make the reported test accuracy too optimistic.
kbest_selector = SelectKBest(f_classif, k=10).fit(X_train_all, y_train)
X_train = kbest_selector.transform(X_train_all)
X_test = kbest_selector.transform(X_test_all)

In [20]:
# Quick look at what the split produced: the type and the first entries of the
# training labels, followed by the training feature matrix.
print(type(y_train))
print(y_train.head())
print(X_train)

In [21]:
# Hyper-parameters go through the constructor instead of being assigned to
# the estimator afterwards. random_state is fixed so repeated runs build the
# same tree and report the same accuracy.
# NOTE(review): the accompanying text asks for a maximum depth of 5; the
# value 20 is kept from the original cell.
clf = DecisionTreeClassifier(max_depth=20, random_state=0)
clf.fit(X_train, y_train)

print('Accuracy of Decision Tree classifier on training set: {:.2f}'
     .format(clf.score(X_train, y_train)))


**How is it possible that the accuracy is so good?**

# Make the prediction and plot the confusion matrix
https://en.wikipedia.org/wiki/Confusion_matrix

* Change the "max_depth" parameter to see what happens to the model results


In [22]:
#Q6
# Predict the held-out rows once and derive the accuracy from those predictions.
predictions = clf.predict(X_test)
test_accuracy = accuracy_score(y_test, predictions)
print('Accuracy of Decision Tree classifier on test set: {:.2f}'.format(test_accuracy))

In [25]:
from sklearn.metrics import ConfusionMatrixDisplay

# Class names; they are defined here but not handed to the display below.
labelsss = ['pass', 'fail']

# Counts of true label versus predicted label on the test rows.
matrix = confusion_matrix(y_test, predictions)
cm_display = ConfusionMatrixDisplay(confusion_matrix=matrix)
cm_display.plot()


In [None]:
# Draw the confusion matrix stored in `matrix` as a colour-coded image.
# plt.subplots() returns a figure with one axes; the previous
# add_subplot(11) call was not a valid subplot specification, and the
# plotted name `cm` was never defined.
fig, ax = plt.subplots()
cax = ax.matshow(matrix)
ax.set_title('Confusion matrix')
fig.colorbar(cax)
ax.set_xlabel('Predicted')
ax.set_ylabel('True')
plt.show()

In [69]:
# Raw confusion-matrix counts as plain text.
print(matrix)

# Find the error

Guy claims that if you followed the instructions up to Q6, there is a mistake in the notebook, and that we cannot rely on the reported performance when information about a new hard drive arrives.

1. Explain in words what is the error 
1. Explain in words what you have to change in order to fix the error
1. Fix the error. Submit a fixed notebook only

In [None]:
print(type(y_test))

# Compare predictions and true labels position by position. y_test is a
# Series that still carries the shuffled row labels of the original frame, so
# y_test[i] would look a row up by label instead of taking the i-th element;
# converting it to an array makes the comparison purely positional.
y_true = np.asarray(y_test)
countCorrects = int((predictions == y_true).sum())
countWrongs = len(predictions) - countCorrects

print("Corrects: "+str(countCorrects))
print("Wrongs: "+str(countWrongs))

In [None]:
#Q7

# Improving the results
Find an improvement that increases the performance in terms of the model's accuracy (https://scikit-learn.org/stable/modules/generated/sklearn.metrics.accuracy_score.html)

Explain in words what you changed, and document the results before and after your suggested improvement

In [None]:
#Q8

Not all questions will be checked