In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import missingno as msno
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier

In [2]:
from sklearn.metrics import recall_score, precision_score, accuracy_score, f1_score

In [3]:
df_train=pd.read_csv("train.csv",parse_dates=True)
df_test=pd.read_csv("test.csv",parse_dates=True)
display(df_train.head())
display(df_test.head())

Unnamed: 0,deviceid,avg_time_charging_lag1,avg_time_charging_lag2,avg_time_charging_lag3,avg_time_charging_lag7,charging_rate_lag3,charging_rate_lag7,avg_time_discharging_lag1,avg_time_discharging_lag2,avg_time_discharging_lag3,...,number_times_restart,avg_volt_change_charging,avg_volt_change_discharging,avg_time_charging,avg_time_discharging,max_voltage_day,piececount,cycle_time,LastRecord,Date Deployed
0,28647,5.12,41.11,6.56,25.39,0.086667,-0.006667,4.37,91.7,15.7,...,2.375,377.08,367.42875,21.69625,39.005,4174.875,14.2,60.70125,4/1/2021,10/6/2019
1,36175,36.6,5.16,6.23,6.96,0.136667,-1.296667,62.67,6.53,6.16,...,1.0,350.517857,350.309286,19.883571,32.213571,4161.0,19.777778,52.097143,4/1/2021,10/27/2019
2,16107,5.51,5.04,4.52,5.96,-0.46,-0.083333,5.13,5.65,4.14,...,4.095238,346.113333,343.409524,13.675238,20.471429,4152.238095,19.2,34.146667,4/1/2021,10/31/2019
3,27362,4.66,39.85,35.76,40.69,0.076667,-0.006667,3.93,76.37,60.86,...,0.736842,356.184211,349.844737,16.306842,26.189474,4164.631579,23.625,42.496316,4/1/2021,10/1/2019
4,19463,5.1,43.24,4.63,5.26,0.04,-0.153333,4.69,71.44,3.8,...,1.133333,370.904,367.73,18.797333,30.812,4169.8,12.666667,49.609333,4/1/2021,9/13/2019


Unnamed: 0,deviceid,avg_time_charging_lag1,avg_time_charging_lag2,avg_time_charging_lag3,avg_time_charging_lag7,charging_rate_lag3,charging_rate_lag7,avg_time_discharging_lag1,avg_time_discharging_lag2,avg_time_discharging_lag3,...,number_times_restart,avg_volt_change_charging,avg_volt_change_discharging,avg_time_charging,avg_time_discharging,max_voltage_day,piececount,cycle_time,LastRecord,Date Deployed
0,42979,35.56,4.31,5.07,4.6,-0.033333,0.04,66.29,3.77,4.07,...,3.64,346.3012,345.3796,17.4816,27.7216,4159.0,12.4375,45.2032,4/1/2021,9/8/2019
1,14911,31.26,4.76,27.4,29.6,-0.076667,-0.06,49.57,3.63,44.06,...,2.826087,356.218261,357.305652,17.47913,27.650435,4163.608696,34.818182,45.129565,4/1/2021,9/13/2019
2,48386,30.08,5.07,4.76,4.74,0.013333,-0.016667,52.96,5.16,4.05,...,1.266667,368.532667,366.844667,11.361333,16.739333,4176.133333,15.888889,28.100667,4/1/2021,10/10/2019
3,18822,33.31,32.79,5.09,4.96,0.0,0.003333,49.26,68.54,5.83,...,0.9375,352.899375,352.595,17.16875,27.460625,4159.875,12.75,44.629375,4/1/2021,10/13/2019
4,49810,4.44,30.03,5.57,5.02,-0.2,-0.03,4.16,54.8,4.52,...,1.2,363.9128,365.2084,14.86,23.9808,4168.4,7.5,38.8408,4/1/2021,10/19/2019


In [4]:
# there are thousands of missing values in some columns, removing so many records will lead to lose of data,
# We chose to replace the values with 0.

# Filling null values with 0
df_train.fillna(0,inplace=True)
df_test.fillna(0,inplace=True)

In [5]:
# The time frame of the last record
print(df_train["LastRecord"].min())
print(df_train["LastRecord"].max())
# The last record is from April 1st

print(" ")
print(df_train["Date Deployed"].min())
print(df_train["Date Deployed"].max())

4/1/2021
4/1/2021
 
1/1/2020
9/9/2019


In [6]:
df_train["LastRecord"]=pd.to_datetime(df_train["LastRecord"])
df_train["Date Deployed"]=pd.to_datetime(df_train["Date Deployed"])

df_test["LastRecord"]=pd.to_datetime(df_test["LastRecord"])
df_test["Date Deployed"]=pd.to_datetime(df_test["Date Deployed"])

In [7]:
# For training set- taking average of charging lag, discharging lag, discharging rate lag and charging rate lag.

df_train["avg_time_charging_lag"]=((df_train["avg_time_charging_lag1"]+df_train["avg_time_charging_lag2"]+df_train["avg_time_charging_lag3"]
                                  +df_train["avg_time_charging_lag4"]+df_train["avg_time_charging_lag5"]+df_train["avg_time_charging_lag6"]
                                  +df_train["avg_time_charging_lag7"]+df_train["avg_time_charging_lag8"]+ df_train["avg_time_charging_lag9"]
                                  +df_train["avg_time_charging_lag10"]+df_train["avg_time_charging_lag11"]+ df_train["avg_time_charging_lag12"]
                                  +df_train["avg_time_charging_lag13"]+df_train["avg_time_charging_lag14"] )/14).replace(np.nan,0)

df_train["avg_time_discharging_lag"]=((df_train["avg_time_discharging_lag1"]+df_train["avg_time_discharging_lag2"]+df_train["avg_time_discharging_lag3"]
                                      +df_train["avg_time_discharging_lag4"]+df_train["avg_time_discharging_lag5"]+df_train["avg_time_discharging_lag6"]
                                      +df_train["avg_time_discharging_lag7"]+df_train["avg_time_discharging_lag8"]+df_train["avg_time_discharging_lag9"]
                                      +df_train["avg_time_discharging_lag10"]+df_train["avg_time_discharging_lag11"]+df_train["avg_time_discharging_lag12"]
                                      +df_train["avg_time_discharging_lag13"]+df_train["avg_time_discharging_lag13"])/14).replace(np.nan,0)


df_train["discharging_rate_lag"]= ((df_train["discharging_rate_lag3"]+df_train['discharging_rate_lag4']
                                  +df_train['discharging_rate_lag5']+df_train['discharging_rate_lag6']
                                  +df_train['discharging_rate_lag7']+df_train["discharging_rate_lag8"])/6).replace(np.nan,0)

df_train["charging_rate_lag"]= ((df_train['charging_rate_lag3']+df_train['charging_rate_lag4']+df_train['charging_rate_lag5']
                               +df_train["charging_rate_lag6"]+df_train['charging_rate_lag7'])/5).replace(np.nan,0)


# For the test set- taking average of charging lag, discharging lag, discharging rate lag and charging rate lag

df_test["avg_time_charging_lag"]=((df_test["avg_time_charging_lag1"]+df_test["avg_time_charging_lag2"]+ df_test["avg_time_charging_lag3"]
                                  +df_test["avg_time_charging_lag4"]+df_test["avg_time_charging_lag5"]+ df_test["avg_time_charging_lag6"]
                                  +df_test["avg_time_charging_lag7"]+df_test["avg_time_charging_lag8"]+ df_test["avg_time_charging_lag9"]
                                  +df_test["avg_time_charging_lag10"]+df_test["avg_time_charging_lag11"]+ df_test["avg_time_charging_lag12"]
                                  +df_test["avg_time_charging_lag13"]+df_test["avg_time_charging_lag14"] )/14).replace(np.nan,0)

df_test["avg_time_discharging_lag"]=((df_test["avg_time_discharging_lag1"]+df_test["avg_time_discharging_lag2"]+df_test["avg_time_discharging_lag3"]
                                      +df_test["avg_time_discharging_lag4"]+df_test["avg_time_discharging_lag5"]+df_test["avg_time_discharging_lag6"]
                                      +df_test["avg_time_discharging_lag7"]+df_test["avg_time_discharging_lag8"]+df_test["avg_time_discharging_lag9"]
                                      +df_test["avg_time_discharging_lag10"]+df_test["avg_time_discharging_lag11"]+df_test["avg_time_discharging_lag12"]
                                      +df_test["avg_time_discharging_lag13"]+df_test["avg_time_discharging_lag13"])/14).replace(np.nan,0)


df_test["discharging_rate_lag"]= ((df_test["discharging_rate_lag3"]+df_test["discharging_rate_lag4"]
                                  +df_test['discharging_rate_lag5']+df_test["discharging_rate_lag6"]
                                  +df_test['discharging_rate_lag7']+df_test["discharging_rate_lag8"])/6).replace(np.nan,0)

df_test["charging_rate_lag"]= ((df_test['charging_rate_lag3']+df_test['charging_rate_lag4']+df_test['charging_rate_lag5']
                               +df_test["charging_rate_lag6"]+df_test['charging_rate_lag7'])/5).replace(np.nan,0)


df_train.head()

Unnamed: 0,deviceid,avg_time_charging_lag1,avg_time_charging_lag2,avg_time_charging_lag3,avg_time_charging_lag7,charging_rate_lag3,charging_rate_lag7,avg_time_discharging_lag1,avg_time_discharging_lag2,avg_time_discharging_lag3,...,avg_time_discharging,max_voltage_day,piececount,cycle_time,LastRecord,Date Deployed,avg_time_charging_lag,avg_time_discharging_lag,discharging_rate_lag,charging_rate_lag
0,28647,5.12,41.11,6.56,25.39,0.086667,-0.006667,4.37,91.7,15.7,...,39.005,4174.875,14.2,60.70125,2021-04-01,2019-10-06,10.781429,19.077857,0.121111,-0.352
1,36175,36.6,5.16,6.23,6.96,0.136667,-1.296667,62.67,6.53,6.16,...,32.213571,4161.0,19.777778,52.097143,2021-04-01,2019-10-27,16.997857,29.199286,-1.671667,-0.390667
2,16107,5.51,5.04,4.52,5.96,-0.46,-0.083333,5.13,5.65,4.14,...,20.471429,4152.238095,19.2,34.146667,2021-04-01,2019-10-31,14.462143,19.005,-0.258333,-0.116667
3,27362,4.66,39.85,35.76,40.69,0.076667,-0.006667,3.93,76.37,60.86,...,26.189474,4164.631579,23.625,42.496316,2021-04-01,2019-10-01,15.245714,25.743571,-0.008889,0.368
4,19463,5.1,43.24,4.63,5.26,0.04,-0.153333,4.69,71.44,3.8,...,30.812,4169.8,12.666667,49.609333,2021-04-01,2019-09-13,19.771429,33.357143,-0.355,0.304667


In [8]:
predictors=['chargecycles','charge_cycle_time_below_12','dischargecycles','total_off_time', 'number_times_restart',
            'avg_volt_change_charging', 'avg_volt_change_discharging','avg_time_charging', 'avg_time_discharging', 
            'max_voltage_day','piececount', 'cycle_time','avg_time_charging_lag', 'avg_time_discharging_lag', 
            'discharging_rate_lag',"charging_rate_lag"]
X=df_train[predictors]
y=df_train["fail_7"]

X_train, X_valid, y_train, y_valid =train_test_split(X, y, test_size=0.2, random_state=1)

In [9]:
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score

In [10]:
model = xgb.XGBClassifier(objective="binary:logistic", random_state=42)
 
# fit the model with the training data
model.fit(X_train,y_train)

XGBClassifier(random_state=42)

In [11]:
# predict the target on the train dataset
predict_train = model.predict(X_train)

# Accuray Score on train dataset
accuracy_train = accuracy_score(y_train,predict_train)
print('\naccuracy_score on train dataset : ', accuracy_train)


accuracy_score on train dataset :  0.7707407407407407


In [12]:
# predict the target on the test dataset
predict_test = model.predict(X_valid)
print('\nTarget on test data',predict_test) 
 
# Accuracy Score on test dataset
accuracy_test = accuracy_score(y_valid,predict_test)
print('\naccuracy_score on test dataset : ', accuracy_test)


Target on test data [0 0 0 ... 0 0 0]

accuracy_score on test dataset :  0.7711111111111111


In [13]:
from sklearn.ensemble import AdaBoostClassifier
ada = AdaBoostClassifier()
ada.fit(X_train, y_train)
print("Ensembling Tree R-Square score: ", str(ada.score(X_valid, y_valid)))

Ensembling Tree R-Square score:  0.769753086419753


In [14]:
#XGboost
# Model evaluation on training set
print("Accuracy with training set:",accuracy_score(y_train, predict_train))
# Model evaluation on validation set
print("Accuracy with validation set:",accuracy_score(y_valid, predict_test))


# Calculating precision, recall and F-measure on valid

p6=precision_score(y_valid,predict_test)
r6=recall_score(y_valid,predict_test)
f1_6=f1_score(y_valid,predict_test)
a6=accuracy_score(y_valid, predict_test)

print("Precision score: ", p6)
print("Recall score: ", r6)
print("f1-score: ", f1_6)
print("Accuracy:",a6)

Accuracy with training set: 0.7707407407407407
Accuracy with validation set: 0.7711111111111111
Precision score:  0.46835443037974683
Recall score:  0.020010816657652784
f1-score:  0.03838174273858921
Accuracy: 0.7711111111111111
