In [1]:
#import
import pandas as pd
import numpy as np
import os
from sklearn.ensemble import RandomForestClassifier 
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

# Import Data

In [2]:
#set path for raw data
raw_data_path = os.path.join(os.path.pardir,'data','raw')
train_file_path = os.path.join(raw_data_path, 'train.csv')
test_file_path = os.path.join(raw_data_path, 'test.csv')

In [3]:
#read data with all parameters
train_df = pd.read_csv(train_file_path, index_col='ord')
test_df = pd.read_csv(test_file_path, index_col='ord')

In [4]:
# get the type
type(train_df)

pandas.core.frame.DataFrame

# Basic Structure

In [5]:
#train_df.info()
train_df.drop(['TIME','LATITUDE','LONGITUDE','ALTITUDE','VEHICLE_ID','BAROMETRIC_PRESSURE','ENGINE_COOLANT_TEMP','FUEL_LEVEL','AMBIENT_AIR_TEMP','INTAKE_MANIFOLD_PRESSURE','MAF','TERM FUEL TRIM BANK 1','FUEL_ECONOMY','LONG TERM FUEL TRIM BANK 2','FUEL_TYPE','AIR_INTAKE_TEMP','FUEL_PRESSURE','SHORT TERM FUEL TRIM BANK 2','SHORT TERM FUEL TRIM BANK 1','ENGINE_RUNTIME','TIMING_ADVANCE','DTC_NUMBER','TROUBLE_CODES','TIMING_ADVANCE','EQUIV_RATIO','aqui'], axis=1 , inplace=True)
test_df.drop(['TIME','LATITUDE','LONGITUDE','ALTITUDE','VEHICLE_ID','BAROMETRIC_PRESSURE','ENGINE_COOLANT_TEMP','FUEL_LEVEL','AMBIENT_AIR_TEMP','INTAKE_MANIFOLD_PRESSURE','MAF','TERM FUEL TRIM BANK 1','FUEL_ECONOMY','LONG TERM FUEL TRIM BANK 2','FUEL_TYPE','AIR_INTAKE_TEMP','FUEL_PRESSURE','SHORT TERM FUEL TRIM BANK 2','SHORT TERM FUEL TRIM BANK 1','ENGINE_RUNTIME','TIMING_ADVANCE','DTC_NUMBER','TROUBLE_CODES','TIMING_ADVANCE','EQUIV_RATIO','aqui'], axis=1 , inplace=True)


train_df['ENGINE_LOAD'] = train_df['ENGINE_LOAD'].str.replace(',','.')
train_df['THROTTLE_POS'] = train_df['THROTTLE_POS'].str.replace(',','.')


test_df['ENGINE_LOAD'] = test_df['ENGINE_LOAD'].str.replace(',','.')
test_df['THROTTLE_POS'] = test_df['THROTTLE_POS'].str.replace(',','.')

In [6]:
#concat train & test data for cleaning , axis=0/1 ,0-row concat,1-column concat
df = pd.concat((train_df,test_df),axis=0)
df['ENGINE_LOAD'] = df['ENGINE_LOAD'].str.replace(',','.')
df['THROTTLE_POS'] = df['THROTTLE_POS'].str.replace(',','.')



median_ect = df['THROTTLE_POS'].median()

df.THROTTLE_POS.fillna(median_ect, inplace= True)
median_ect = df['SPEED'].median()

df.SPEED.fillna(median_ect, inplace= True)
median_ect = df['ENGINE_LOAD'].median()

df.ENGINE_LOAD.fillna(median_ect, inplace= True)



median_ect = train_df['THROTTLE_POS'].median()

train_df.THROTTLE_POS.fillna(median_ect, inplace= True)
median_ect = train_df['SPEED'].median()

train_df.SPEED.fillna(median_ect, inplace= True)
median_ect = train_df['ENGINE_LOAD'].median()

train_df.ENGINE_LOAD.fillna(median_ect, inplace= True)



median_ect = test_df['THROTTLE_POS'].median()

test_df.THROTTLE_POS.fillna(median_ect, inplace= True)
median_ect = test_df['SPEED'].median()

test_df.SPEED.fillna(median_ect, inplace= True)
median_ect = test_df['ENGINE_LOAD'].median()

test_df.ENGINE_LOAD.fillna(median_ect, inplace= True)


In [7]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 1 to 1000
Data columns (total 4 columns):
ENGINE_LOAD     1000 non-null object
ENGINE_RPM      1000 non-null int64
SPEED           1000 non-null float64
THROTTLE_POS    1000 non-null object
dtypes: float64(1), int64(1), object(2)
memory usage: 39.1+ KB


In [8]:
train_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 800 entries, 1 to 800
Data columns (total 4 columns):
ENGINE_LOAD     800 non-null object
ENGINE_RPM      800 non-null int64
SPEED           800 non-null float64
THROTTLE_POS    800 non-null object
dtypes: float64(1), int64(1), object(2)
memory usage: 31.2+ KB


In [9]:
test_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 200 entries, 801 to 1000
Data columns (total 4 columns):
ENGINE_LOAD     200 non-null object
ENGINE_RPM      200 non-null int64
SPEED           200 non-null float64
THROTTLE_POS    200 non-null object
dtypes: float64(1), int64(1), object(2)
memory usage: 7.8+ KB


In [10]:
pd.isnull(train_df).sum() > 0

ENGINE_LOAD     False
ENGINE_RPM      False
SPEED           False
THROTTLE_POS    False
dtype: bool

In [11]:
pd.isnull(test_df).sum() > 0

ENGINE_LOAD     False
ENGINE_RPM      False
SPEED           False
THROTTLE_POS    False
dtype: bool

In [12]:
pd.isnull(df).sum() > 0

ENGINE_LOAD     False
ENGINE_RPM      False
SPEED           False
THROTTLE_POS    False
dtype: bool

# Data Munging - Working with Missing Values

### SPEED FILLING

In [13]:
df[df.SPEED.isnull()]

Unnamed: 0_level_0,ENGINE_LOAD,ENGINE_RPM,SPEED,THROTTLE_POS
ord,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1


In [14]:
df.head(10)

Unnamed: 0_level_0,ENGINE_LOAD,ENGINE_RPM,SPEED,THROTTLE_POS
ord,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,43.1,816,32.0,21.2
2,42.0,807,0.0,21.6
3,50.6,1559,9.0,29.8
4,41.2,1692,10.0,29.0
5,55.7,863,11.0,25.9
6,69.8,955,8.0,27.5
7,26.3,777,9.0,29.8
8,47.1,1549,13.0,29.4
9,49.4,2106,12.0,31.8
10,62.4,1080,15.0,28.2


## FINDING ENGINE SPEED CHANGE RATE

In [15]:
df.SPEED.dtype

dtype('float64')

In [16]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 1 to 1000
Data columns (total 4 columns):
ENGINE_LOAD     1000 non-null object
ENGINE_RPM      1000 non-null int64
SPEED           1000 non-null float64
THROTTLE_POS    1000 non-null object
dtypes: float64(1), int64(1), object(2)
memory usage: 39.1+ KB


In [17]:
df["EngineSpeedChangeRate"] = df["ENGINE_RPM"].diff().round(4)

In [18]:
df.head(20)

Unnamed: 0_level_0,ENGINE_LOAD,ENGINE_RPM,SPEED,THROTTLE_POS,EngineSpeedChangeRate
ord,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,43.1,816,32.0,21.2,
2,42.0,807,0.0,21.6,-9.0
3,50.6,1559,9.0,29.8,752.0
4,41.2,1692,10.0,29.0,133.0
5,55.7,863,11.0,25.9,-829.0
6,69.8,955,8.0,27.5,92.0
7,26.3,777,9.0,29.8,-178.0
8,47.1,1549,13.0,29.4,772.0
9,49.4,2106,12.0,31.8,557.0
10,62.4,1080,15.0,28.2,-1026.0


In [19]:
df['EngineSpeedChangeRate'] = df['EngineSpeedChangeRate'].shift(-1)

In [20]:
df.head(10)

Unnamed: 0_level_0,ENGINE_LOAD,ENGINE_RPM,SPEED,THROTTLE_POS,EngineSpeedChangeRate
ord,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,43.1,816,32.0,21.2,-9.0
2,42.0,807,0.0,21.6,752.0
3,50.6,1559,9.0,29.8,133.0
4,41.2,1692,10.0,29.0,-829.0
5,55.7,863,11.0,25.9,92.0
6,69.8,955,8.0,27.5,-178.0
7,26.3,777,9.0,29.8,772.0
8,47.1,1549,13.0,29.4,557.0
9,49.4,2106,12.0,31.8,-1026.0
10,62.4,1080,15.0,28.2,304.0


In [21]:
df.tail(10)

Unnamed: 0_level_0,ENGINE_LOAD,ENGINE_RPM,SPEED,THROTTLE_POS,EngineSpeedChangeRate
ord,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
991,100.0,1384,54.0,36.1,91.0
992,72.5,1475,55.0,32.2,55.0
993,50.2,1530,56.0,30.6,-54.0
994,15.3,1476,43.0,21.2,-302.0
995,21.6,1174,42.0,19.6,892.0
996,35.3,2066,52.0,35.3,-275.0
997,90.2,1791,56.0,35.3,-223.0
998,61.6,1568,57.0,31.0,3.0
999,61.6,1571,57.0,31.0,-14.0
1000,65.5,1557,55.0,31.4,


In [22]:
df.EngineSpeedChangeRate.fillna('1557', inplace= True)

## FINDING SPEED CHANGE RATE

In [23]:
#convert obj to float
df["SPEED"] = pd.to_numeric(df["SPEED"])

In [24]:
#convert float to int
df["SPEED"]=df["SPEED"].astype(np.int64)

In [25]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 1 to 1000
Data columns (total 5 columns):
ENGINE_LOAD              1000 non-null object
ENGINE_RPM               1000 non-null int64
SPEED                    1000 non-null int64
THROTTLE_POS             1000 non-null object
EngineSpeedChangeRate    1000 non-null object
dtypes: int64(2), object(3)
memory usage: 46.9+ KB


In [26]:
df["SpeedChangeRate"] = df["SPEED"].diff().round(4)

In [27]:
df['SpeedChangeRate'] = df['SpeedChangeRate'].shift(-1)

In [28]:
df.SpeedChangeRate.fillna('55', inplace= True)

In [29]:
df.head(10)

Unnamed: 0_level_0,ENGINE_LOAD,ENGINE_RPM,SPEED,THROTTLE_POS,EngineSpeedChangeRate,SpeedChangeRate
ord,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,43.1,816,32,21.2,-9,-32
2,42.0,807,0,21.6,752,9
3,50.6,1559,9,29.8,133,1
4,41.2,1692,10,29.0,-829,1
5,55.7,863,11,25.9,92,-3
6,69.8,955,8,27.5,-178,1
7,26.3,777,9,29.8,772,4
8,47.1,1549,13,29.4,557,-1
9,49.4,2106,12,31.8,-1026,3
10,62.4,1080,15,28.2,304,5


In [30]:
df.tail(10)

Unnamed: 0_level_0,ENGINE_LOAD,ENGINE_RPM,SPEED,THROTTLE_POS,EngineSpeedChangeRate,SpeedChangeRate
ord,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
991,100.0,1384,54,36.1,91,1
992,72.5,1475,55,32.2,55,1
993,50.2,1530,56,30.6,-54,-13
994,15.3,1476,43,21.2,-302,-1
995,21.6,1174,42,19.6,892,10
996,35.3,2066,52,35.3,-275,4
997,90.2,1791,56,35.3,-223,1
998,61.6,1568,57,31.0,3,0
999,61.6,1571,57,31.0,-14,-2
1000,65.5,1557,55,31.4,1557,55


### FINDING THROTTLE CHANGE RATE

In [31]:
#THROTTLE FILLING NAN
df[df.THROTTLE_POS.isnull()]

Unnamed: 0_level_0,ENGINE_LOAD,ENGINE_RPM,SPEED,THROTTLE_POS,EngineSpeedChangeRate,SpeedChangeRate
ord,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1


In [32]:
#df.THROTTLE_POS.value_counts()

In [33]:
#median_ect = df['THROTTLE_POS'].median()
#print (median_ect)

In [34]:
#df.THROTTLE_POS.fillna('25.1', inplace= True)

In [35]:
#convert obj to float
df["THROTTLE_POS"] = pd.to_numeric(df["THROTTLE_POS"],downcast='float')


In [36]:
df["ThrottleChangeRate"] = df["THROTTLE_POS"].diff().round(4)

In [37]:
df['ThrottleChangeRate'] = df['ThrottleChangeRate'].shift(-1)

In [38]:
df.ThrottleChangeRate.fillna('31.4', inplace= True)

In [39]:
df.tail(10)

Unnamed: 0_level_0,ENGINE_LOAD,ENGINE_RPM,SPEED,THROTTLE_POS,EngineSpeedChangeRate,SpeedChangeRate,ThrottleChangeRate
ord,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
991,100.0,1384,54,36.099998,91,1,-3.9
992,72.5,1475,55,32.200001,55,1,-1.6
993,50.2,1530,56,30.6,-54,-13,-9.4
994,15.3,1476,43,21.200001,-302,-1,-1.6
995,21.6,1174,42,19.6,892,10,15.7
996,35.3,2066,52,35.299999,-275,4,0.0
997,90.2,1791,56,35.299999,-223,1,-4.3
998,61.6,1568,57,31.0,3,0,0.0
999,61.6,1571,57,31.0,-14,-2,0.4
1000,65.5,1557,55,31.4,1557,55,31.4


In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 1 to 1000
Data columns (total 7 columns):
ENGINE_LOAD              1000 non-null object
ENGINE_RPM               1000 non-null int64
SPEED                    1000 non-null int64
THROTTLE_POS             1000 non-null float32
EngineSpeedChangeRate    1000 non-null object
SpeedChangeRate          1000 non-null object
ThrottleChangeRate       1000 non-null object
dtypes: float32(1), int64(2), object(4)
memory usage: 58.6+ KB


### FILLING ENGINE LOAD

In [41]:
df[df.ENGINE_LOAD.isnull()]

Unnamed: 0_level_0,ENGINE_LOAD,ENGINE_RPM,SPEED,THROTTLE_POS,EngineSpeedChangeRate,SpeedChangeRate,ThrottleChangeRate
ord,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1


In [42]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 1 to 1000
Data columns (total 7 columns):
ENGINE_LOAD              1000 non-null object
ENGINE_RPM               1000 non-null int64
SPEED                    1000 non-null int64
THROTTLE_POS             1000 non-null float32
EngineSpeedChangeRate    1000 non-null object
SpeedChangeRate          1000 non-null object
ThrottleChangeRate       1000 non-null object
dtypes: float32(1), int64(2), object(4)
memory usage: 58.6+ KB


In [43]:
#convert obj to float
df["ENGINE_LOAD"] = pd.to_numeric(df["ENGINE_LOAD"],downcast='float').round(4)

In [44]:
#convert obj to float
df["ThrottleChangeRate"] = pd.to_numeric(df["ThrottleChangeRate"],downcast='float')

In [45]:
#convert obj to float
df["EngineSpeedChangeRate"] = pd.to_numeric(df["EngineSpeedChangeRate"])
#convert float to int
df["EngineSpeedChangeRate"]=df["EngineSpeedChangeRate"].astype(np.int64)

In [46]:
#convert obj to float
df["SpeedChangeRate"] = pd.to_numeric(df["SpeedChangeRate"])
#convert float to int
df["SpeedChangeRate"]=df["SpeedChangeRate"].astype(np.int64)

In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 1 to 1000
Data columns (total 7 columns):
ENGINE_LOAD              1000 non-null float32
ENGINE_RPM               1000 non-null int64
SPEED                    1000 non-null int64
THROTTLE_POS             1000 non-null float32
EngineSpeedChangeRate    1000 non-null int64
SpeedChangeRate          1000 non-null int64
ThrottleChangeRate       1000 non-null float32
dtypes: float32(3), int64(4)
memory usage: 50.8 KB


## FINDING RELATIVE RATIO OF SPEED & ENGINE SPEED

In [48]:
#Applying formula -  Rcz(t)= cs(t)/220   /   zs(t)/8000
#Simplifying     -   Rcz(t)= 36.3636 * (cs(t)/zs(t))
#dividing cs/zs
df['RelRatioVSES'] = (df['SPEED']/df['ENGINE_RPM']).round(4)

#multiply with 36.3636
df['RelRatioVSES'] = (df['RelRatioVSES']*36.3636).round(4)
#convert obj to float
df["RelRatioVSES"] = pd.to_numeric(df["RelRatioVSES"],downcast='float')

In [49]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 1 to 1000
Data columns (total 8 columns):
ENGINE_LOAD              1000 non-null float32
ENGINE_RPM               1000 non-null int64
SPEED                    1000 non-null int64
THROTTLE_POS             1000 non-null float32
EngineSpeedChangeRate    1000 non-null int64
SpeedChangeRate          1000 non-null int64
ThrottleChangeRate       1000 non-null float32
RelRatioVSES             1000 non-null float32
dtypes: float32(4), int64(4)
memory usage: 54.7 KB


## FINDING RELATIVE RATIO OF THROTTLE & ENGINE SPEED

In [50]:
#Applying formula -  Rjz(t)= jq'(t)/max(jq'(t))   /   zs'(t)/max(zs'(t))
#Simplifying jq'(t)/zs'(t)  *  max/max
#find max(zs'(t))  -  convert obj to float
df["EngineSpeedChangeRate"] = pd.to_numeric(df["EngineSpeedChangeRate"],downcast='float')
x=df["EngineSpeedChangeRate"].max()

#find max(jq'(t))  -  convert obj to float
df["ThrottleChangeRate"] = pd.to_numeric(df["ThrottleChangeRate"],downcast='float')
y=df["ThrottleChangeRate"].max()

#max/max - x/y
z=x/y

#divide jq'  /  zs'
df['RelRatioTPES'] = (df['ThrottleChangeRate']/df['EngineSpeedChangeRate']).round(4)

#multiply with z
df['RelRatioTPES'] = (df['RelRatioTPES']*z).round(4)

#convert obj to float
df["RelRatioTPES"] = pd.to_numeric(df["RelRatioTPES"],downcast='float')

In [51]:
median_ect = df['RelRatioTPES'].median()

df.RelRatioTPES.fillna(median_ect, inplace= True)

In [52]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 1 to 1000
Data columns (total 9 columns):
ENGINE_LOAD              1000 non-null float32
ENGINE_RPM               1000 non-null int64
SPEED                    1000 non-null int64
THROTTLE_POS             1000 non-null float32
EngineSpeedChangeRate    1000 non-null float32
SpeedChangeRate          1000 non-null int64
ThrottleChangeRate       1000 non-null float32
RelRatioVSES             1000 non-null float32
RelRatioTPES             1000 non-null float32
dtypes: float32(6), int64(3)
memory usage: 54.7 KB


# CREATING A NEW DATAFRAME 

In [53]:
df2 = df[['RelRatioVSES','RelRatioTPES','ENGINE_LOAD']]

In [54]:
df2.head(10)

Unnamed: 0_level_0,RelRatioVSES,RelRatioTPES,ENGINE_LOAD
ord,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1.4255,-2.0576,43.099998
2,0.0,0.5051,42.0
3,0.2109,-0.278,50.599998
4,0.2145,0.1715,41.200001
5,0.4618,0.8063,55.700001
6,0.3055,-0.5978,69.800003
7,0.4218,-0.0232,26.299999
8,0.3055,0.1993,47.099998
9,0.2073,0.1622,49.400002
10,0.5055,0.3059,62.400002


### FINDING WHETHER MAINTENANCE NEEDED OR NOT

In [55]:
df2.info()


<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 1 to 1000
Data columns (total 3 columns):
RelRatioVSES    1000 non-null float32
RelRatioTPES    1000 non-null float32
ENGINE_LOAD     1000 non-null float32
dtypes: float32(3)
memory usage: 19.5 KB


In [56]:
#function to create a column MaintenanceReq
def f(row):
    global val
    if row['RelRatioVSES'] >= 0.9 and row['RelRatioVSES'] <= 1.3 :
        if row['RelRatioTPES'] >= 0.9 and row['RelRatioTPES'] <= 1.3 :
            if row['ENGINE_LOAD'] >= 20 and row['ENGINE_LOAD'] <= 50 :
                val = "NO"
    else:
         val = "YES"
    return val
#Creating column MaintenanceReq
df2 = df2.assign(MaintenanceReq=df2.apply(f, axis=1))


In [57]:
df2['MaintenanceReq'].value_counts()

YES    973
NO      27
Name: MaintenanceReq, dtype: int64

In [58]:
df2=df2.round(4)

df2.ENGINE_LOAD = df2.ENGINE_LOAD.round(4)
df2.head(20)


Unnamed: 0_level_0,RelRatioVSES,RelRatioTPES,ENGINE_LOAD,MaintenanceReq
ord,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1.4255,-2.0576,43.099998,YES
2,0.0,0.5051,42.0,YES
3,0.2109,-0.278,50.599998,YES
4,0.2145,0.1715,41.200001,YES
5,0.4618,0.8063,55.700001,YES
6,0.3055,-0.5978,69.800003,YES
7,0.4218,-0.0232,26.299999,YES
8,0.3055,0.1993,47.099998,YES
9,0.2073,0.1622,49.400002,YES
10,0.5055,0.3059,62.400002,YES


In [59]:
df2=df2.replace([np.inf, -np.inf], np.nan)

In [60]:
df2.RelRatioTPES.fillna('0.0', inplace= True)
df2["RelRatioTPES"] = pd.to_numeric(df2["RelRatioTPES"],downcast='float')

In [61]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1000 entries, 1 to 1000
Data columns (total 4 columns):
RelRatioVSES      1000 non-null float32
RelRatioTPES      1000 non-null float32
ENGINE_LOAD       1000 non-null float32
MaintenanceReq    1000 non-null object
dtypes: float32(3), object(1)
memory usage: 27.3+ KB


In [62]:
df2.describe()

Unnamed: 0,RelRatioVSES,RelRatioTPES,ENGINE_LOAD
count,1000.0,1000.0,1000.0
mean,0.817553,-0.369948,42.949135
std,0.418423,18.250523,20.049782
min,0.0,-454.146393,12.2
25%,0.6318,-0.5538,27.1
50%,0.8491,-0.0,40.0
75%,1.1091,0.6905,53.700001
max,2.0473,89.592003,100.0


# ENDGAME

In [63]:
#Creating Train & Test data from df
df2['is_train']= np.random.uniform(0,1,len(df2)) <= .75
df2.head(20)

Unnamed: 0_level_0,RelRatioVSES,RelRatioTPES,ENGINE_LOAD,MaintenanceReq,is_train
ord,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1,1.4255,-2.0576,43.099998,YES,False
2,0.0,0.5051,42.0,YES,False
3,0.2109,-0.278,50.599998,YES,True
4,0.2145,0.1715,41.200001,YES,True
5,0.4618,0.8063,55.700001,YES,True
6,0.3055,-0.5978,69.800003,YES,True
7,0.4218,-0.0232,26.299999,YES,False
8,0.3055,0.1993,47.099998,YES,False
9,0.2073,0.1622,49.400002,YES,True
10,0.5055,0.3059,62.400002,YES,True


In [64]:
#creating df with test rows and train rows
train, test = df2[df2['is_train']==True], df2[df2['is_train']==False]
#Show no of test & train observations
print('No of obs in train:',len(train))
print('No of obs in test:',len(test))

No of obs in train: 741
No of obs in test: 259


In [65]:
#Creating a list of feature column's name
features = df2.columns[:3]
#view feature
features

Index(['RelRatioVSES', 'RelRatioTPES', 'ENGINE_LOAD'], dtype='object')

In [66]:
#Creating target
y = pd.factorize(train['MaintenanceReq'])[0]
z= pd.factorize(test['MaintenanceReq'])[0]

In [67]:
print(train.describe())

       RelRatioVSES  RelRatioTPES  ENGINE_LOAD
count    741.000000    741.000000   741.000000
mean       0.801767     -0.397166    42.576267
std        0.421650     19.576778    20.005634
min        0.000000   -454.146393    12.200000
25%        0.607300     -0.528300    26.299999
50%        0.836400      0.000000    39.599998
75%        1.105500      0.690500    53.700001
max        2.047300     89.592003   100.000000


In [68]:
#Creating Random Forest Classifier
clf = RandomForestClassifier()
#Training the classifier
#clf.fit(train[features],y)
clf = Pipeline([("scale", StandardScaler()),
               ("clf", RandomForestClassifier(n_estimators=100, n_jobs=-1, verbose=2))])

In [69]:
train[features] = SimpleImputer().fit_transform(train[features])
clf=clf.fit(train[features],y)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    0.1s


building tree 1 of 100building tree 2 of 100

building tree 3 of 100
building tree 4 of 100building tree 5 of 100
building tree 6 of 100

building tree 7 of 100
building tree 8 of 100
building tree 9 of 100
building tree 10 of 100building tree 11 of 100

building tree 12 of 100
building tree 13 of 100
building tree 14 of 100building tree 15 of 100
building tree 16 of 100

building tree 17 of 100
building tree 18 of 100
building tree 19 of 100
building tree 20 of 100
building tree 21 of 100
building tree 22 of 100building tree 23 of 100

building tree 24 of 100
building tree 25 of 100
building tree 26 of 100
building tree 27 of 100
building tree 28 of 100
building tree 29 of 100
building tree 30 of 100
building tree 31 of 100
building tree 32 of 100
building tree 33 of 100
building tree 34 of 100
building tree 35 of 100
building tree 36 of 100
building tree 37 of 100building tree 38 of 100

building tree 39 of 100
building tree 40 of 100
building tree 41 of 100building tree 42 of 100

b

[Parallel(n_jobs=-1)]: Done 100 out of 100 | elapsed:    0.2s finished


In [70]:
test[features] = SimpleImputer().fit_transform(test[features])
preds=clf.predict(test[features])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self.obj[item] = s
[Parallel(n_jobs=4)]: Using backend ThreadingBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  33 tasks      | elapsed:    0.0s
[Parallel(n_jobs=4)]: Done 100 out of 100 | elapsed:    0.0s finished


In [71]:
test['MaintenanceReq'].head()

ord
1     YES
2     YES
7     YES
8     YES
14    YES
Name: MaintenanceReq, dtype: object

In [72]:
#create confusion matrix

pd.crosstab(test['MaintenanceReq'],preds, rownames=['Actual Values'], colnames=['Predicted Values'])

Predicted Values,0,1
Actual Values,Unnamed: 1_level_1,Unnamed: 2_level_1
NO,5,1
YES,252,1


In [73]:
#check accuracy
def checkAccuracy(clf):
    return accuracy_score(z,preds)
    

In [74]:
acc=checkAccuracy(clf)

#printing accuracy
print(acc*100)

97.68339768339769


In [77]:
#check for maintenance
#df2.MaintenanceReq.value_counts()
values = df2['MaintenanceReq'].value_counts().keys().tolist()
counts = df2['MaintenanceReq'].value_counts().tolist()

In [78]:
print(values)


['YES', 'NO']


In [80]:
print(counts)

[973, 27]


In [None]:
if(counts[0]>)