In [38]:
import pandas as pd
import numpy as np
import pickle
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
import logging as lg
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression

In [32]:
lg.basicConfig(filename='a141.txt', format='%(name)s - %(levelname)s - %(message)s')

In [92]:
class A14I:
    def __init__(self, file_path):
        self.file_path = file_path
        
    def read_data(self):
        try:
            self.logging_info("Reading data..")
            self.dataframe = pd.read_csv(self.file_path)
            self.logging_info('Reading data complete..')
        except Exception as e:
            self.logging_error('An error has occured. Error: '+str(e))
            print(e)
    
    def show_data(self):
        try:
            self.logging_info('Showing data..')
            print(self.dataframe.head())
        except Exception as e:
            self.logging_error('An error has occured while showing data. Error: ' + str(e))
            print(e)
            
    def pandas_profiling(self):
        try:
            self.logging_info('Performing Pandas profiling..')
            self.dataframe.profile_report().to_widgets()
            self.logging_info('Pandas profiling report generated..')
        except Exception as e:
            self.logging_error('An error has occured while doing pandas profiling. Error: ' + str(e))
            print(e)
    
    def transform_data(self, cols):
        try:
            self.logging_info('Transforming data')
            self.dataframe[cols] = np.log1p(self.dataframe[cols])
            
        except Exception as e:
            self.logging_error('An error occured while transforming data. Error: ' + str(e))
            print(e)
            
    def data_split(self, cols):
        try:
            self.logging_info('Splitting data into train test split')
            x = self.dataframe.drop(cols, axis=1)
            y = self.dataframe[['Air temperature [K]']]
            self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(x, y, test_size=0.3, random_state=42)
            self.logging_info('Splitting data into train test split complete..')
        except Exception as e:
            self.logging_error('An error has occured in data splitting. Error: ' + str(e))
            print(e)
       
    def scale_data(self, cols):
        try:
            self.logging_info('scaling data')
            self.scaler = StandardScaler()
            self.X_train[cols] = self.scaler.fit_transform(self.X_train[cols])
            self.X_test[cols] = self.scaler.transform(self.X_test[cols])
            self.X_train = np.array(self.X_train)
            self.X_test = np.array(self.X_test)
            print(self.X_train)
            self.logging_info('Scaling data complete..')
        except Exception as e:
            self.logging_error('An error has occured while scalinf data. Error: ' + str(e))
            print(e)
    
    def train_model(self):
        try:
            self.logging_info('Training model') 
            self.linear = LinearRegression()
            self.linear.fit(self.X_train, self.y_train)
            self.logging_info('Training complete')
        except Exception as e:
            self.logging_error('An error has occured while training model. Error: ' + str(e))
            print(e)
    
    def get_accuracy(self):
        try:
            self.logging_info('Getting accuracy of the model')
            print('Training Score: ', self.linear.score(self.X_train, self.y_train))
            print('Testing Score: ', self.linear.score(self.X_test, self.y_test))
        except Exception as e:
            self.logging_error('An error has occured while getting accuracy of model. Error: ' + str(e))
            print(e)
            
    def save_model(self):
        try:
            self.logging_info('Saving the final model')
            with open('linear_regressor.sav','wb') as f:
                pickle.dump(self.linear, f)
                
            with open('standard_scaler.sav','wb') as f:
                pickle.dump(self.scaler, f)
            self.logging_info('Model saved')
        except Exception as e:
            self.logging_error('An error has occured while sving model. error: ' + str(e))
            print(e)
            
    def test_cases(self, cols, scale):
        try:
            regressor = None
            scaler = None
            with open('linear_regressor.sav','rb') as f:
                regressor = pickle.load(f)
            with open('standard_scaler.sav','rb') as f:
                scaler = pickle.load(f)
            self.dataframe = self.dataframe.sample(n = 10).drop(cols, axis=1)
            x = self.dataframe.drop('Air temperature [K]', axis=1)
            y = self.dataframe[['Air temperature [K]']]
            x[scale] = scaler.transform(x[scale])
            y_pred = regressor.predict(x)
            print('Predicted      Original')
            for i in range(len(y_pred)):
                print(y_pred[i], y.iloc[i,:].values[0])

        except Exception as e:
            self.logging_error('An error has occured while testing model. Error: ' + str(e))
            print(e)
            
    def logging_info(self, message):
        try:
            lg.info(message)
        except Exception as e:
            print(e)
    
    def logging_error(self, message):
        try:
            lg.error(message)
        except Exception as e:
            print(e)

In [93]:
obj = A14I('ai4i2020.csv')
obj.read_data()
obj.show_data()

   UDI Product ID Type  Air temperature [K]  Process temperature [K]  \
0    1     M14860    M                298.1                    308.6   
1    2     L47181    L                298.2                    308.7   
2    3     L47182    L                298.1                    308.5   
3    4     L47183    L                298.2                    308.6   
4    5     L47184    L                298.2                    308.7   

   Rotational speed [rpm]  Torque [Nm]  Tool wear [min]  Machine failure  TWF  \
0                    1551         42.8                0                0    0   
1                    1408         46.3                3                0    0   
2                    1498         49.4                5                0    0   
3                    1433         39.5                7                0    0   
4                    1408         40.0                9                0    0   

   HDF  PWF  OSF  RNF  
0    0    0    0    0  
1    0    0    0    0  
2    0  

In [35]:
obj.pandas_profiling()

Summarize dataset:   0%|          | 0/27 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

## Observations
- Air Temperature: 

    No missing values
    
    Has a mean of 300.00493
    
    Standard Deviation of 2.000258683
    
    Highly Possitive Correlatation with Process Temperature
    
    Is rougly normally distributed
   
   
- Process Temperature:
    
    No missing values
    
    Has a mean of 310.00556
    
    Standard Deviation of 1.483734219
    
    Highly Possitive Correlatation with Air Temperature
    
    Is roughly normally distributed
    
    
- Rotational Speed:
    
    No missing values
    
    Has a mean of 1538.7761
    
    Standard Deviation of 179.2840959
    
    Has high correlation negative correlation with Torque
    
    Is normally distributed
    
    with a slight positive skewness
    
    
- Torque:

    No missing value
    
    Has a mean of 39.98691
    
    Standard Deviation of 9.968933725
    
    Has high correlation negative correlation with Rotational Speed
    
    Is normally distributed
    
    
- Tool wear:
    
    No missing value
    
    Has a mean of 107.951
    
    Standard Deviation of 63.65414664
    
    Does't has any noticable correlation
    
    Not normally distributed
    
    
- Machine Failure:

    3.4%(339 out of 10000) of the times the machine has failed.
   
   
- TWF:

    no missing value
    
    very few times times failed due to this
    
    has medium correlation with Machine Failure
    
    
- HDF:

    no missing value
    
    very few times times failed due to this

In [44]:
# With all data
obj.data_split(['UDI', 'Product ID', 'Type'])
obj.scale_data()
obj.train_model()
obj.get_accuracy()

Training Score:  0.7787362531479851
Testing Score:  0.7683008770776208


## Transforming Tool wear [min] using log

In [56]:

obj.transform_data(['Tool wear [min]'])
obj.pandas_profiling()

Summarize dataset:   0%|          | 0/27 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render widgets:   0%|          | 0/1 [00:00<?, ?it/s]

VBox(children=(Tab(children=(Tab(children=(GridBox(children=(VBox(children=(GridspecLayout(children=(HTML(valu…

## Even after scaling can't see any relation between Tool wear [min] and Air temperature [K]

In [58]:
# After removing Tool wear
obj.data_split(['UDI', 'Product ID', 'Type', 'Air temperature [K]', 'Tool wear [min]'])
obj.scale_data()
obj.train_model()
obj.get_accuracy()

Training Score:  0.7787213636314864
Testing Score:  0.7683146095692723


## Rotational speed is showing high signs of multicolinearity and is event contributing much towards Air temperature [K]

In [94]:
# After removing Rotational speed & Tool wear
obj.data_split(['UDI', 'Product ID', 'Type', 'Air temperature [K]', 'Tool wear [min]', 'Rotational speed [rpm]'])
obj.scale_data(['Process temperature [K]', 'Torque [Nm]'])
obj.train_model()
obj.get_accuracy()

[[-1.21373792 -1.19891696  0.         ...  0.          0.
   0.        ]
 [-0.54272491  0.61807336  0.         ...  0.          0.
   0.        ]
 [ 1.33611152  2.06567554  0.         ...  0.          0.
   0.        ]
 ...
 [ 1.53741543  0.70792453  0.         ...  0.          0.
   0.        ]
 [-2.08605484 -0.74966111  0.         ...  0.          0.
   0.        ]
 [ 0.2624907  -1.42853662  0.         ...  0.          0.
   0.        ]]
Training Score:  0.7786319613182082
Testing Score:  0.7683507979018429


In [91]:
obj.save_model()

# we can see accuracy increased after removing Rotational speed & Tool wear

In [83]:
obj.test_cases(['UDI', 'Product ID', 'Type', 'Tool wear [min]', 'Rotational speed [rpm]'])

Predicted      Original
[299.54563674] 300.7
[298.55581453] 297.5
[303.0124882] 303.1
[302.46529148] 303.6
[302.06319191] 302.4
[301.03325926] 301.4
[301.74753606] 302.2
[298.19615413] 297.3
[298.92643258] 298.8
[299.25582898] 299.9
