In [1]:
import pandas as pd
import numpy as np
import pickle
from pandas_profiling import ProfileReport
import matplotlib.pyplot as plt
import logging as lg
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
import statsmodels.formula.api as smf
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import Ridge, Lasso, RidgeCV, LassoCV, ElasticNet, ElasticNetCV

In [2]:
lg.basicConfig(filename='a141.txt', format='%(name)s - %(levelname)s - %(message)s')

In [3]:
class A14I:
    def __init__(self, file_path):
        self.file_path = file_path
        
    def read_data(self):
        try:
            self.logging_info("Reading data..")
            self.dataframe = pd.read_csv(self.file_path)
            self.logging_info('Reading data complete..')
        except Exception as e:
            self.logging_error('An error has occured. Error: '+str(e))
            print(e)
    
    def show_data(self):
        try:
            self.logging_info('Showing data..')
            return self.dataframe
        except Exception as e:
            self.logging_error('An error has occured while showing data. Error: ' + str(e))
            print(e)
            
    def pandas_profiling(self):
        try:
            self.logging_info('Performing Pandas profiling..')
            return ProfileReport(self.dataframe)
            self.logging_info('Pandas profiling report generated..')
        except Exception as e:
            self.logging_error('An error has occured while doing pandas profiling. Error: ' + str(e))
            print(e)
    
    def transform_data(self, cols):
        try:
            self.logging_info('Transforming data')
            self.dataframe[cols] = np.log1p(self.dataframe[cols])
            
        except Exception as e:
            self.logging_error('An error occured while transforming data. Error: ' + str(e))
            print(e)
            
    def data_split(self, cols):
        try:
            self.logging_info('Splitting data into train test split')
            x = self.dataframe.drop(cols, axis=1)
            y = self.dataframe[['Air_temperature']]
            self.X_train, self.X_test, self.y_train, self.y_test = train_test_split(x, y, test_size=0.3, random_state=42)
            self.logging_info('Splitting data into train test split complete..')
        except Exception as e:
            self.logging_error('An error has occured in data splitting. Error: ' + str(e))
            print(e)
       
    def scale_data(self, cols):
        try:
            self.logging_info('scaling data')
            self.scaler = StandardScaler()
            self.X_train[cols] = self.scaler.fit_transform(self.X_train[cols])
            self.X_test[cols] = self.scaler.transform(self.X_test[cols])
            self.X_train = np.array(self.X_train)
            self.X_test = np.array(self.X_test)
            print(self.X_train)
            self.logging_info('Scaling data complete..')
        except Exception as e:
            self.logging_error('An error has occured while scalinf data. Error: ' + str(e))
            print(e)
    
    def train_model(self):
        try:
            self.logging_info('Training model') 
            self.linear = LinearRegression()
            self.linear.fit(self.X_train, self.y_train)
            self.logging_info('Training complete')
        except Exception as e:
            self.logging_error('An error has occured while training model. Error: ' + str(e))
            print(e)
    
    def get_accuracy(self):
        try:
            self.logging_info('Getting accuracy of the model')
            r2 = self.linear.score(self.X_train, self.y_train)
            n = self.X_train.shape[0]
            p = self.X_train.shape[1]
            train_accuracy =  1- (1-r2)*(n-1)/(n-p-1)
            
            r2 = self.linear.score(self.X_test, self.y_test)
            n = self.X_test.shape[0]
            p = self.X_test.shape[1]
            test_accuracy =  1- (1-r2)*(n-1)/(n-p-1)
            print('Training Score: ', train_accuracy)
            print('Testing Score: ', test_accuracy)
        except Exception as e:
            self.logging_error('An error has occured while getting accuracy of model. Error: ' + str(e))
            print(e)
            
    def lasso(self):
        try:
            self.logging_info('Training model') 
            lassocv = LassoCV(alphas=None,cv=10,max_iter=500000, normalize=True)
            lassocv.fit(self.X_train, self.y_train)
            self.linear = Lasso(alpha=lassocv.alpha_)
            self.linear.fit(self.X_train, self.y_train)
            self.logging_info('Training complete')
        except Exception as e:
            self.logging_error('An error has occured while training model. Error: ' + str(e))
            print(e)
            
    def ridge(self):
        try:
            self.logging_info('Training model') 
            alpha = np.random.uniform(0, 10, 50)
            ridgecv = RidgeCV(alphas = alpha,cv=10, normalize=True)
            ridgecv.fit(self.X_train, self.y_train)
            self.linear = Ridge(alpha=ridgecv.alpha_)
            self.linear.fit(self.X_train, self.y_train)
            self.logging_info('Training complete')
        except Exception as e:
            self.logging_error('An error has occured while training model. Error: ' + str(e))
            print(e)
            
    def elastic_net(self):
        try:
            self.logging_info('Training model') 
            elasticcv = ElasticNetCV(alphas=None, cv=10)
            elasticcv.fit(self.X_train, self.y_train)
            self.linear = ElasticNet(alpha=elasticcv.alpha_)
            self.linear.fit(self.X_train, self.y_train)
            self.logging_info('Training complete')
        except Exception as e:
            self.logging_error('An error has occured while training model. Error: ' + str(e))
            print(e)
            
    def save_model(self):
        try:
            self.logging_info('Saving the final model')
            with open('linear_regressor.sav','wb') as f:
                pickle.dump(self.linear, f)
                
            with open('standard_scaler.sav','wb') as f:
                pickle.dump(self.scaler, f)
            self.logging_info('Model saved')
        except Exception as e:
            self.logging_error('An error has occured while sving model. error: ' + str(e))
            print(e)
            
    def test_cases(self, cols, scale):
        try:
            regressor = None
            scaler = None
            with open('linear_regressor.sav','rb') as f:
                regressor = pickle.load(f)
            with open('standard_scaler.sav','rb') as f:
                scaler = pickle.load(f)
            self.dataframe = self.dataframe.sample(n = 10).drop(cols, axis=1)
            x = self.dataframe.drop('Air_temperature', axis=1)
            y = self.dataframe[['Air_temperature']]
            x[scale] = scaler.transform(x[scale])
            y_pred = regressor.predict(x)
            print('Predicted      Original')
            for i in range(len(y_pred)):
                print(y_pred[i], y.iloc[i,:].values[0])

        except Exception as e:
            self.logging_error('An error has occured while testing model. Error: ' + str(e))
            print(e)
    
    def rename_columns(self, columns):
        try:
            self.logging_info('renaming columns')
            self.dataframe.rename(columns = columns, inplace=True)
        except Exception as e:
            self.logging_error('An error has occured wi;e renaming columns. Error: ' + str(e))
            print(e)
        
    def drop_data(self, data, axis=0):
        self.dataframe.drop(data, axis=axis, inplace=True)
        return self.dataframe
    
    def perform_ols(self, formula):
        try:
            self.logging_info('renaming columns')
            lm = smf.ols(formula=formula, data=self.dataframe).fit()
            return lm.summary()
        except Exception as e:
            self.logging_error('An error has occured while performing ols. Error: ' + str(e))
            print(e)
            
    def logging_info(self, message):
        try:
            lg.info(message)
        except Exception as e:
            print(e)
    
    def logging_error(self, message):
        try:
            lg.error(message)
        except Exception as e:
            print(e)

In [4]:
obj = A14I('ai4i2020.csv')
obj.read_data()
obj.show_data()

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,1,M14860,M,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,2,L47181,L,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,3,L47182,L,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,4,L47183,L,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,5,L47184,L,298.2,308.7,1408,40.0,9,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9996,M24855,M,298.8,308.4,1604,29.5,14,0,0,0,0,0,0
9996,9997,H39410,H,298.9,308.4,1632,31.8,17,0,0,0,0,0,0
9997,9998,M24857,M,299.0,308.6,1645,33.4,22,0,0,0,0,0,0
9998,9999,H39412,H,299.0,308.7,1408,48.5,25,0,0,0,0,0,0


In [14]:
report = obj.pandas_profiling()

In [15]:
report.to_file('report.html')

Summarize dataset:   0%|          | 0/27 [00:00<?, ?it/s]

Generate report structure:   0%|          | 0/1 [00:00<?, ?it/s]

Render HTML:   0%|          | 0/1 [00:00<?, ?it/s]

Export report to file:   0%|          | 0/1 [00:00<?, ?it/s]

## Inferences
- Air Temperature: 

    No missing values
    
    Has a mean of 300.00493
    
    Standard Deviation of 2.000258683
    
    Highly Possitive Correlatation with Process Temperature
    
    Is rougly normally distributed
   
   
- Process Temperature:
    
    No missing values
    
    Has a mean of 310.00556
    
    Standard Deviation of 1.483734219
    
    Highly Possitive Correlatation with Air Temperature
    
    Is roughly normally distributed
    
    
- Rotational Speed:
    
    No missing values
    
    Has a mean of 1538.7761
    
    Standard Deviation of 179.2840959
    
    Has high correlation negative correlation with Torque
    
    Is normally distributed
    
    with a slight positive skewness
    
    
- Torque:

    No missing value
    
    Has a mean of 39.98691
    
    Standard Deviation of 9.968933725
    
    Has high correlation negative correlation with Rotational Speed
    
    Is normally distributed
    
    
- Tool wear:
    
    No missing value
    
    Has a mean of 107.951
    
    Standard Deviation of 63.65414664
    
    Does't has any noticable correlation
    
    Not normally distributed
    
    
- Machine Failure:

    3.4%(339 out of 10000) of the times the machine has failed.
   
   
- TWF:

    no missing value
    
    very few times times failed due to this
    
    has medium correlation with Machine Failure
    
    
- HDF:

    no missing value
    
    very few times times failed due to this

In [5]:
columns = obj.drop_data(['UDI', 'Product ID', 'Type'], 1).columns

In [6]:
col = {'Air temperature [K]': 'Air_temperature',
 'Process temperature [K]': 'Process_temperature',
 'Rotational speed [rpm]': 'Rotational_speed',
 'Torque [Nm]': 'Torque',
 'Tool wear [min]': 'Tool_wear',
 'Machine failure': 'Machine_failure',
 'TWF': 'TWF',
 'HDF': 'HDF',
 'PWF': 'PWF',
 'OSF': 'OSF',
 'RNF': 'RNF'}

In [7]:
obj.rename_columns(col)

In [8]:
columns = obj.show_data().columns

In [10]:
obj.show_data()

Unnamed: 0,Air_temperature,Process_temperature,Rotational_speed,Torque,Tool_wear,Machine_failure,TWF,HDF,PWF,OSF,RNF
0,298.1,308.6,1551,42.8,0,0,0,0,0,0,0
1,298.2,308.7,1408,46.3,3,0,0,0,0,0,0
2,298.1,308.5,1498,49.4,5,0,0,0,0,0,0
3,298.2,308.6,1433,39.5,7,0,0,0,0,0,0
4,298.2,308.7,1408,40.0,9,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...
9995,298.8,308.4,1604,29.5,14,0,0,0,0,0,0
9996,298.9,308.4,1632,31.8,17,0,0,0,0,0,0
9997,299.0,308.6,1645,33.4,22,0,0,0,0,0,0
9998,299.0,308.7,1408,48.5,25,0,0,0,0,0,0


In [9]:
columns

Index(['Air_temperature', 'Process_temperature', 'Rotational_speed', 'Torque',
       'Tool_wear', 'Machine_failure', 'TWF', 'HDF', 'PWF', 'OSF', 'RNF'],
      dtype='object')

In [21]:
formula = 'Air_temperature ~ '
for i in columns:
    if i != 'Air_temperature':
        formula += f'{i} +'
formula = formula[:-1]

In [22]:
formula

'Air_temperature ~ Process_temperature +Rotational_speed +Torque +Tool_wear +Machine_failure +TWF +HDF +PWF +OSF +RNF '

In [23]:
obj.perform_ols(formula)

0,1,2,3
Dep. Variable:,Air_temperature,R-squared:,0.776
Model:,OLS,Adj. R-squared:,0.775
Method:,Least Squares,F-statistic:,3454.0
Date:,"Mon, 30 Aug 2021",Prob (F-statistic):,0.0
Time:,09:32:59,Log-Likelihood:,-13649.0
No. Observations:,10000,AIC:,27320.0
Df Residuals:,9989,BIC:,27400.0
Df Model:,10,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
Intercept,-64.2001,2.000,-32.107,0.000,-68.120,-60.281
Process_temperature,1.1738,0.006,183.284,0.000,1.161,1.186
Rotational_speed,0.0002,0.000,1.582,0.114,-4.55e-05,0.000
Torque,0.0003,0.002,0.123,0.902,-0.004,0.005
Tool_wear,8.455e-05,0.000,0.557,0.578,-0.000,0.000
Machine_failure,-0.0901,0.180,-0.500,0.617,-0.443,0.263
TWF,0.2018,0.218,0.923,0.356,-0.227,0.630
HDF,1.7793,0.191,9.334,0.000,1.406,2.153
PWF,0.1420,0.186,0.763,0.445,-0.223,0.507

0,1,2,3
Omnibus:,647.802,Durbin-Watson:,0.074
Prob(Omnibus):,0.0,Jarque-Bera (JB):,241.32
Skew:,-0.091,Prob(JB):,3.96e-53
Kurtosis:,2.261,Cond. No.,334000.0


In [24]:
obj.data_split(['Air_temperature', 'Rotational_speed', 'Torque',
       'Tool_wear', 'Machine_failure', 'TWF', 'PWF', 'OSF', 'RNF'])
obj.scale_data(['Process_temperature'])
obj.train_model()
obj.get_accuracy()

[[-1.21373792  0.        ]
 [-0.54272491  0.        ]
 [ 1.33611152  0.        ]
 ...
 [ 1.53741543  0.        ]
 [-2.08605484  0.        ]
 [ 0.2624907   0.        ]]
Training Score:  0.7783756504269036
Testing Score:  0.7678701414387523


In [139]:
obj.data_split(['Air_temperature', 'Rotational_speed', 'Torque',
       'Tool_wear', 'Machine_failure', 'TWF', 'PWF', 'OSF', 'RNF'])
obj.scale_data(['Process_temperature'])
obj.lasso()
obj.get_accuracy()

[[-1.21373792  0.        ]
 [-0.54272491  0.        ]
 [ 1.33611152  0.        ]
 ...
 [ 1.53741543  0.        ]
 [-2.08605484  0.        ]
 [ 0.2624907   0.        ]]


  return f(*args, **kwargs)


Training Score:  0.7783756417386015
Testing Score:  0.7678707372225946


In [140]:
obj.data_split(['Air_temperature', 'Rotational_speed', 'Torque',
       'Tool_wear', 'Machine_failure', 'TWF', 'PWF', 'OSF', 'RNF'])
obj.scale_data(['Process_temperature'])
obj.ridge()
obj.get_accuracy()

[[-1.21373792  0.        ]
 [-0.54272491  0.        ]
 [ 1.33611152  0.        ]
 ...
 [ 1.53741543  0.        ]
 [-2.08605484  0.        ]
 [ 0.2624907   0.        ]]
Training Score:  0.7783755850678651
Testing Score:  0.7678717130798869


In [141]:
obj.data_split(['Air_temperature', 'Rotational_speed', 'Torque',
       'Tool_wear', 'Machine_failure', 'TWF', 'PWF', 'OSF', 'RNF'])
obj.scale_data(['Process_temperature'])
obj.elastic_net()
obj.get_accuracy()

[[-1.21373792  0.        ]
 [-0.54272491  0.        ]
 [ 1.33611152  0.        ]
 ...
 [ 1.53741543  0.        ]
 [-2.08605484  0.        ]
 [ 0.2624907   0.        ]]


  return f(*args, **kwargs)


Training Score:  0.7780362959159719
Testing Score:  0.7677583479709937


In [25]:
obj.save_model()

In [83]:
obj.test_cases(['UDI', 'Product ID', 'Type', 'Tool wear [min]', 'Rotational speed [rpm]'])

Predicted      Original
[299.54563674] 300.7
[298.55581453] 297.5
[303.0124882] 303.1
[302.46529148] 303.6
[302.06319191] 302.4
[301.03325926] 301.4
[301.74753606] 302.2
[298.19615413] 297.3
[298.92643258] 298.8
[299.25582898] 299.9


In [12]:
final_data = obj.show_data().drop(['Rotational_speed', 'Torque',
       'Tool_wear', 'Machine_failure', 'TWF', 'PWF', 'OSF', 'RNF'], axis=1)
final_data.to_csv('final_data.csv', index=False)
