In [1]:
import pandas as pd
import numpy as np
from datatransform import df
from datatransform import DataTransform


# Run the project-local DataTransform helper on the shared dataframe before any
# of the transformations below.
# NOTE(review): to_categorical presumably converts the 'Type' column to a
# categorical dtype in place (its return value is not used here) - confirm in
# the datatransform module.
transformer = DataTransform()    
transformer.to_categorical(df, col="Type") 


class DataFrameTransform:
    '''
    Collection of stateless helpers used to perform the EDA transformation
    of the dataframe data.

    Every helper is a staticmethod that takes the dataframe as its first
    argument. Two families exist and must not be confused:

    * helpers that MODIFY the dataframe they receive and return that same
      object: imput_null, tw_min, map_type, map_pro_id, drop_col
    * helpers that leave their input untouched and return a NEW object, so
      the caller has to keep the return value: tran_col, rm_rpm_outl,
      rm_tq_outl and all range_* helpers

    methods:
    ---------
    imput_null()

    tw_min()

    tran_col()

    drop_col()

    save_df()

    map_type()

    map_pro_id()

    rm_rpm_outl()

    rm_tq_outl()

    range_seldfcol()

    range_sel_df_col()

    range_sel_df_col_H()

    range_sel_df_col_M()

    range_sel_df_col_L()
    '''

    # Numeric columns summarised by the range_* helpers.
    _RANGE_COLS = (
        'Air temperature [K]',
        'Process temperature [K]',
        'Rotational speed [rpm]',
        'Torque [Nm]',
        'Tool wear [min]',
    )

    @staticmethod
    def imput_null(df):
        '''
        this method imputes the null values of the columns
        'Air temperature [K]', 'Process temperature [K]' and 'Tool wear [min]'
        with the mean of the respective column
        (task3, step 3)

        The dataframe is modified in place.

        parameters:
        ---------
        df:dataframe

        returns:
        -------
        the same dataframe object that was passed in
        '''
        mean_cols = ('Air temperature [K]', 'Process temperature [K]', 'Tool wear [min]')
        # The means are computed before filling, so each one ignores the nulls
        # it is about to replace.
        df.fillna({name: df[name].mean() for name in mean_cols}, inplace=True)
        return df

    @staticmethod
    def tw_min(df):
        '''
        this method transforms the dataframe column 'Tool wear [min]' to
        minutes from seconds by dividing it by 60

        The dataframe is modified in place, so calling this method twice on
        the same dataframe divides the column by 60 twice.

        parameters:
        ----------
        df:dataframe

        returns:
        -------
        the same dataframe object that was passed in
        '''
        df['Tool wear [min]'] = df['Tool wear [min]'] / 60
        return df

    @staticmethod
    def tran_col(df):
        '''
        this method is used to log-transform the dataframe column
        'Rotational speed [rpm]' to reduce its skewness

        The input dataframe is NOT modified; the transformation is applied to
        a copy. Values that are not greater than 0 (this includes nulls) are
        mapped to 0 instead of being passed to the logarithm.

        parameters:
        ---------
        df:dataframe

        returns:
        -------
        a copy of df with the transformed 'Rotational speed [rpm]' column
        '''
        df = df.copy()
        log_trans = df['Rotational speed [rpm]'].map(lambda i: np.log(i) if i > 0 else 0)
        df['Rotational speed [rpm]'] = log_trans
        return df

    @staticmethod
    def map_type(df):
        '''
        In the df i noticed the column 'Tool wear [min]' does not correlate with
        the product quality type H:M:L which should have this value of 'Tool wear [min]'
        5:3:2 respectively.
        This method corrects that by overwriting the 'Type' column with a quality
        type derived from 'Tool wear [min]'. This is done by mapping the function
        map_tl() to the 'Tool wear [min]' column.

        The dataframe is modified in place.

        parameters:
        ---------
        df:dataframe
        'Type': Quality of the product being created
                (L, M, or H, for low, medium and high quality products)
        'Tool wear [min]':The current minutes of wear on the tool. H, M and L product
                          manufacturing cause 5/3/2 minutes of tool wear.

        returns:
        -------
        the same dataframe object that was passed in

        methods:
        -------
        map_tl()
        '''
        def map_tl(x):
            '''
            this method returns the quality type for one 'Tool wear [min]' value:
            'H' for 3 <= x <= 5, 'M' for 2 <= x < 3, 'L' for 0 <= x < 2.
            Any other value (negative, above 5 or null) gives None.

            parameters:
            ---------
            x:dataframe column 'Tool wear [min]' values
            '''
            if 3 <= x <= 5:
                return 'H'
            if 2 <= x < 3:
                return 'M'
            if 0 <= x < 2:
                return 'L'
            # Out-of-range and null values fall through; made explicit here.
            return None

        df['Type'] = df['Tool wear [min]'].map(map_tl)
        return df

    @staticmethod
    def map_pro_id(df):
        '''
        This method corrects the non numeric part of the 'Product ID' column which is the same as
        the column 'Type' by replacing the first character of 'Product ID' with the
        (modified) value of df['Type'].

        Both columns are converted to strings first, so a row whose 'Type' is
        missing gets the string form of that missing value as its prefix.
        The dataframe is modified in place.

        parameters
        --------
        df:dataframe
        'Product ID':Product specific serial number column
        'Type':Product quality type

        returns:
        -------
        the same dataframe object that was passed in
        '''
        serial = df['Product ID'].astype("str").str.slice(1)
        df['Product ID'] = df['Type'].astype("str") + serial
        return df

    @staticmethod
    def rm_rpm_outl(df, max_rpm=1830):
        '''
        This method is used to remove the outliers from the 'Rotational speed [rpm]' column
        of the dataframe

        The input dataframe is NOT modified: the filtered rows are returned
        as a new dataframe and the caller must keep the return value.

        parameters:
        ----------
        df:dataframe
        max_rpm:largest 'Rotational speed [rpm]' value that is kept (default 1830)

        returns:
        -------
        dataframe holding only the rows with 'Rotational speed [rpm]' <= max_rpm
        '''
        return df[df['Rotational speed [rpm]'] <= max_rpm]

    @staticmethod
    def rm_tq_outl(df, max_torque=66):
        '''
        This method is used to remove the outliers from the 'Torque [Nm]' column
        of the dataframe

        The input dataframe is NOT modified: the filtered rows are returned
        as a new dataframe and the caller must keep the return value.

        parameters:
        ----------
        df:dataframe
        max_torque:largest 'Torque [Nm]' value that is kept (default 66)

        returns:
        -------
        dataframe holding only the rows with 'Torque [Nm]' <= max_torque
        '''
        return df[df['Torque [Nm]'] <= max_torque]

    @staticmethod
    def drop_col(df, col):
        '''
        This method is used to drop unwanted columns of the dataframe

        The dataframe is modified in place.

        parameters:
        ---------
        df:dataframe
        col:dataframe column to be dropped, or a list of columns to be dropped

        returns:
        -------
        the same dataframe object that was passed in
        '''
        # A single label is wrapped so one code path serves both forms.
        labels = col if isinstance(col, list) else [col]
        df.drop(columns=labels, inplace=True)
        return df

    @staticmethod
    def save_df(df, path='new_df.csv'):
        '''
        This method is used to save Transformed data of the dataframe to your local
        machine as a CSV file without the index column

        parameters:
        df:dataframe to be saved
        path:file the dataframe is written to (default 'new_df.csv')
        '''
        df.to_csv(path, index=False)

    @staticmethod
    def range_seldfcol(df):
        '''
        returns a dataframe of the min and max values of the selected dataframe columns

        parameters:
        ----------
        df: dataframe
        '''
        return df[list(DataFrameTransform._RANGE_COLS)].agg(['min', 'max'])

    @staticmethod
    def range_sel_df_col(df, product_type):
        '''
        returns a dataframe of the min and max values of the selected dataframe columns,
        restricted to the rows whose 'Type' equals product_type

        parameters:
        ----------
        df: dataframe
        product_type: value of the 'Type' column to keep, e.g. 'H', 'M' or 'L'
        '''
        wanted_rows = df['Type'] == product_type
        selected = df.loc[wanted_rows, list(DataFrameTransform._RANGE_COLS)]
        return selected.agg(['min', 'max'])

    @staticmethod
    def range_sel_df_col_H(df):
        '''
        returns a dataframe of the min and max values of the selected dataframe columns
        for the rows of Type 'H'

        parameters:
        ----------
        df: dataframe
        '''
        return DataFrameTransform.range_sel_df_col(df, 'H')

    @staticmethod
    def range_sel_df_col_M(df):
        '''
        returns a dataframe of the min and max values of the selected dataframe columns
        for the rows of Type 'M'

        parameters:
        ----------
        df: dataframe
        '''
        return DataFrameTransform.range_sel_df_col(df, 'M')

    @staticmethod
    def range_sel_df_col_L(df):
        '''
        returns a dataframe of the min and max values of the selected dataframe columns
        for the rows of Type 'L'

        parameters:
        ----------
        df: dataframe
        '''
        return DataFrameTransform.range_sel_df_col(df, 'L')
        

# Create the handle used by the cells below. Every method of
# DataFrameTransform is a staticmethod, so the instance carries no state and
# only serves as a short name for calling them.
if __name__ == "__main__":   
    transfm = DataFrameTransform()
    

In [2]:
transfm.imput_null(df)
#milestone3, task3, step3

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,3416,L50595,L,301.4,310.4,1579,36.9,133.0,0,0,0,0,0,0
1,7130,L54309,L,300.6,310.0,1635,31.0,107.0,0,0,0,0,0,0
2,2320,M17179,M,299.2,308.8,1700,33.5,185.0,0,0,0,0,0,0
3,9601,M24460,M,298.9,310.0,1561,45.3,58.0,0,0,0,0,0,0
4,614,L47793,L,298.1,309.9,1634,30.0,53.0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9018,M23877,M,297.3,308.1,1369,52.0,214.0,0,0,0,0,0,0
9996,3339,M18198,M,301.6,310.9,1510,40.9,152.0,0,0,0,0,0,0
9997,5771,L52950,L,301.6,311.2,1408,42.9,89.0,0,0,0,0,0,0
9998,3056,L50235,L,300.1,309.2,1427,37.6,73.0,0,0,0,0,0,0


In [3]:
# NOTE: tw_min divides 'Tool wear [min]' by 60 directly on df, so this cell is
# not idempotent - re-running it without reloading df divides the column again.
transfm.tw_min(df)

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,3416,L50595,L,301.4,310.4,1579,36.9,2.216667,0,0,0,0,0,0
1,7130,L54309,L,300.6,310.0,1635,31.0,1.783333,0,0,0,0,0,0
2,2320,M17179,M,299.2,308.8,1700,33.5,3.083333,0,0,0,0,0,0
3,9601,M24460,M,298.9,310.0,1561,45.3,0.966667,0,0,0,0,0,0
4,614,L47793,L,298.1,309.9,1634,30.0,0.883333,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9018,M23877,M,297.3,308.1,1369,52.0,3.566667,0,0,0,0,0,0
9996,3339,M18198,M,301.6,310.9,1510,40.9,2.533333,0,0,0,0,0,0
9997,5771,L52950,L,301.6,311.2,1408,42.9,1.483333,0,0,0,0,0,0
9998,3056,L50235,L,300.1,309.2,1427,37.6,1.216667,0,0,0,0,0,0


In [4]:
transfm.map_type(df)

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,3416,L50595,M,301.4,310.4,1579,36.9,2.216667,0,0,0,0,0,0
1,7130,L54309,L,300.6,310.0,1635,31.0,1.783333,0,0,0,0,0,0
2,2320,M17179,H,299.2,308.8,1700,33.5,3.083333,0,0,0,0,0,0
3,9601,M24460,L,298.9,310.0,1561,45.3,0.966667,0,0,0,0,0,0
4,614,L47793,L,298.1,309.9,1634,30.0,0.883333,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9018,M23877,H,297.3,308.1,1369,52.0,3.566667,0,0,0,0,0,0
9996,3339,M18198,M,301.6,310.9,1510,40.9,2.533333,0,0,0,0,0,0
9997,5771,L52950,L,301.6,311.2,1408,42.9,1.483333,0,0,0,0,0,0
9998,3056,L50235,L,300.1,309.2,1427,37.6,1.216667,0,0,0,0,0,0


In [5]:
transfm.map_pro_id(df)

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,3416,M50595,M,301.4,310.4,1579,36.9,2.216667,0,0,0,0,0,0
1,7130,L54309,L,300.6,310.0,1635,31.0,1.783333,0,0,0,0,0,0
2,2320,H17179,H,299.2,308.8,1700,33.5,3.083333,0,0,0,0,0,0
3,9601,L24460,L,298.9,310.0,1561,45.3,0.966667,0,0,0,0,0,0
4,614,L47793,L,298.1,309.9,1634,30.0,0.883333,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9018,H23877,H,297.3,308.1,1369,52.0,3.566667,0,0,0,0,0,0
9996,3339,M18198,M,301.6,310.9,1510,40.9,2.533333,0,0,0,0,0,0
9997,5771,L52950,L,301.6,311.2,1408,42.9,1.483333,0,0,0,0,0,0
9998,3056,L50235,L,300.1,309.2,1427,37.6,1.216667,0,0,0,0,0,0


In [6]:
# NOTE(review): rm_rpm_outl returns the filtered rows as a new dataframe and
# does not change df itself. The result is only displayed here, not assigned,
# so df still contains the outliers (the range table further down reports a
# max 'Rotational speed [rpm]' of 2886). Assign the result if the rows should
# really be removed.
transfm.rm_rpm_outl(df)

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,3416,M50595,M,301.4,310.4,1579,36.9,2.216667,0,0,0,0,0,0
1,7130,L54309,L,300.6,310.0,1635,31.0,1.783333,0,0,0,0,0,0
2,2320,H17179,H,299.2,308.8,1700,33.5,3.083333,0,0,0,0,0,0
3,9601,L24460,L,298.9,310.0,1561,45.3,0.966667,0,0,0,0,0,0
4,614,L47793,L,298.1,309.9,1634,30.0,0.883333,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9018,H23877,H,297.3,308.1,1369,52.0,3.566667,0,0,0,0,0,0
9996,3339,M18198,M,301.6,310.9,1510,40.9,2.533333,0,0,0,0,0,0
9997,5771,L52950,L,301.6,311.2,1408,42.9,1.483333,0,0,0,0,0,0
9998,3056,L50235,L,300.1,309.2,1427,37.6,1.216667,0,0,0,0,0,0


In [7]:
# NOTE(review): rm_tq_outl returns the filtered rows as a new dataframe and
# does not change df itself. The result is only displayed here, not assigned,
# so df still contains the outliers (the range table further down reports a
# max 'Torque [Nm]' of 76.6). Assign the result if the rows should really be
# removed.
transfm.rm_tq_outl(df)

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,3416,M50595,M,301.4,310.4,1579,36.9,2.216667,0,0,0,0,0,0
1,7130,L54309,L,300.6,310.0,1635,31.0,1.783333,0,0,0,0,0,0
2,2320,H17179,H,299.2,308.8,1700,33.5,3.083333,0,0,0,0,0,0
3,9601,L24460,L,298.9,310.0,1561,45.3,0.966667,0,0,0,0,0,0
4,614,L47793,L,298.1,309.9,1634,30.0,0.883333,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9018,H23877,H,297.3,308.1,1369,52.0,3.566667,0,0,0,0,0,0
9996,3339,M18198,M,301.6,310.9,1510,40.9,2.533333,0,0,0,0,0,0
9997,5771,L52950,L,301.6,311.2,1408,42.9,1.483333,0,0,0,0,0,0
9998,3056,L50235,L,300.1,309.2,1427,37.6,1.216667,0,0,0,0,0,0


In [8]:
transfm.save_df(df)

In [9]:
transfm.range_seldfcol(df)
# milestone4, task1: create a table which displays the operating ranges of the columns in the table below

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min]
min,295.3,305.7,1168,3.8,0.0
max,304.5,313.8,2886,76.6,4.216667


In [10]:
#Then breakdown the same data to understand the ranges for each of the different product quality types.
#This is for type 'H'
transfm.range_sel_df_col_H(df)

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min]
min,295.3,305.7,1181,5.6,3.0
max,304.4,313.7,2833,74.5,4.216667


In [11]:
#Then breakdown the same data to understand the ranges for each of the different product quality types.
#this is for type 'M'
transfm.range_sel_df_col_M(df)

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min]
min,295.4,305.9,1168,4.6,2.0
max,304.3,313.8,2861,75.4,2.983333


In [12]:
#Then breakdown the same data to understand the ranges for each of the different product quality types.
#this is for type 'L'
transfm.range_sel_df_col_L(df)

Unnamed: 0,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min]
min,295.5,305.9,1200,3.8,0.0
max,304.5,313.7,2886,76.6,1.983333


In [13]:
transfm.tran_col(df)
# I used df.copy() inside tran_col because I still need the original data of 'Rotational speed [rpm]' in milestone 4
#milestone3, task4

Unnamed: 0,UDI,Product ID,Type,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
0,3416,M50595,M,301.4,310.4,7.364547,36.9,2.216667,0,0,0,0,0,0
1,7130,L54309,L,300.6,310.0,7.399398,31.0,1.783333,0,0,0,0,0,0
2,2320,H17179,H,299.2,308.8,7.438384,33.5,3.083333,0,0,0,0,0,0
3,9601,L24460,L,298.9,310.0,7.353082,45.3,0.966667,0,0,0,0,0,0
4,614,L47793,L,298.1,309.9,7.398786,30.0,0.883333,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,9018,H23877,H,297.3,308.1,7.221836,52.0,3.566667,0,0,0,0,0,0
9996,3339,M18198,M,301.6,310.9,7.319865,40.9,2.533333,0,0,0,0,0,0
9997,5771,L52950,L,301.6,311.2,7.249926,42.9,1.483333,0,0,0,0,0,0
9998,3056,L50235,L,300.1,309.2,7.263330,37.6,1.216667,0,0,0,0,0,0


In [None]:
# milestone3, task 6: dropping overly correlated columns. I did not do this because I need the original data of 'Rotational speed [rpm]' in milestone 4