In [1]:
import pandas as pd
import numpy as np
from datatransform import df
from datatransform import DataTransform
 


# Build the project's transformer and run to_categorical on the "Type" column
# before any of the inspection cells below.
transformer = DataTransform()    
# The return value is discarded — presumably this converts df["Type"] in place
# (the col_dtypes output further down reports Type as category); confirm
# against the datatransform module.
transformer.to_categorical(df, col="Type")   


class DataFrameInfo:
    '''
    Collection of static helpers that summarise a pandas DataFrame.
    (milestone 3 task 2)

    methods:
    -------
    col_dtypes(df)
        dtype of every column.
    df_info(df)
        descriptive statistics from DataFrame.describe().
    dnst_cnt(df, col)
        number of distinct values in the selected column.
    amount_null(df)
        count of null values per column.
    count_null(df)
        percentage of null values per column.
    skew_df(df)
        skewness of the numeric columns.
    '''

    @staticmethod
    def col_dtypes(df):
        '''
        Report the dtype of each column.

        parameters:
        ----------
        df:dataframe

        returns:
        -------
        the ``dtypes`` attribute of ``df``
        '''
        column_types = df.dtypes
        return column_types

    @staticmethod
    def df_info(df):
        '''
        Produce the descriptive statistics given by ``df.describe()``
        (count, mean, std, min, quartiles and max in the default case).

        parameters:
        ---------
        df:dataframe

        returns:
        -------
        the result of ``df.describe()``
        '''
        return df.describe()

    @staticmethod
    def dnst_cnt(df, col):
        '''
        Count the distinct values found in ``df[col]``.

        parameters:
        ---------
        df:dataframe
        col:dataframe column

        returns:
        -------
        the result of ``nunique()`` on the selected column
        '''
        selected = df[col]
        return selected.nunique()

    @staticmethod
    def amount_null(df):
        '''
        Count the null entries in every column.

        parameters:
        ---------
        df:dataframe

        returns:
        -------
        per-column total of null values
        '''
        null_mask = df.isna()
        return null_mask.sum()

    @staticmethod
    def count_null(df):
        '''
        Express the null entries of every column as a percentage of the
        number of rows in the dataframe.

        parameters:
        ---------
        df:dataframe

        returns:
        -------
        per-column null count multiplied by 100 and divided by ``len(df)``
        '''
        null_totals = df.isna().sum()
        row_total = len(df)
        return null_totals * 100 / row_total

    @staticmethod
    def skew_df(df):
        '''
        Compute the skewness of the numeric columns only.

        parameters:
        ---------
        df:dataframe

        returns:
        -------
        the result of ``df.skew(numeric_only=True)``
        '''
        skewness = df.skew(numeric_only=True)
        return skewness
# Create the helper instance only when this code runs as the main module.
if __name__ == "__main__":
    # Every DataFrameInfo method is a staticmethod, so this instance is just a
    # convenient handle for the calls in the cells below.
    info = DataFrameInfo()    

In [2]:
# Data type of each column.
info.col_dtypes(df)

UDI                           int64
Product ID                   object
Type                       category
Air temperature [K]         float64
Process temperature [K]     float64
Rotational speed [rpm]        int64
Torque [Nm]                 float64
Tool wear [min]             float64
Machine failure               int64
TWF                           int64
HDF                           int64
PWF                           int64
OSF                           int64
RNF                           int64
dtype: object

In [3]:
# Number of distinct values in the "Type" column.
info.dnst_cnt(df, col="Type")

3

In [4]:
# Summary statistics as produced by DataFrame.describe().
info.df_info(df)

Unnamed: 0,UDI,Air temperature [K],Process temperature [K],Rotational speed [rpm],Torque [Nm],Tool wear [min],Machine failure,TWF,HDF,PWF,OSF,RNF
count,10000.0,9162.0,9119.0,10000.0,10000.0,9516.0,10000.0,10000.0,10000.0,10000.0,10000.0,10000.0
mean,5000.5,300.00859,310.01158,1538.7761,39.98691,108.219525,0.0339,0.0046,0.0115,0.0095,0.0098,0.0019
std,2886.89568,1.999148,1.480138,179.284096,9.968934,63.612534,0.180981,0.067671,0.106625,0.097009,0.098514,0.04355
min,1.0,295.3,305.7,1168.0,3.8,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2500.75,298.3,308.8,1423.0,33.2,53.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,5000.5,300.1,310.1,1503.0,40.1,108.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,7500.25,301.5,311.1,1612.0,46.8,163.0,0.0,0.0,0.0,0.0,0.0,0.0
max,10000.0,304.5,313.8,2886.0,76.6,253.0,1.0,1.0,1.0,1.0,1.0,1.0


In [5]:
# Milestone 3, task 2, step 2: count of null values in each column.
info.amount_null(df)

UDI                          0
Product ID                   0
Type                         0
Air temperature [K]        838
Process temperature [K]    881
Rotational speed [rpm]       0
Torque [Nm]                  0
Tool wear [min]            484
Machine failure              0
TWF                          0
HDF                          0
PWF                          0
OSF                          0
RNF                          0
dtype: int64

In [6]:
# Percentage of null values per column. Left as the cell's last expression so
# the notebook displays it the same way as the other cells, not via print().
info.count_null(df)

UDI                        0.00
Product ID                 0.00
Type                       0.00
Air temperature [K]        8.38
Process temperature [K]    8.81
Rotational speed [rpm]     0.00
Torque [Nm]                0.00
Tool wear [min]            4.84
Machine failure            0.00
TWF                        0.00
HDF                        0.00
PWF                        0.00
OSF                        0.00
RNF                        0.00
dtype: float64


In [7]:
# Skewness of the numeric columns.
info.skew_df(df)

UDI                         0.000000
Air temperature [K]         0.118541
Process temperature [K]     0.006529
Rotational speed [rpm]      1.993171
Torque [Nm]                -0.009517
Tool wear [min]             0.021183
Machine failure             5.151852
TWF                        14.644462
HDF                         9.164789
PWF                        10.114516
OSF                         9.953916
RNF                        22.879570
dtype: float64