# Purpose of the Notebook

The purpose of this notebook is to provide functions for inspecting the data in a data frame, including the column types, example values in each column,
the number of missing values, the number of categories and their ratio to the number of samples, and, for string/categorical columns, the number of distinct values.

The functions exported for other notebooks to use will be:
> * info_about_columns
> * show_examples_of_data



In [4]:
import pandas as pd

from itertools import repeat

In [5]:
# Reusable functions

In [6]:
def info_about_columns(dataframe, data_science_descriptions=None):
    '''
        Build a per-column summary of a dataframe.

        The result has one row per column of ``dataframe`` and the
        following columns:
            'DataType'                 the column dtype
            '# of Categories'          number of distinct non-null values
            'categories/sample ratio'  distinct values as a percentage of
                                       the row count, rounded to a whole
                                       number
            'missing values'           number of null entries
            'missing values %'         null entries as a percentage of the
                                       row count (not rounded)

        input:
          dataframe                  The data frame to summarise
          data_science_descriptions  Currently unused; accepted (and now
                                     optional) so existing callers keep
                                     working

        returns:
          A new dataframe indexed by the column names of ``dataframe``
    '''
    row_count = len(dataframe)

    # Compute each per-column aggregate once and reuse it for both the raw
    # count and the percentage derived from it.
    distinct_counts = dataframe.nunique()
    missing_counts = dataframe.isnull().sum()

    dataframe_info_about_columns = pd.concat(
        [
            dataframe.dtypes,
            distinct_counts,
            (distinct_counts * 100 / row_count).round(),
            missing_counts,
            missing_counts * 100 / row_count,
        ],
        axis=1,
    )

    dataframe_info_about_columns.columns = [
        'DataType',
        '# of Categories',
        'categories/sample ratio',
        'missing values',
        'missing values %',
    ]

    return dataframe_info_about_columns

In [7]:
# Read the data and look at the data types / missing values / variable types, which are explained more accurately

In [8]:
# Lift the column display limit so wide frames are rendered in full.
pd.options.display.max_columns = None
def show_examples_of_data(dataframe, data_information, category_cutoff):
    '''
       purpose: To show example of the data in each column

       input:
          dataframe          The data frame that contains the dataset
          data_information   One row per column of ``dataframe`` (indexed by
                             column name) with at least the columns
                             'DataType' and '# of Categories'
          category_cutoff    Integer/float columns with fewer distinct values
                             than this are listed value by value; the others
                             are summarised as a "min to max" range

       returns:
          A data frame with the columns "Field", "Value" and
          "Number of Values", one row per row of ``data_information``:
            * integer/float column at or above the cutoff:
              Value is "<min> to <max>", Number of Values is "NA"
            * integer/float column below the cutoff, or an object, string
              or category column: Value is the array of distinct values,
              Number of Values is its length
            * any other dtype: Value is "Unknown", Number of Values is "NA"
    '''
    is_integer = pd.api.types.is_integer_dtype
    is_float = pd.api.types.is_float_dtype

    rows = []
    for column_name, column_info in data_information.iterrows():
        data_type = column_info['DataType']
        below_cutoff = column_info['# of Categories'] < category_cutoff

        # Match on dtype families rather than exact names so that int32,
        # float32, nullable integers and every string backend are handled.
        is_numeric = is_integer(data_type) or is_float(data_type)
        is_text = (
            isinstance(data_type, pd.StringDtype)
            or data_type == "object"
            or data_type == "string[python]"
        )
        is_category = (
            isinstance(data_type, pd.CategoricalDtype)
            or data_type == "category"
        )

        if is_numeric and not below_cutoff:
            column = dataframe[column_name]
            value = str(column.min()) + " to " + str(column.max())
            number_of_distinct_values = "NA"
        elif is_numeric or is_text or is_category:
            value = dataframe[column_name].unique()
            number_of_distinct_values = len(value)
        else:
            value = "Unknown"
            number_of_distinct_values = "NA"

        rows.append([column_name, value, number_of_distinct_values])

    # Build the frame once; dtype=object keeps arrays, strings and counts
    # side by side without any type coercion.
    return pd.DataFrame(
        rows,
        columns=["Field", "Value", "Number of Values"],
        dtype=object,
    )
