<a href="https://colab.research.google.com/github/EAsencios/DEEP-LEARING/blob/master/data_wrangling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Data Wrangling**

In [37]:
import random
import numpy as np
import pandas as pd

import datetime
from random import randrange

In [88]:
from time import strftime
startDate = datetime.datetime(2020, 1, 1,8)


def _random_date(start,date_count):
    """This function generates a random date based on params
    Args:
        start (date object): the base date
        date_count (int): number of dates to be generated
    Returns:
        list of random dates

    """
    current = start
    while date_count > 0:
        curr = current + datetime.timedelta(days=randrange(42))
        yield curr
        date_count-=1


def generate_sample_data(row_count=100):
    """This function generates a random transaction dataset
    Args:
        row_count (int): number of rows for the dataframe
    Returns:
        a pandas dataframe

    """

    # sentinels
    startDate = datetime.datetime(2020, 1, 1, 8)
    serial_number_sentinel = 1000
    user_id_sentinel = 5001
    product_id_sentinel = 101
    price_sentinel = 2000

    data_dict = {
    'Serial No': np.arange(row_count)+serial_number_sentinel,
    'Date': np.random.permutation(pd.to_datetime([x.strftime('%d-%m-%Y') for x in _random_date(startDate,row_count)]).date),
    'User ID': np.random.permutation(np.random.randint(0, row_count, size=int(row_count/10)) + user_id_sentinel).tolist()*10,
    'Product ID': np.random.permutation(np.random.randint(0, row_count, size=int(row_count/10))+ product_id_sentinel).tolist()*10 ,
    'Quantity Purchased': np.random.permutation(np.random.randint(1, 42, size=row_count)),
    'Price': np.round(np.abs(np.random.randn(row_count)+1)*price_sentinel, decimals=2),
    'User Type':np.random.permutation([chr(random.randrange(97, 97 + 3 + 1)) for i in range(row_count)])
    }

    
    # introduce missing values
    for index in range(int(np.sqrt(row_count))): 
        data_dict['Price'][np.argmax(data_dict['Price'] == random.choice(data_dict['Price']))] = np.nan
        data_dict['User Type'][np.argmax(data_dict['User Type'] == random.choice(data_dict['User Type']))] = np.nan
        data_dict['Date'][np.argmax(data_dict['Date'] == random.choice(data_dict['Date']))] = np.nan
        data_dict['Product ID'][np.argmax(data_dict['Product ID'] == random.choice(data_dict['Product ID']))] = 0
        data_dict['Serial No'][np.argmax(data_dict['Serial No'] == random.choice(data_dict['Serial No']))] = -1
        data_dict['User ID'][np.argmax(data_dict['User ID'] == random.choice(data_dict['User ID']))] = -101
        

    df = pd.DataFrame(data_dict)

    return(df)


def describe_dataframe(df=pd.DataFrame()):
    """This function generates descriptive stats of a dataframe
    Args:
        df (dataframe): the dataframe to be analyzed
    Returns:
        None

    """
    print("\n\n")
    print("*"*30)
    print("About the Data")
    print("*"*30)

    print("Number of rows::",df.shape[0])
    print("Number of columns::",df.shape[1])
    print("\n")
    
    print("Column Names::",df.columns.values.tolist())
    print("\n")
    
    print("Column Data Types::\n",df.dtypes)
    print("\n")
        
    print("Columns with Missing Values::",df.columns[df.isnull().any()].tolist())
    print("\n")
       
    print("Number of rows with Missing Values::",len(pd.isnull(df).any(1).to_numpy().nonzero()[0].tolist()))
    print("\n")

    
    print("Sample Indices with missing data::",pd.isnull(df).any(1).to_numpy().nonzero()[0].tolist()[0:5])
    print("\n")
    
    print("General Stats::")
    print(df.info())
    print("\n")

    
    print("Summary Stats::")
    print(df.describe())
    print("\n")
    
    print("Dataframe Sample Rows::")
    display(df.head(5))
    
    return


def cleanup_column_names(df,rename_dict={},do_inplace=True):
    """This function renames columns of a pandas dataframe
       It converts column names to snake case if rename_dict is not passed. 
    Args:
        rename_dict (dict): keys represent old column names and values point to 
                            newer ones
        do_inplace (bool): flag to update existing dataframe or return a new one
    Returns:
        pandas dataframe if do_inplace is set to False, None otherwise

    """
    if not rename_dict:
        return df.rename(columns={col: col.lower().replace(' ','_') 
                    for col in df.columns.values.tolist()}, 
                  inplace=do_inplace)
    else:
        return df.rename(columns=rename_dict,inplace=do_inplace)

def expand_user_type(u_type):
    """This function maps user types to user classes
    Args:
        u_type (str): user type value
    Returns:
        (str) user_class value

    """
    if u_type in ['a','b']:
        return 'new'
    elif u_type == 'c':
        return 'existing'
    elif u_type == 'd':
        return 'loyal_existing'
    else:
        return 'error'


In [89]:
df = generate_sample_data(1000)

In [90]:
describe_dataframe(df)




******************************
About the Data
******************************
Number of rows:: 1000
Number of columns:: 7


Column Names:: ['Serial No', 'Date', 'User ID', 'Product ID', 'Quantity Purchased', 'Price', 'User Type']


Column Data Types::
 Serial No               int64
Date                   object
User ID                 int64
Product ID              int64
Quantity Purchased      int64
Price                 float64
User Type              object
dtype: object


Columns with Missing Values:: ['Date', 'Price']


Number of rows with Missing Values:: 61


Sample Indices with missing data:: [0, 3, 4, 8, 9]


General Stats::
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Serial No           1000 non-null   int64  
 1   Date                969 non-null    object 
 2   User ID             1000 non-null   int64  
 3   Pro

Unnamed: 0,Serial No,Date,User ID,Product ID,Quantity Purchased,Price,User Type
0,1000,,-101,0,38,27.09,n
1,1001,2020-01-16,5298,722,4,1251.88,n
2,1002,2020-07-01,5746,116,39,7119.68,n
3,1003,,5961,130,3,1590.26,n
4,1004,,5963,525,23,1863.21,n


In [91]:
print('Dataframe columns:\n{}'.format(df.columns.tolist()))

Dataframe columns:
['Serial No', 'Date', 'User ID', 'Product ID', 'Quantity Purchased', 'Price', 'User Type']


In [92]:
cleanup_column_names(df)

In [93]:
print('Dataframe columns:\n{}'.format(df.columns.tolist()))

Dataframe columns:
['serial_no', 'date', 'user_id', 'product_id', 'quantity_purchased', 'price', 'user_type']


In [109]:
print("Using Column Name::")
print(df['quantity_purchased'].values[:10])

Using Column Name::
[38  4 39  3 23 37  1 36 27 24]


In [108]:
print("Using Column Name::" )
print(df.quantity_purchased.values[:10])

Using Column Name::
[38  4 39  3 23 37  1 36 27 24]


In [113]:
print("Using Column Data Type::" )
print(df.select_dtypes(include=['float64']).values[:10])

Using Column Data Type::
[[  27.09]
 [1251.88]
 [7119.68]
 [1590.26]
 [1863.21]
 [2510.93]
 [2789.51]
 [2230.63]
 [1092.86]
 [5024.82]]
