<a href="https://colab.research.google.com/github/EAsencios/DEEP-LEARING/blob/master/data_wrangling.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Data Wrangling**

In [1]:
import random
import numpy as np
import pandas as pd

import datetime
from random import randrange

In [2]:
from time import strftime
startDate = datetime.datetime(2020, 1, 1,8)


def _random_date(start,date_count):
    """This function generates a random date based on params
    Args:
        start (date object): the base date
        date_count (int): number of dates to be generated
    Returns:
        list of random dates

    """
    current = start
    while date_count > 0:
        curr = current + datetime.timedelta(days=randrange(42))
        yield curr
        date_count-=1


def generate_sample_data(row_count=100):
    """This function generates a random transaction dataset
    Args:
        row_count (int): number of rows for the dataframe
    Returns:
        a pandas dataframe

    """

    # sentinels
    startDate = datetime.datetime(2020, 1, 1, 8)
    serial_number_sentinel = 1000
    user_id_sentinel = 5001
    product_id_sentinel = 101
    price_sentinel = 2000

    data_dict = {
    'Serial No': np.arange(row_count)+serial_number_sentinel,
    'Date': np.random.permutation(pd.to_datetime([x.strftime('%d-%m-%Y') for x in _random_date(startDate,row_count)]).date),
    'User ID': np.random.permutation(np.random.randint(0, row_count, size=int(row_count/10)) + user_id_sentinel).tolist()*10,
    'Product ID': np.random.permutation(np.random.randint(0, row_count, size=int(row_count/10))+ product_id_sentinel).tolist()*10 ,
    'Quantity Purchased': np.random.permutation(np.random.randint(1, 42, size=row_count)),
    'Price': np.round(np.abs(np.random.randn(row_count)+1)*price_sentinel, decimals=2),
    'User Type':np.random.permutation([chr(random.randrange(97, 97 + 3 + 1)) for i in range(row_count)])
    }

    
    # introduce missing values
    for index in range(int(np.sqrt(row_count))): 
        data_dict['Price'][np.argmax(data_dict['Price'] == random.choice(data_dict['Price']))] = np.nan
        data_dict['User Type'][np.argmax(data_dict['User Type'] == random.choice(data_dict['User Type']))] = np.nan
        data_dict['Date'][np.argmax(data_dict['Date'] == random.choice(data_dict['Date']))] = np.nan
        data_dict['Product ID'][np.argmax(data_dict['Product ID'] == random.choice(data_dict['Product ID']))] = 0
        data_dict['Serial No'][np.argmax(data_dict['Serial No'] == random.choice(data_dict['Serial No']))] = -1
        data_dict['User ID'][np.argmax(data_dict['User ID'] == random.choice(data_dict['User ID']))] = -101
        

    df = pd.DataFrame(data_dict)

    return(df)


def describe_dataframe(df=pd.DataFrame()):
    """This function generates descriptive stats of a dataframe
    Args:
        df (dataframe): the dataframe to be analyzed
    Returns:
        None

    """
    print("\n\n")
    print("*"*30)
    print("About the Data")
    print("*"*30)

    print("Number of rows::",df.shape[0])
    print("Number of columns::",df.shape[1])
    print("\n")
    
    print("Column Names::",df.columns.values.tolist())
    print("\n")
    
    print("Column Data Types::\n",df.dtypes)
    print("\n")
        
    print("Columns with Missing Values::",df.columns[df.isnull().any()].tolist())
    print("\n")
       
    print("Number of rows with Missing Values::",len(pd.isnull(df).any(1).to_numpy().nonzero()[0].tolist()))
    print("\n")

    
    print("Sample Indices with missing data::",pd.isnull(df).any(1).to_numpy().nonzero()[0].tolist()[0:5])
    print("\n")
    
    print("General Stats::")
    print(df.info())
    print("\n")

    
    print("Summary Stats::")
    print(df.describe())
    print("\n")
    
    print("Dataframe Sample Rows::")
    display(df.head(5))
    
    return


def cleanup_column_names(df,rename_dict={},do_inplace=True):
    """This function renames columns of a pandas dataframe
       It converts column names to snake case if rename_dict is not passed. 
    Args:
        rename_dict (dict): keys represent old column names and values point to 
                            newer ones
        do_inplace (bool): flag to update existing dataframe or return a new one
    Returns:
        pandas dataframe if do_inplace is set to False, None otherwise

    """
    if not rename_dict:
        return df.rename(columns={col: col.lower().replace(' ','_') 
                    for col in df.columns.values.tolist()}, 
                  inplace=do_inplace)
    else:
        return df.rename(columns=rename_dict,inplace=do_inplace)

def expand_user_type(u_type):
    """This function maps user types to user classes
    Args:
        u_type (str): user type value
    Returns:
        (str) user_class value

    """
    if u_type in ['a','b']:
        return 'new'
    elif u_type == 'c':
        return 'existing'
    elif u_type == 'd':
        return 'loyal_existing'
    else:
        return 'error'


In [3]:
df = generate_sample_data(1000)

In [4]:
describe_dataframe(df)




******************************
About the Data
******************************
Number of rows:: 1000
Number of columns:: 7


Column Names:: ['Serial No', 'Date', 'User ID', 'Product ID', 'Quantity Purchased', 'Price', 'User Type']


Column Data Types::
 Serial No               int64
Date                   object
User ID                 int64
Product ID              int64
Quantity Purchased      int64
Price                 float64
User Type              object
dtype: object


Columns with Missing Values:: ['Date', 'Price']


Number of rows with Missing Values:: 61


Sample Indices with missing data:: [0, 1, 2, 3, 4]


General Stats::
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Serial No           1000 non-null   int64  
 1   Date                969 non-null    object 
 2   User ID             1000 non-null   int64  
 3   Pro

Unnamed: 0,Serial No,Date,User ID,Product ID,Quantity Purchased,Price,User Type
0,1000,,-101,0,39,,n
1,1001,,5501,155,1,989.0,n
2,1002,,5622,536,2,2156.11,n
3,1003,,5519,220,37,346.87,n
4,1004,,5481,220,7,3022.2,n


In [5]:
print('Dataframe columns:\n{}'.format(df.columns.tolist()))

Dataframe columns:
['Serial No', 'Date', 'User ID', 'Product ID', 'Quantity Purchased', 'Price', 'User Type']


In [6]:
cleanup_column_names(df)

In [7]:
print('Dataframe columns:\n{}'.format(df.columns.tolist()))

Dataframe columns:
['serial_no', 'date', 'user_id', 'product_id', 'quantity_purchased', 'price', 'user_type']


In [17]:
print("Using Column Name::")
df['quantity_purchased'].values[:10]

Using Column Name::


array([39,  1,  2, 37,  7, 34, 25,  1, 40, 37])

In [18]:
print("Using Column Name::" )
df.quantity_purchased.values[:10]

Using Column Name::


array([39,  1,  2, 37,  7, 34, 25,  1, 40, 37])

In [19]:
print("Using Column Data Type::" )
df.select_dtypes(include=['float64']).values[:10]

Using Column Data Type::


array([[    nan],
       [ 989.  ],
       [2156.11],
       [ 346.87],
       [3022.2 ],
       [ 862.85],
       [7493.35],
       [3448.62],
       [1128.54],
       [ 845.86]])

In [20]:
print('Select Specific row indices::')
df.iloc[[20, 30, 50, 60]]

Select Specific row indices::


Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type
20,1020,,5211,239,1,1838.79,n
30,1030,2020-06-02,5181,754,31,1122.6,d
50,1050,2020-11-01,5952,332,13,4347.31,d
60,1060,2020-01-14,5236,597,40,1917.6,b


In [21]:
print('Excluding Specific Row indices::')
df.drop([1, 14, 21], axis=0).head(20)

Excluding Specific Row indices::


Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type
0,1000,,-101,0,39,,n
2,1002,,5622,536,2,2156.11,n
3,1003,,5519,220,37,346.87,n
4,1004,,5481,220,7,3022.2,n
5,1005,,5539,842,34,862.85,n
6,1006,,5457,199,25,7493.35,n
7,1007,,5066,485,1,3448.62,n
8,1008,,5320,642,40,1128.54,n
9,1009,2020-01-27,5519,183,37,845.86,n
10,1010,,5632,868,17,2937.21,n


In [23]:
print('Subsetting based on logical condition(s)::')
df[df.quantity_purchased>25].head()

Subsetting based on logical condition(s)::


Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type
0,1000,,-101,0,39,,n
3,1003,,5519,220,37,346.87,n
5,1005,,5539,842,34,862.85,n
8,1008,,5320,642,40,1128.54,n
9,1009,2020-01-27,5519,183,37,845.86,n


In [24]:
print('Subsetting based on offset from top (bottom)::')
df[100:].head()  # df.tail(-100)

Subsetting based on offset from top (bottom)::


Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type
100,1100,2020-10-01,5543,594,29,111.0,a
101,1101,2020-06-01,5501,155,5,1003.2,b
102,1102,2020-04-01,5622,536,19,2360.9,b
103,1103,2020-08-02,5519,220,39,1581.38,d
104,1104,2020-05-02,5481,220,25,1976.58,d


In [25]:
df.tail(-100)

Unnamed: 0,serial_no,date,user_id,product_id,quantity_purchased,price,user_type
100,1100,2020-10-01,5543,594,29,111.00,a
101,1101,2020-06-01,5501,155,5,1003.20,b
102,1102,2020-04-01,5622,536,19,2360.90,b
103,1103,2020-08-02,5519,220,39,1581.38,d
104,1104,2020-05-02,5481,220,25,1976.58,d
...,...,...,...,...,...,...,...
995,1995,2020-02-02,5417,488,33,4460.92,b
996,1996,2020-11-01,5121,705,39,352.02,d
997,1997,2020-10-01,5567,317,10,,b
998,1998,2020-09-01,5046,343,10,4399.86,d
