## Data Validation for 
- Product Category Lookup Table
- Product Subcategory Lookup Table
- Customer Lookup Table

In [26]:
import pandas as pd

#### Importing Data

In [16]:
# The customer file fails with a Unicode error when decoded as UTF-8, so it is read as Latin-1.
customer_lookup = pd.read_csv(
    'Data/AdventureWorks Customer Lookup.csv',
    encoding='latin-1',
)

# The two product lookup files are read with pandas' default encoding.
product_category_lookup = pd.read_csv(
    'Data/AdventureWorks Product Categories Lookup.csv'
)

product_subcategory_lookup = pd.read_csv(
    'Data/AdventureWorks Product Subcategories Lookup.csv'
)

#### Data Checking

In [17]:
def data_type_check(df):
    """Print the column dtypes of ``df`` and return its first rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Table to inspect.

    Returns
    -------
    pandas.DataFrame
        ``df.head()``; returned rather than printed so that, as the last
        expression of a cell, it is rendered as the cell's output.
    """
    # A blank line follows the "DataType" heading and the dtype listing.
    print("DataType")
    print()
    print(df.dtypes)
    print()
    print("Dataframe")
    return df.head()

# Check the data types one table at a time.
data_type_check(customer_lookup)
# The product tables were checked the same way; their data types are already correct:
# data_type_check(product_category_lookup)
# data_type_check(product_subcategory_lookup)

DataType

CustomerKey        object
Prefix             object
FirstName          object
LastName           object
BirthDate          object
MaritalStatus      object
Gender             object
EmailAddress       object
AnnualIncome      float64
TotalChildren     float64
EducationLevel     object
Occupation         object
HomeOwner          object
dtype: object

Dataframe


Unnamed: 0,CustomerKey,Prefix,FirstName,LastName,BirthDate,MaritalStatus,Gender,EmailAddress,AnnualIncome,TotalChildren,EducationLevel,Occupation,HomeOwner
0,11000,MR.,JON,YANG,1966-04-08,M,M,jon24@adventure-works.com,90000.0,2.0,Bachelors,Professional,Y
1,11001,MR.,EUGENE,HUANG,1965-05-14,S,M,eugene10@adventure-works.com,60000.0,3.0,Bachelors,Professional,N
2,11002,MR.,RUBEN,TORRES,1965-08-12,M,M,ruben35@adventure-works.com,60000.0,3.0,Bachelors,Professional,Y
3,11003,MS.,CHRISTY,ZHU,1968-02-15,S,F,christy12@adventure-works.com,70000.0,0.0,Bachelors,Professional,N
4,11004,MRS.,ELIZABETH,JOHNSON,1968-08-08,S,F,elizabeth5@adventure-works.com,80000.0,5.0,Bachelors,Professional,Y


The output above shows that every column already has the correct data type except `CustomerKey`, `BirthDate` and `TotalChildren` in the customer lookup table.

However, converting `CustomerKey` to `int` raises an error, so I decided to clean the data before changing the data types.

#### Cleaning Data

In [18]:
# Customer lookup: keep rows whose CustomerKey is present and consists of digits only.
customer_key = customer_lookup["CustomerKey"]
has_numeric_key = customer_key.notna() & customer_key.str.isdigit()
customer_lookup = customer_lookup.loc[has_numeric_key]

# Product category lookup: drop rows without a ProductCategoryKey.
category_key_present = product_category_lookup["ProductCategoryKey"].notna()
product_category_lookup = product_category_lookup.loc[category_key_present]

# Product subcategory lookup: drop rows without a ProductSubcategoryKey.
subcategory_key_present = product_subcategory_lookup["ProductSubcategoryKey"].notna()
product_subcategory_lookup = product_subcategory_lookup.loc[subcategory_key_present]

#### Changing DataTypes

In [19]:
# Target dtypes for the three columns flagged during the data check.
customer_dtypes = {
    'CustomerKey': 'int64',
    'BirthDate': 'datetime64[ns]',
    'TotalChildren': 'int32',
}
customer_lookup = customer_lookup.astype(customer_dtypes)

#### Adding Columns

In [20]:
capitalize_column = ['Prefix', 'FirstName', 'LastName']
# Convert the name parts from upper case (e.g. "JON") to capitalised form ("Jon").
customer_lookup[capitalize_column] = customer_lookup[capitalize_column].apply(lambda x: x.str.capitalize())

# Create a FullName column from Prefix, FirstName and LastName.
# Concatenating a string with a missing value yields a missing value, so a
# customer without a Prefix would otherwise end up with no FullName at all.
# Missing parts are treated as empty text, then the leftover separators
# (leading/trailing or doubled spaces) are cleaned up.
name_parts = customer_lookup[capitalize_column].fillna("")
customer_lookup["FullName"] = (
    (name_parts["Prefix"] + " " + name_parts["FirstName"] + " " + name_parts["LastName"])
    .str.replace(r"\s{2,}", " ", regex=True)
    .str.strip()
)

# Map an annual income to its income-level label
def income_level(x):
    """Return the income bracket label for the annual income ``x``.

    Brackets (lower bounds are inclusive):

    * ``x >= 150000`` -> ``"Very High"``
    * ``x >= 100000`` -> ``"High"``
    * ``x >= 50000``  -> ``"Average"``
    * anything lower  -> ``"Low"``

    Parameters
    ----------
    x : int or float or None
        Annual income of a single customer.

    Returns
    -------
    str or None
        The bracket label, or ``None`` when the income is missing.
    """
    # NaN fails every ``>=`` comparison, so without this guard a missing
    # income would fall through to the final branch and be reported as "Low".
    if pd.isna(x):
        return None
    if x >= 150000:
        return "Very High"
    if x >= 100000:
        return "High"
    if x >= 50000:
        return "Average"
    return "Low"
# Label every customer by passing each AnnualIncome value through income_level
customer_lookup["IncomeLevel"] = customer_lookup["AnnualIncome"].apply(income_level)

#### Calculated Column Validation

In [32]:
# Number of rows for each IncomeLevel value
customer_lookup.value_counts("IncomeLevel")

IncomeLevel
Average      8201
Low          7804
High         1843
Very High     300
Name: count, dtype: int64

### Validation

#### Customer_lookup

- Rows_Count

In [34]:
# Row count of the cleaned customer table
customer_lookup.shape[0]

18148

- Average AnnualIncome

In [23]:
# Mean of AnnualIncome across the customer table
customer_lookup["AnnualIncome"].mean()

57269.120564249504

#### Product_Category_Lookup

- Rows_Count

In [36]:
# Number of non-null ProductCategoryKey values (rows with a null key were dropped during cleaning)
product_category_lookup["ProductCategoryKey"].count()

4

#### Product_Subcategory_Lookup

- Rows_Count

In [25]:
# Number of non-null ProductSubcategoryKey values (rows with a null key were dropped during cleaning)
product_subcategory_lookup["ProductSubcategoryKey"].count()

37