#NAME - ASWIN RAJ
#EMAIL - aswinraj302002@gmail.com


# KPMG TASK 1

**IMPORT THE DATASET**

In [7]:
import pandas as pd
import chardet
with open('CustomerDemographic.csv', 'rb') as file:
    result = chardet.detect(file.read())
encoding = result['encoding']
df = pd.read_csv('CustomerDemographic.csv', encoding=encoding)

**PRINT THE DATASET**

In [11]:
df = df.iloc[:, :13]  
print(df.head())


   customer_id first_name last_name  gender  \
0           34   Jephthah  Bachmann       U   
1           66     Anselm     Gawne    Male   
2         1888      Sibyl   Scholtz  Female   
3         3435    Stevena   Allcock  Female   
4         2858  Benedicto     Radki    Male   

   past_3_years_bike_related_purchases         DOB           job_title  \
0                                   59  1843-12-21     Legal Assistant   
1                                   46  2002-03-11   Account Executive   
2                                   67  2002-01-26        Food Chemist   
3                                   80  2002-01-15       Senior Editor   
4                                    4  2002-01-09  Recruiting Manager   

  job_industry_category     wealth_segment deceased_indicator  \
0                    IT  Affluent Customer                  N   
1           Argiculture     High Net Worth                  N   
2                Health      Mass Customer                  N   
3           

**1 ACCURACY**

In [12]:
accuracy = df['customer_id'].count() == len(df)
print("Accuracy:", accuracy)

Accuracy: True


**2 COMPLETENESS**

In [13]:
completeness = df.notnull().mean()
print("Completeness:")
print(completeness)

Completeness:
customer_id                            1.00000
first_name                             1.00000
last_name                              0.96875
gender                                 1.00000
past_3_years_bike_related_purchases    1.00000
DOB                                    0.97825
job_title                              0.87350
job_industry_category                  0.83600
wealth_segment                         1.00000
deceased_indicator                     1.00000
default                                0.92450
owns_car                               1.00000
tenure                                 0.97825
dtype: float64


**3 CONSISTENCY**

In [14]:
consistency = (df['gender'].isin(['Male', 'Female'])).all() \
              and (df['deceased_indicator'].isin(['Y', 'N'])).all() \
              and (df['owns_car'].isin(['Yes', 'No'])).all()
print("Consistency:", consistency)

Consistency: False


**4 CURRENCY**

In [15]:
currency = (df['DOB'] <= '2021-09-01').all()
print("Currency:", currency)

Currency: False


**5 RELEVANCY**

In [19]:
df = df.iloc[:, :13]
required_columns = ['gender', 'job_industry_category', 'wealth_segment', 'deceased_indicator', 'owns_car', 'customer_id']
relevancy = {col: list(df[col].unique()) for col in required_columns}
relevancy_check = df[required_columns].apply(lambda x: x.isin(relevancy[x.name])).all()
print("Relevancy:")
print(relevancy_check)



Relevancy:
gender                   True
job_industry_category    True
wealth_segment           True
deceased_indicator       True
owns_car                 True
customer_id              True
dtype: bool


**6 VALIDITY**

In [24]:
df = df.iloc[:, :13]
validity = {
    'gender': ['Male', 'Female'],
    'past_3_years_bike_related_purchases': list(df['past_3_years_bike_related_purchases'].unique()),
    'job_title': list(df['job_title'].unique()),
    'job_industry_category': ['IT', 'Argiculture', 'Health', 'Manufacturing', 'Retail', 'Telecommunications', 'Financial Services'],
    'wealth_segment': ['Affluent Customer', 'Mass Customer', 'High Net Worth'],
    'deceased_indicator': ['Y', 'N'],
    'default': list(df['default'].unique()),
    'owns_car': ['Yes', 'No'],
    'tenure': list(df['tenure'].unique()),
}
validity_check = pd.Series(dtype=bool)
for column in df.columns:
    if column in validity:
        validity_check[column] = df[column].isin(validity[column]).all()

print("Validity:")
print(validity_check)

Validity:
gender                                 False
past_3_years_bike_related_purchases     True
job_title                               True
job_industry_category                  False
wealth_segment                          True
deceased_indicator                      True
default                                 True
owns_car                                True
tenure                                 False
dtype: bool


**7 UNIQUENESS**

In [25]:
uniqueness = df.duplicated().any()
print("Uniqueness:", not uniqueness)

Uniqueness: True
