## Data Consistency Checks

In [1]:
import pandas as pd
import numpy as np
import os 

In [2]:
path = r'C:\Users\efens\cf_tasks\2023-07 Instacard Basket Analysis'

In [3]:
df_ords = pd.read_csv(os.path.join(path, '02 Data', '022 Prepared Data', 'orders_wrangled.csv'), index_col = False)

In [4]:
df_prods_check = pd.read_csv(os.path.join(path, '02 Data', '022 Prepared Data', 'products_checked.csv'), index_col = False)

## 01.Consistence check of df_prods

In [5]:
df_prods_check.describe()

Unnamed: 0,product_id,aisle_id,department_id,prices
count,49672.0,49672.0,49672.0,49672.0
mean,24850.349775,67.762442,11.728942,9.993282
std,14340.705287,38.315784,5.850779,453.615536
min,1.0,1.0,1.0,1.0
25%,12432.75,35.0,7.0,4.1
50%,24850.5,69.0,13.0,7.1
75%,37268.25,100.0,17.0,11.1
max,49688.0,134.0,21.0,99999.0


In [6]:
df_prods_check['prices'].median()

7.1

--> The maximal price value (99999.00) seems to be too large or incorrectly recorded. Client's verification needed

## 02. Investigate a df and share in a markdown cell whether anything about the data looks off or should be investigated further

In [7]:
pd.options.display.float_format = '{:.2f}'.format
df_ords.describe()

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710542.0,102978.21,17.15,2.78,13.45,11.11
std,987581.74,59533.72,17.73,2.05,4.23,9.21
min,1.0,1.0,1.0,0.0,0.0,0.0
25%,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.5,154385.0,23.0,5.0,16.0,15.0
max,3421083.0,206209.0,100.0,6.0,23.0,30.0


Comments:
1. Order ID > according to the table, there should be 3421083 orders
2. User ID > In total 206209 users
3. Order Number > In total 100 orders made
4. Orders_day_of_week > from 0 (Monday) to 6 (Sunday) --> 7 days a week
5. Hours a day > from 0 to 23 --> 24 houts a day
6. Days_since_prior_order > from 0 to 30

In [8]:
df_ords.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


In [9]:
# The column Days_since_prior_order contains missing (NaN) values
# Checking the column in separate

# Check percentile "days_since_prior_order"
# It can't be negative

var_days_since = df_ords['days_since_prior_order']

print("50th percentile of days_since_prior_order : ",
       np.percentile(var_days_since, 50))
print("25th percentile of days_since_prior_order : ",
       np.percentile(var_days_since, 25))
print("75th percentile of days_since_prior_order : ",
       np.percentile(var_days_since, 75))

50th percentile of days_since_prior_order :  nan
25th percentile of days_since_prior_order :  nan
75th percentile of days_since_prior_order :  nan


In [10]:
# Check for missing values in the 'days_since_prior_order' column
missing_values = df_ords['days_since_prior_order'].isnull().sum()
missing_values

206209

In [11]:
## Issue: Missing values (206209) in the variable 'days_since_prior_order'

## 03. Checking for mixed-type data

In [12]:
for col in df_ords.columns:
  mix_test = (df_ords[[col]].applymap(type) != df_ords[[col]].iloc[0].apply(type)).any(axis = 1)
  if len (df_ords[mix_test]) > 0:
    print (col)

--> No output for this code means that there is no mixed-type data

In [30]:
# Checking data types
df_ords.dtypes

order_id                    int64
user_id                     int64
eval_set                   object
order_number                int64
orders_day_of_week          int64
order_hour_of_day           int64
days_since_prior_order    float64
dtype: object

## 04. Running a check for missing values

In [14]:
df_ords.isnull().sum()

order_id                       0
user_id                        0
eval_set                       0
order_number                   0
orders_day_of_week             0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

In [15]:
df_ords['days_since_prior_order'].describe()

count   3214874.00
mean         11.11
std           9.21
min           0.00
25%           4.00
50%           7.00
75%          15.00
max          30.00
Name: days_since_prior_order, dtype: float64

In [16]:
df_ords.shape

(3421083, 7)

In [17]:
# How many percent of data is missed

percent_of_missed_data = (206209/3214874)*100
print("{:.2f}%".format(percent_of_missed_data))

6.41%


--> The amount of missing values is not large enough to significantly bias the results of the analysis (6.41%). From this point on, the missing values can be removed or changed to the average, median or interpolate values.

## 05. Adressing the missing values

In [24]:
df_ords['days_since_prior_order'].median()


7.0

In [25]:
# Changing missed values to median 
# Creating a new cleaned df

df_ords_clean = df_ords.copy()
median_days = df_ords_clean['days_since_prior_order'].median()
df_ords_clean['days_since_prior_order'].fillna(median_days, inplace=True)

In [26]:
# Checking the results
df_ords_clean.isnull().sum()

order_id                  0
user_id                   0
eval_set                  0
order_number              0
orders_day_of_week        0
order_hour_of_day         0
days_since_prior_order    0
dtype: int64

In [27]:
df_ords_clean.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,prior,1,2,8,7.0
1,2398795,1,prior,2,3,7,15.0
2,473747,1,prior,3,3,12,21.0
3,2254736,1,prior,4,4,7,29.0
4,431534,1,prior,5,4,15,28.0


--> I chose the method of filling missing values with the median because the percentage of missing values is relatively small (6.41%), which indicates that the impact of these missing values on the analysis results would be minimal. 

--> Using the median, a robust central tendency measure, helps maintain the overall distribution and reduces the potential bias that could arise from using more complex imputation methods for this dataset.

## 06. Running a check for duplicate values

In [36]:
df_dups = df_ords_clean[df_ords_clean.duplicated()]

In [37]:
df_dups.head()

Unnamed: 0,order_id,user_id,eval_set,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order


## 07. Exporting the cleaned dataframes

In [39]:
df_ords_clean.to_csv(os.path.join(path, '02 Data','022 Prepared Data', 'orders_checked.csv'), header=True, index=False)