# Table of contents
# 1 Importing libraries
# 2 Importing data files
# 3 Mixed type data in test frame
# 4 Finding missing values in df_prods
# 5 Finding and addressing duplicates in df_prods
# 6 Exporting dataframes
# 7 Task
## Question 2: Run the df.describe() function on your df_ords dataframe
## Question 3: Check for mixed-type data in your df_ords dataframe
## Question 4: If you find mixed-type data, fix it. 
## Question 5: Check for missing values in df_ords dataframe
## Question 6: Address the missing values using an appropriate method
## Question 7: Check for duplicate values in your df_ords data
## Question 8: Address the duplicates using an appropriate method
## Question 9: Export your final, cleaned df_prods and df_ords data as “.csv” files 

# 1 Importing libraries

In [4]:
import pandas as pd
import numpy as np
import os

# 2 Importing data files

In [5]:
## Project root folder (machine-specific absolute path); the data file locations below are joined onto it
path = r'C:\Users\Eva\Documents\Instacart Basket Analysis'

In [6]:
## Load the original products data set
df_prods = pd.read_csv(
    os.path.join(path, '02 Data', 'Original Data', 'products.csv'),
    index_col=False,
)

In [7]:
## Load the previously wrangled orders data set
df_ords = pd.read_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_wrangled.csv'),
    index_col=False,
)

# 3 Mixed type data in test frame

In [8]:
## Create an empty dataframe; a column with mixed-type data is added to it in the next cell
df_test = pd.DataFrame()

In [9]:
## Add a column that deliberately mixes str, int and bool values
df_test['mix'] = ['a', 'b', 1, True]

In [10]:
## Show the first rows of the test dataframe
df_test.head()

Unnamed: 0,mix
0,a
1,b
2,1
3,True


In [11]:
## Checking for mixed type: print the name of every column that holds more than one Python type.
## Series.map(type) is used instead of DataFrame.applymap, which is deprecated since pandas 2.1.
## Counting the distinct types also works on an empty dataframe (no first row is needed).
for col in df_test.columns.tolist():
    if df_test[col].map(type).nunique() > 1:
        print(col)
    

mix


In [12]:
## Convert the mixed-type column to string so that all values share one type
df_test['mix'] = df_test['mix'].astype(str)

# 4 Finding missing values in df_prods

In [13]:
## Count the missing values per column
df_prods.isna().sum()

product_id        0
product_name     16
aisle_id          0
department_id     0
prices            0
dtype: int64

In [14]:
## Subset of the rows whose product_name is missing
df_nan = df_prods[df_prods['product_name'].isna()]

In [15]:
## Display the rows with a missing product_name
df_nan

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
33,34,,121,14,12.2
68,69,,26,7,11.8
115,116,,93,3,10.8
261,262,,110,13,12.1
525,525,,109,11,1.2
1511,1511,,84,16,14.3
1780,1780,,126,11,12.3
2240,2240,,52,1,14.2
2586,2586,,104,13,12.4
3159,3159,,126,11,13.1


In [16]:
## 16 missing values were found, all of them in the product_name column (see the counts above).

In [17]:
## Shape (rows, columns) of the dataframe that still contains the missing values
df_prods.shape

(49693, 5)

In [18]:
## Keep only the rows that have a product_name (drops the rows with missing values)
df_prods_clean = df_prods[df_prods['product_name'].notna()]

In [19]:
## Shape (rows, columns) of the dataframe without missing values
df_prods_clean.shape

(49677, 5)

# 5 Finding and addressing duplicates in df_prods

In [20]:
## Collect every row that is a full duplicate of an earlier row
df_dups = df_prods_clean.loc[df_prods_clean.duplicated()]

In [21]:
## Display the duplicated product rows
df_dups

Unnamed: 0,product_id,product_name,aisle_id,department_id,prices
462,462,Fiber 4g Gummy Dietary Supplement,70,11,4.8
18459,18458,Ranger IPA,27,5,9.2
26810,26808,Black House Coffee Roasty Stout Beer,27,5,13.4
35309,35306,Gluten Free Organic Peanut Butter & Chocolate ...,121,14,6.8
35495,35491,Adore Forever Body Wash,127,11,9.9


In [22]:
## 5 duplicates are found.

In [23]:
## Shape (rows, columns) of the dataframe before the duplicates are dropped
df_prods_clean.shape

(49677, 5)

In [24]:
## Drop fully duplicated rows, keeping the first occurrence of each
df_prods_clean_no_dups = df_prods_clean.drop_duplicates(keep='first')

In [25]:
## Shape (rows, columns) of the dataframe after the duplicates were dropped
df_prods_clean_no_dups.shape

(49672, 5)

# 6 Exporting dataframes

In [45]:
## Export the cleaned products data without the row index
df_prods_clean_no_dups.to_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'products_checked.csv'),
    index=False,
)

# 7 Task

## Question 2: Run the df.describe() function on your df_ords dataframe. Using your new knowledge about how to interpret the output of this function, share in a markdown cell whether anything about the data looks off or should be investigated further.

In [27]:
## Summary statistics of the numeric columns in df_ords
df_ords.describe()

Unnamed: 0.1,Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
count,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3421083.0,3214874.0
mean,1710541.0,1710542.0,102978.2,17.15486,2.776219,13.45202,11.11484
std,987581.7,987581.7,59533.72,17.73316,2.046829,4.226088,9.206737
min,0.0,1.0,1.0,1.0,0.0,0.0,0.0
25%,855270.5,855271.5,51394.0,5.0,1.0,10.0,4.0
50%,1710541.0,1710542.0,102689.0,11.0,3.0,13.0,7.0
75%,2565812.0,2565812.0,154385.0,23.0,5.0,16.0,15.0
max,3421082.0,3421083.0,206209.0,100.0,6.0,23.0,30.0


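### The "Unnamed: 0" column can be dropped because it is unnecessary (it appears to be a leftover copy of the row index, running from 0 to 3421082). The count of days_since_prior_order (3,214,874) is lower than that of the other columns (3,421,083), which points to missing values; these are investigated in Question 5. Apart from that, everything seems to be okay.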

In [28]:
## Remove the unnecessary "Unnamed: 0" column
df_ords = df_ords.drop('Unnamed: 0', axis=1)

## Question 3: Check for mixed-type data in your df_ords dataframe

In [29]:
## Checking for mixed type: print the name of every column that holds more than one Python type.
## Series.map(type) is used instead of DataFrame.applymap, which is deprecated since pandas 2.1.
## Counting the distinct types also works on an empty dataframe (no first row is needed).
for col in df_ords.columns.tolist():
    if df_ords[col].map(type).nunique() > 1:
        print(col)
    

### There is no mixed type data.

## Question 4: If you find mixed-type data, fix it. 

### There is no mixed type data.

## Question 5: Check for missing values in df_ords dataframe

In [30]:
## Count the missing values per column
df_ords.isna().sum()

order_id                       0
user_id                        0
order_number                   0
orders_day_of_week             0
order_hour_of_day              0
days_since_prior_order    206209
dtype: int64

In [31]:
## Subset of the orders whose days_since_prior_order is missing
df_ords_nan = df_ords[df_ords['days_since_prior_order'].isna()]

In [32]:
## Display the orders with a missing days_since_prior_order
df_ords_nan

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
11,2168274,2,1,2,11,
26,1374495,3,1,1,14,
39,3343014,4,1,6,11,
45,2717275,5,1,3,12,
...,...,...,...,...,...,...
3420930,969311,206205,1,4,12,
3420934,3189322,206206,1,3,18,
3421002,2166133,206207,1,6,19,
3421019,2227043,206208,1,1,15,


In [33]:
## All orders placed by the user with user_id 1
df_ords.loc[df_ords['user_id'] == 1]

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
5,3367565,1,6,2,7,19.0
6,550135,1,7,1,9,20.0
7,3108588,1,8,1,14,14.0
8,2295261,1,9,1,16,0.0
9,2550362,1,10,4,8,30.0


In [34]:
## All orders placed by the user with user_id 2
df_ords.loc[df_ords['user_id'] == 2]

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
11,2168274,2,1,2,11,
12,1501582,2,2,5,10,10.0
13,1901567,2,3,1,10,3.0
14,738281,2,4,2,10,8.0
15,1673511,2,5,3,11,8.0
16,1199898,2,6,2,9,13.0
17,3194192,2,7,2,12,14.0
18,788338,2,8,1,15,27.0
19,1718559,2,9,2,9,8.0
20,1447487,2,10,1,11,6.0


In [35]:
## All orders placed by the user with user_id 3
df_ords.loc[df_ords['user_id'] == 3]

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
26,1374495,3,1,1,14,
27,444309,3,2,3,19,9.0
28,3002854,3,3,3,16,21.0
29,2037211,3,4,2,18,20.0
30,2710558,3,5,0,17,12.0
31,1972919,3,6,0,16,7.0
32,1839752,3,7,0,15,7.0
33,3225766,3,8,0,17,7.0
34,3160850,3,9,0,16,7.0
35,676467,3,10,3,16,17.0


### There are 206209 missing values in the column "days_since_prior_order". As the examples above show, a missing value in this column means that the customer placed their very first order (order_number = 1), so there is no prior order from which a value could have been calculated.

## Question 6: Address the missing values using an appropriate method

In [36]:
## Replace the missing days_since_prior_order values with 0.
## The result is assigned back to the column: calling fillna(..., inplace=True) on a
## column selection is chained assignment, which is deprecated in pandas 2.x and no
## longer updates df_ords once Copy-on-Write is enabled.
## NOTE: 0 also occurs as a genuine value in this column (a reorder on the same day).
df_ords['days_since_prior_order'] = df_ords['days_since_prior_order'].fillna(0)

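### I imputed 0 for the NaN values so that every order has a numeric value. Note that 0 also occurs as a genuine value (a reorder on the same day, e.g. order_number 9 of user_id 1), so first orders should be identified via order_number = 1 rather than via days_since_prior_order = 0.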

In [37]:
## Show the first 30 rows to check the imputed values
df_ords.head(30)

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order
0,2539329,1,1,2,8,0.0
1,2398795,1,2,3,7,15.0
2,473747,1,3,3,12,21.0
3,2254736,1,4,4,7,29.0
4,431534,1,5,4,15,28.0
5,3367565,1,6,2,7,19.0
6,550135,1,7,1,9,20.0
7,3108588,1,8,1,14,14.0
8,2295261,1,9,1,16,0.0
9,2550362,1,10,4,8,30.0


## Question 7: Check for duplicate values in your df_ords data

In [38]:
## Collect every order row that is a full duplicate of an earlier row
df_dups = df_ords.loc[df_ords.duplicated()]

In [39]:
## Display the duplicated order rows
df_dups

Unnamed: 0,order_id,user_id,order_number,orders_day_of_week,order_hour_of_day,days_since_prior_order


### No duplicate data was found.

## Question 8: Address the duplicates using an appropriate method

### No duplicate data was found, so I don't need to do anything.

## Question 9: Export your final, cleaned df_prods and df_ords data as “.csv” files in your “Prepared Data” folder and give them appropriate, succinct names.

In [44]:
## Export the cleaned orders data without the row index
df_ords.to_csv(
    os.path.join(path, '02 Data', 'Prepared Data', 'orders_checked.csv'),
    index=False,
)