## Creating a random sample

In [1]:
import pandas as pd 
import numpy as np
import os 

In [4]:
# Root folder of the project. Set the INSTACART_PROJECT_DIR environment variable to run this
# notebook on another machine; when it is not set, the original local folder is used.
path = os.environ.get('INSTACART_PROJECT_DIR', r'C:\Users\Günay\Documents\Instacart Basket Analysis')

In [8]:
# Locate the raw order/product file inside the project folder, then load it into df.
prior_csv = os.path.join(path, '02 Data', 'Original Data', 'order_products__prior.csv')
df = pd.read_csv(prior_csv)

In [9]:
# Preview the first five rows to confirm the CSV was parsed into the expected columns.
df.head()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered
0,2,33120,1,1
1,2,28985,2,1
2,2,9327,3,0
3,2,45918,4,1
4,2,30035,5,0


In [10]:
# (rows, columns) of the full dataset - the row count is the population the sample is drawn from.
df.shape

(32434489, 4)

#### Creating the random split

In [11]:
# Seed the generator so the same rows are selected every time the notebook is re-run.
rng = np.random.default_rng(42)

# Create a new column called 'split' holding one random integer per row.
# The upper bound is exclusive, so integers(1, 101) yields values from 1 to 100 inclusive (not 0 to 100).
df['split'] = rng.integers(1, 101, size=len(df))

# size=len(df) means I want to have as many numbers as the df has rows.
# My idea is to use these numbers as percentage buckets from which to select a certain portion:
# keeping the rows with split <= 20 selects roughly 20% of the population, which should be fine.

In [12]:
df.head() # Confirm that the new 'split' column was added as the last column

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,split
0,2,33120,1,1,2
1,2,28985,2,1,48
2,2,9327,3,0,76
3,2,45918,4,1,81
4,2,30035,5,0,50


In [13]:
# Check the last five rows as well, to confirm 'split' is filled in through the end of the frame.
df.tail()

Unnamed: 0,order_id,product_id,add_to_cart_order,reordered,split
32434484,3421083,39678,6,1,65
32434485,3421083,11352,7,0,75
32434486,3421083,4600,8,0,35
32434487,3421083,24852,9,1,2
32434488,3421083,5020,10,1,33


In [14]:
df['split'].value_counts(dropna = False) 
# dropna=False lists NaN as its own entry, so any missing values in 'split' would show up here
# (which would mean the assignment didn't work). Expected: 100 distinct values with roughly equal counts.

46    325983
76    325728
19    325519
89    325437
44    325374
       ...  
26    323458
59    323455
93    323327
56    323318
38    323098
Name: split, Length: 100, dtype: int64

In [15]:
df['split'].describe()
# Range check on the generated values: min should be 1 and max should be 100.
# Anything outside that range (e.g. a max > 100) would mean the column was generated incorrectly.
# If the range looks fine, the subset can be created next.

count    3.243449e+07
mean     5.049665e+01
std      2.886378e+01
min      1.000000e+00
25%      2.500000e+01
50%      5.000000e+01
75%      7.500000e+01
max      1.000000e+02
Name: split, dtype: float64

In [16]:
# Boolean mask marking the rows whose random 'split' value is 20 or lower.
# The comparison is <= 20, so the value 20 itself is included.
in_sample = df['split'] <= 20

# Subset df with the mask. Since 'split' holds the values 1-100 in roughly equal numbers,
# this keeps about 20% of the rows of df.
df_sample = df[in_sample]

In [17]:
df_sample.shape # Final check: about 6.5 million rows is expected, i.e. roughly 20% of the 32.4 million rows in df.

(6487173, 5)

Don't forget to export and save the new dataframes! 

In [18]:
# Write the sample to the 'Prepared Data' folder of the project.
# NOTE(review): to_csv is called with its defaults, so the DataFrame index and the helper
# 'split' column are written to the file as well - confirm that downstream notebooks expect them.
sample_csv = os.path.join(path, '02 Data', 'Prepared Data', 'orders_products_prior_sample.csv')
df_sample.to_csv(sample_csv)