In [1]:
import pandas as pd

In [2]:
# Load the advertising dataset from the project's data folder and show its
# (rows, columns) shape as the cell output.
adverts = pd.read_csv('./data/Advertising.csv')
adverts.shape

(200, 4)

In [3]:
# Display the loaded frame (pandas elides the middle rows in the rendered output).
adverts

Unnamed: 0,TV,Radio,Newspaper,Sales
0,230.1,37.8,69.2,22.1
1,44.5,39.3,45.1,10.4
2,17.2,45.9,69.3,12.0
3,151.5,41.3,58.5,16.5
4,180.8,10.8,58.4,17.9
...,...,...,...,...
195,38.2,3.7,13.8,7.6
196,94.2,4.9,8.1,14.0
197,177.0,9.3,6.4,14.8
198,283.6,42.0,66.2,25.5


In [4]:
# Hold out a reproducible random 20% of the rows as the "unseen" set;
# every row whose index label was not sampled stays in the working frame.
adverts_unseen = adverts.sample(frac=0.2, random_state=42)
df = adverts.drop(index=adverts_unseen.index)

In [5]:
# Shapes of the full frame, the held-out frame and the remainder, in that order.
tuple(frame.shape for frame in (adverts, adverts_unseen, df))

((200, 4), (40, 4), (160, 4))

In [6]:
# Persist both splits to CSV; index=False keeps the row index out of the files.
adverts_unseen.to_csv(path_or_buf='./data/adverts_unseen.csv', index=False)
df.to_csv(path_or_buf='./data/advert_tt.csv', index=False)

## Alternatively

#### We can do the same split using a reusable function

In [7]:
import pandas as pd
import udf

In [8]:
# Re-read the same advertising CSV under the name `data` for the
# function-based version of the split, and show its shape.
data = pd.read_csv('./data/Advertising.csv')
data.shape

(200, 4)

In [9]:
def gen_unseen_pct(data, frac=0.20, random_state=42):
    '''Split a DataFrame into a working set and a randomly sampled "set-aside" set.

    Parameters
    ----------
    data : pandas.DataFrame
        The frame to split. It is not modified; two new frames are returned.
    frac : float, default 0.20
        Fraction of the rows to sample at random into the set-aside frame.
    random_state : int or None, default 42
        Seed forwarded to ``DataFrame.sample`` so the split is reproducible.
        Pass a different value to obtain a different split.

    Returns
    -------
    tuple of (pandas.DataFrame, pandas.DataFrame)
        1st frame is the remainder of ``data`` once the sampled rows are removed.
        2nd frame is the sampled fractional set (i.e. 20% of rows by default).

    Notes
    -----
    The sampled rows are removed by index label, so the two frames partition
    ``data`` exactly only when its index labels are unique.
    '''
    # Forward the caller's seed: it was previously hard-coded to 42 here, which
    # made the ``random_state`` argument have no effect.
    unseen_data = data.sample(frac=frac, random_state=random_state)
    df = data.drop(unseen_data.index)
    return df, unseen_data

In [10]:
# Unpack in the order the helper returns its frames (remainder first, set-aside
# second), assuming udf.gen_unseen_pct matches the definition shown in this notebook.
# Binding the remainder to `df` instead of overwriting `data` keeps this cell
# re-runnable and stops the lines below from relying on a `df` left over from an
# earlier cell.
df, unseen_data = udf.gen_unseen_pct(data=data, frac=0.20, random_state=42)
unseen_data.shape, df.shape

((40, 4), (160, 4))

In [11]:
# Persist the set-aside frame and the working frame to CSV; index=False keeps
# the row index out of the files.
unseen_data.to_csv(path_or_buf='./data/unseen_adverts.csv', index=False)
df.to_csv(path_or_buf='./data/advert_tt2.csv', index=False)