# Sample Data in Python

In [9]:
import pandas as pd
# Load the dataset from a CSV file located relative to the notebook's working directory.
vehicles = pd.read_csv("vehicles.csv")
# Preview the first rows to check the columns and values that were read.
vehicles.head()

Unnamed: 0,citympg,cylinders,displacement,drive,highwaympg,make,model,class,year,transmissiontype,transmissionspeeds,co2emissions
0,14.0,6,4.1,2-Wheel Drive,19.0,Buick,Electra/Park Avenue,Large Cars,1984,Automatic,4,555.4375
1,14.0,8,5.0,2-Wheel Drive,20.0,Buick,Electra/Park Avenue,Large Cars,1984,Automatic,4,555.4375
2,18.0,8,5.7,2-Wheel Drive,26.0,Buick,Electra/Park Avenue,Large Cars,1984,Automatic,4,484.761905
3,21.0,6,4.3,Rear-Wheel Drive,31.0,Cadillac,Fleetwood/DeVille (FWD),Large Cars,1984,Automatic,4,424.166667
4,14.0,8,4.1,Rear-Wheel Drive,19.0,Cadillac,Brougham/DeVille (RWD),Large Cars,1984,Automatic,4,555.4375


Isolate the `co2emissions` column, which serves as the dependent (response) variable.

In [2]:
# Name of the column used as the response (dependent) variable.
response = "co2emissions"
# Selecting with a one-element list keeps the result a single-column
# DataFrame rather than collapsing it to a Series.
y = vehicles.loc[:, [response]]
y.head()

Unnamed: 0,co2emissions
0,555.4375
1,555.4375
2,484.761905
3,424.166667
4,555.4375


Create a list of all column names in the dataset.

In [3]:
# Start from every column name in the dataset, response column included.
predictors = vehicles.columns.tolist()
predictors

['citympg',
 'cylinders',
 'displacement',
 'drive',
 'highwaympg',
 'make',
 'model',
 'class',
 'year',
 'transmissiontype',
 'transmissionspeeds',
 'co2emissions']

Remove `co2emissions` from the list; the remaining columns are the independent variables.

In [4]:
# Rebuild the list without the response column rather than calling
# predictors.remove(response): list.remove mutates the list in place and
# raises ValueError when this cell is executed a second time, whereas this
# filter gives the same result on every run.
predictors = [column for column in predictors if column != response]
predictors

['citympg',
 'cylinders',
 'displacement',
 'drive',
 'highwaympg',
 'make',
 'model',
 'class',
 'year',
 'transmissiontype',
 'transmissionspeeds']

Create and preview a new DataFrame containing only the independent variables.

In [8]:
# Independent variables: all rows, restricted to the columns named in `predictors`.
x = vehicles.loc[:, predictors]
x.head()

Unnamed: 0,citympg,cylinders,displacement,drive,highwaympg,make,model,class,year,transmissiontype,transmissionspeeds
0,14.0,6,4.1,2-Wheel Drive,19.0,Buick,Electra/Park Avenue,Large Cars,1984,Automatic,4
1,14.0,8,5.0,2-Wheel Drive,20.0,Buick,Electra/Park Avenue,Large Cars,1984,Automatic,4
2,18.0,8,5.7,2-Wheel Drive,26.0,Buick,Electra/Park Avenue,Large Cars,1984,Automatic,4
3,21.0,6,4.3,Rear-Wheel Drive,31.0,Cadillac,Fleetwood/DeVille (FWD),Large Cars,1984,Automatic,4
4,14.0,8,4.1,Rear-Wheel Drive,19.0,Cadillac,Brougham/DeVille (RWD),Large Cars,1984,Automatic,4


## Split data using Simple Random Sampling

In [10]:
from sklearn.model_selection import train_test_split

In [11]:
# Simple random split using the default test proportion. Fixing random_state
# pins the shuffle so the same partitions are produced on every run
# (e.g. after Restart Kernel -> Run All).
x_train, x_test, y_train, y_test = train_test_split(x, y, random_state=1234)

In [12]:
# (rows, columns) of the training predictors.
x_train.shape

(27734, 11)

In [13]:
# (rows, columns) of the training response; the row count should match x_train.
y_train.shape

(27734, 1)

In [14]:
# (rows, columns) of the held-out test predictors.
x_test.shape

(9245, 11)

In [15]:
# (rows, columns) of the held-out test response; the row count should match x_test.
y_test.shape

(9245, 1)

By default, `train_test_split` holds out 25% of the rows as the test set. The `test_size` argument overrides that proportion.

In [16]:
# Hold out 40% of the rows for testing instead of the default proportion.
# random_state pins the shuffle so the split can be reproduced on re-run.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.4, random_state=1234
)
x_test.shape

(14792, 11)

## Split data using Stratified Random Sampling

In [17]:
# Seeded, purely random 1% hold-out (no stratification).
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.01,
    random_state=1234,
)

In [20]:
# Relative frequency of each drive type across all rows of the predictors.
x["drive"].value_counts(normalize=True)

Rear-Wheel Drive     0.356797
Front-Wheel Drive    0.353552
All-Wheel Drive      0.239893
4-Wheel Drive        0.036480
2-Wheel Drive        0.013278
Name: drive, dtype: float64

In [21]:
# Relative frequency of each drive type within the current test set.
x_test["drive"].value_counts(normalize=True)

Front-Wheel Drive    0.364865
Rear-Wheel Drive     0.332432
All-Wheel Drive      0.248649
4-Wheel Drive        0.035135
2-Wheel Drive        0.018919
Name: drive, dtype: float64

In [23]:
# Same seeded 1% hold-out, but stratified on the drive column so each drive
# type is represented in the partitions in proportion to the full data.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.01,
    random_state=1234,
    stratify=x["drive"],
)

In [24]:
# Relative frequency of each drive type within the current test set.
x_test["drive"].value_counts(normalize=True)

Rear-Wheel Drive     0.356757
Front-Wheel Drive    0.354054
All-Wheel Drive      0.240541
4-Wheel Drive        0.035135
2-Wheel Drive        0.013514
Name: drive, dtype: float64

With stratification on `drive`, the proportions of each drive type in the test set now closely match those of the full dataset.