# Split

In [3]:
import os
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import ks_2samp
from sklearn.model_selection import train_test_split

# local packages
import customtools.data_wrangle as dw

# Suppress every warning for the rest of the session so cell outputs stay uncluttered.
# NOTE(review): a blanket 'ignore' also hides deprecation and dtype warnings raised by the
# libraries used below - consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# define functions

# During sampling, we check that each column of the sample is distributed like the original data:
# a two-sample Kolmogorov-Smirnov test is run per column and compared against a significance level.

def get_sample(df, significance = 0.05, sample_size = 5000, iterations = 100, random_state = None):
    """Draw a random sample of ``df`` whose columns pass a KS check against ``df``.

    Up to ``iterations`` samples of ``sample_size`` rows are drawn. For each
    candidate, every column is compared with the same column of ``df`` using the
    two-sample Kolmogorov-Smirnov test (``scipy.stats.ks_2samp``). The first
    candidate for which no column yields a p-value below ``significance`` is
    returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Population to sample from. Columns are handed to ``ks_2samp`` as-is.
        NOTE(review): non-numeric or NaN-containing columns are not filtered
        here - confirm that is intended.
    significance : float, default 0.05
        A candidate is rejected as soon as one column's p-value is strictly
        below this value.
    sample_size : int, default 5000
        Number of rows per candidate sample (forwarded to ``DataFrame.sample``).
    iterations : int, default 100
        Maximum number of candidate samples to draw.
    random_state : int, numpy.random.RandomState or None, default None
        Seed for reproducible sampling. ``None`` keeps the unseeded behaviour.
        An integer is wrapped in a single ``RandomState`` shared by all
        attempts, so successive candidates differ while the overall run is
        repeatable. Any other object is forwarded to ``DataFrame.sample``.

    Returns
    -------
    pandas.DataFrame
        The first accepted sample (rows keep their original index labels).

    Raises
    ------
    ValueError
        If no candidate is accepted within ``iterations`` attempts, including
        the case ``iterations <= 0``.
    """
    # An integer seed passed straight to df.sample would reproduce the *same*
    # rows on every attempt; a shared generator advances between draws instead.
    if isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)
    else:
        rng = random_state

    for attempt in range(1, iterations + 1):
        sample = df.sample(sample_size, random_state=rng)
        for col in range(df.shape[1]):
            # Compare the raw p-value: rounding it first would let values just
            # under the threshold (e.g. 0.0496 vs 0.05) slip through.
            pvalue = ks_2samp(df.iloc[:, col], np.array(sample.iloc[:, col]))[1]
            if pvalue < significance:
                break
        else:
            # No column was rejected: this candidate is acceptable.
            print('found sample after {} iterations'.format(attempt))
            return sample

    # Reached only when every attempt was rejected or no attempt was made.
    raise ValueError("Could not build samples with {} iterations, significane={}, and sample_size={}"
                     .format(iterations, significance, sample_size))

# Define paths and capture data

In [4]:
# The source file is read from, and the train/test files are written to, the same folder.
inputs = outputs = os.path.join('..', 'data', '02_intermediate')
reports = os.path.join('..', 'data', '06_reporting')

# Load the dataset, using the 'id' column as the row index.
data = pd.read_csv(
    os.path.join(inputs, 'data.csv'),
    index_col='id',
)

In [5]:
# Report the frame's shape, then preview its first rows.
print('Dataset dimensions: {}'.format(data.shape))
data.head()

Dataset dimensions: (7043, 20)


Unnamed: 0_level_0,gender_male,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,y
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
7590-vhveg,0.0,0,1.0,0.0,1,0.0,no phone service,dsl,0.0,1.0,0.0,0.0,no,0.0,month-to-month,1.0,electronic check,29.85,29.85,0
5575-gnvde,1.0,0,0.0,0.0,34,1.0,no,dsl,1.0,0.0,1.0,0.0,no,0.0,one year,0.0,mailed check,56.95,1889.5,0
3668-qpybk,1.0,0,0.0,0.0,2,1.0,no,dsl,1.0,1.0,0.0,0.0,no,0.0,month-to-month,1.0,mailed check,53.85,108.15,1
7795-cfocw,1.0,0,0.0,0.0,45,0.0,no phone service,dsl,1.0,0.0,1.0,1.0,no,0.0,one year,0.0,bank transfer (automatic),42.3,1840.75,0
9237-hqitu,0.0,0,0.0,0.0,2,1.0,no,fiber optic,0.0,0.0,0.0,0.0,no,0.0,month-to-month,1.0,electronic check,70.7,151.65,1


# data sampling
If `sampling_data` is `True`, the project runs faster, but every later step works on a sample of the data instead of the full dataset.

During sampling, we need to check that the sample's distributions are similar to those of the original data by setting a significance level and applying the Kolmogorov-Smirnov test. See the function defined at the beginning of the notebook.

In [53]:
from scipy.stats import ks_2samp

In [10]:
# Switch for the sampling step: True replaces the dataset with a sample in the next cell,
# False leaves the full dataset in place.
sampling_data = False

In [11]:
# When sampling is enabled, overwrite `data` with a 5000-row sample
# (significance 0.05, at most 10 attempts).
# NOTE(review): this calls get_sample from customtools.data_wrangle (imported as dw), not the
# get_sample defined in this notebook - confirm which implementation is intended.
# NOTE(review): the stored output shows 5000 rows although sampling_data is False and the file
# was loaded with 7043 rows; presumably the cell was last run with the flag set to True.
# Re-run the notebook from a fresh kernel before relying on the shapes shown.
if sampling_data:
    data = dw.get_sample(data, significance=0.05, sample_size=5000, iterations=10)
data.shape

(5000, 20)

# final description

In [12]:
# Preview the last rows of the frame that will be split.
data.tail()

Unnamed: 0_level_0,gender_male,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,y
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
7330-wzlnc,0.0,0,0.0,0.0,5,1.0,yes,fiber optic,0.0,1.0,0.0,0.0,yes,0.0,month-to-month,1.0,electronic check,90.8,455.5,1
7919-zodzz,0.0,0,1.0,1.0,10,1.0,no,dsl,0.0,1.0,1.0,0.0,no,1.0,one year,1.0,mailed check,65.9,660.05,0
7816-vghto,0.0,0,1.0,1.0,6,0.0,no phone service,dsl,0.0,1.0,1.0,1.0,no,0.0,two year,0.0,mailed check,40.55,217.5,0
8295-fhivv,1.0,0,0.0,0.0,7,1.0,no,no,,,,,no internet service,,month-to-month,1.0,mailed check,19.4,168.65,0
1492-kgeth,1.0,0,1.0,1.0,70,1.0,yes,dsl,1.0,1.0,1.0,1.0,no,1.0,one year,0.0,bank transfer (automatic),78.35,5445.95,0


In [13]:
# Summary statistics for the numeric columns; a count below the row total marks missing values.
data.describe()

Unnamed: 0,gender_male,seniorcitizen,partner,dependents,tenure,phoneservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingmovies,paperlessbilling,monthlycharges,totalcharges,y
count,5000.0,5000.0,5000.0,5000.0,5000.0,5000.0,3905.0,3905.0,3905.0,3905.0,3905.0,5000.0,5000.0,4996.0,5000.0
mean,0.4956,0.1636,0.4898,0.3028,32.4478,0.9036,0.372855,0.441485,0.439181,0.37388,0.496543,0.5908,64.64423,2293.59979,0.2558
std,0.500031,0.369949,0.499946,0.459515,24.581972,0.295169,0.483626,0.496628,0.496351,0.483894,0.500052,0.491735,30.144167,2274.817147,0.436354
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.25,18.8,0.0
25%,0.0,0.0,0.0,0.0,9.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,35.1,399.075,0.0
50%,0.0,0.0,0.0,0.0,29.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,70.3,1402.675,0.0
75%,1.0,0.0,1.0,1.0,56.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,89.85,3808.05,1.0
max,1.0,1.0,1.0,1.0,72.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,118.75,8684.8,1.0


# Split data
The appropriate `test_size` depends on the size of the dataset. For instance, with 1 million entries, `test_size=0.1` would already give a sufficiently large test set.

In [14]:
# Features: every column except the target.
X = data.drop(columns='y')
print('dimensions of X:', X.shape)

# Target: the 'y' column, cast to float.
y = data['y'].astype('float')
print('dimensions of y:', y.shape)

dimensions of X: (5000, 19)
dimensions of y: (5000,)


In [15]:
# Hold out 30% of the rows as the test set.
# shuffle=False keeps the current row order, so the split is purely positional;
# random_state only matters when shuffling and is therefore inert here.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    shuffle=False,
    random_state=42,
)

# Print the shape of each part in the order: X_train, y_train, X_test, y_test.
for label, part in (
    ('X_train', X_train),
    ('y_train', y_train),
    ('X_test', X_test),
    ('y_test', y_test),
):
    print('dimensions of {}:'.format(label), part.shape)

dimensions of X_train: (3500, 19)
dimensions of y_train: (3500,)
dimensions of X_test: (1500, 19)
dimensions of y_test: (1500,)


# save train and test sets

In [60]:
# Turn both target series into single-column DataFrames named 'y'.
y_train, y_test = (
    pd.DataFrame(target, columns=['y'])
    for target in (y_train, y_test)
)

# Write the four parts to the output folder, one CSV each.
for filename, frame in (
    ('X_train.csv', X_train),
    ('X_test.csv', X_test),
    ('y_train.csv', y_train),
    ('y_test.csv', y_test),
):
    frame.to_csv(os.path.join(outputs, filename))