# Split

In [11]:
import os
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import ks_2samp
from sklearn.model_selection import train_test_split

# Suppress every warning raised from here on: no category or module filter is
# given, so this also hides deprecation notices from the libraries used below.
warnings.filterwarnings('ignore')

# define functions

In [12]:
# While sampling we verify, column by column, that the sample is distributed like
# the original data, using the two-sample Kolmogorov-Smirnov test at a given
# significance level.

def get_sample(df, significance = 0.05, sample_size = 5000, iterations = 100, random_state = None):
    """Draw a row sample of ``df`` whose columns are distributed like the full data.

    Up to ``iterations`` candidate samples are drawn with ``DataFrame.sample``.
    A candidate is accepted when, for every column, the two-sample
    Kolmogorov-Smirnov test between the full column and the sampled column
    gives a p-value (rounded to 3 decimals) that is not below ``significance``.

    Parameters
    ----------
    df : pandas.DataFrame
        Population to sample rows from.
    significance : float, default 0.05
        A column whose rounded p-value is below this threshold rejects the
        candidate sample.
    sample_size : int, default 5000
        Number of rows to draw (without replacement, the ``DataFrame.sample``
        default).
    iterations : int, default 100
        Maximum number of candidate samples to try.
    random_state : None, int or numpy.random.RandomState, default None
        Seed or generator used for the draws. ``None`` keeps the previous
        behaviour of relying on NumPy's global random state.

    Returns
    -------
    pandas.DataFrame
        The first accepted sample (a subset of the rows of ``df``).

    Raises
    ------
    ValueError
        If no candidate is accepted within ``iterations`` attempts, including
        the case ``iterations <= 0``.

    Notes
    -----
    NOTE(review): every column is handed to ``ks_2samp`` as-is, including
    non-numeric columns and columns with missing values -- confirm that this
    is intended for the data passed in.
    """
    # An integer seed is turned into one generator up front: handing the same
    # integer to every ``df.sample`` call would redraw the identical sample.
    rng = random_state
    if isinstance(random_state, (int, np.integer)):
        rng = np.random.RandomState(random_state)

    for attempt in range(1, iterations + 1):
        sample = df.sample(sample_size, random_state=rng)
        for col in range(df.shape[1]):
            metrics = ks_2samp(df.iloc[:, col], np.array(sample.iloc[:, col]))
            pvalue = round(metrics[1], 3)
            if pvalue < significance:
                # This column differs too much from the population: redraw.
                break
        else:
            # No column triggered the break, so every column passed the test.
            print('found sample after {} iterations'.format(attempt))
            return sample

    # Reached when every attempt was rejected, or when no attempt was made.
    raise ValueError("Could not build samples with {} iterations, significance={}, and sample_size={}"
                     .format(iterations, significance, sample_size))

# Define paths and capture data

In [13]:
# Folder layout under ../data: the dataset is read from, and the splits are
# written to, the same intermediate folder; reports have their own folder.
inputs, outputs, reports = (
    os.path.join('..', 'data', stage)
    for stage in ('02_intermediate', '02_intermediate', '06_reporting')
)

# Load the dataset, using the `id` column as the row index.
data = pd.read_csv(
    os.path.join(inputs, 'data.csv'),
    index_col='id',
)

In [14]:
# Report the (rows, columns) shape, then preview the first five records.
print('Dataset dimensions:', data.shape)
data.head(n=5)

Dataset dimensions: (7043, 20)


Unnamed: 0_level_0,gender_male,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,y
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
7590-vhveg,0.0,0,1.0,0.0,1,0.0,no phone service,dsl,0.0,1.0,0.0,0.0,no,0.0,month-to-month,1.0,electronic check,29.85,29.85,0.0
5575-gnvde,1.0,0,0.0,0.0,34,1.0,no,dsl,1.0,0.0,1.0,0.0,no,0.0,one year,0.0,mailed check,56.95,1889.5,0.0
3668-qpybk,1.0,0,0.0,0.0,2,1.0,no,dsl,1.0,1.0,0.0,0.0,no,0.0,month-to-month,1.0,mailed check,53.85,108.15,1.0
7795-cfocw,1.0,0,0.0,0.0,45,0.0,no phone service,dsl,1.0,0.0,1.0,1.0,no,0.0,one year,0.0,bank transfer (automatic),42.3,1840.75,0.0
9237-hqitu,0.0,0,0.0,0.0,2,1.0,no,fiber optic,0.0,0.0,0.0,0.0,no,0.0,month-to-month,1.0,electronic check,70.7,151.65,1.0


# data sampling
If `sampling_data == True`, the project runs faster because it works on a sample of the data instead of the full dataset.

During sampling, we need to check that the distributions in the sample are similar to those in the original data, by setting a significance level and applying the Kolmogorov–Smirnov test. See the function defined at the beginning of the notebook.

In [15]:
from scipy.stats import ks_2samp

In [16]:
# Switch for the optional sampling step: get_sample is only called when this is True.
sampling_data = False

In [17]:
# When sampling is switched on, `data` is replaced by a 5000-row sample that
# passed the distribution check (at most 10 attempts); otherwise it is untouched.
if sampling_data == True:
    data = get_sample(
        data,
        significance=0.05,
        sample_size=5000,
        iterations=10,
    )

# Shape of the dataset that continues through the notebook.
data.shape

(7043, 20)

# final description

In [18]:
# Inspect the last five records of the dataset that goes into the split.
data.tail(n=5)

Unnamed: 0_level_0,gender_male,seniorcitizen,partner,dependents,tenure,phoneservice,multiplelines,internetservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingtv,streamingmovies,contract,paperlessbilling,paymentmethod,monthlycharges,totalcharges,y
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
6840-resvb,1.0,0,1.0,1.0,24,1.0,yes,dsl,1.0,0.0,1.0,1.0,yes,1.0,one year,1.0,mailed check,84.8,1990.5,0.0
2234-xaduh,0.0,0,1.0,1.0,72,1.0,yes,fiber optic,0.0,1.0,1.0,0.0,yes,1.0,one year,1.0,credit card (automatic),103.2,7362.9,0.0
4801-jzazl,0.0,0,1.0,1.0,11,0.0,no phone service,dsl,1.0,0.0,0.0,0.0,no,0.0,month-to-month,1.0,electronic check,29.6,346.45,0.0
8361-ltmkd,1.0,1,1.0,0.0,4,1.0,yes,fiber optic,0.0,0.0,0.0,0.0,no,0.0,month-to-month,1.0,mailed check,74.4,306.6,1.0
3186-ajiek,1.0,0,0.0,0.0,66,1.0,no,fiber optic,1.0,0.0,1.0,1.0,yes,1.0,two year,1.0,bank transfer (automatic),105.65,6844.5,0.0


In [19]:
# Summary statistics of the numeric columns; `count` excludes missing values,
# so a count below the number of rows reveals gaps in that column.
data.describe()

Unnamed: 0,gender_male,seniorcitizen,partner,dependents,tenure,phoneservice,onlinesecurity,onlinebackup,deviceprotection,techsupport,streamingmovies,paperlessbilling,monthlycharges,totalcharges,y
count,7043.0,7043.0,7043.0,7043.0,7043.0,7043.0,5517.0,5517.0,5517.0,5517.0,5517.0,7043.0,7043.0,7032.0,7043.0
mean,0.504756,0.162147,0.483033,0.299588,32.371149,0.903166,0.36596,0.440276,0.439007,0.370491,0.495197,0.592219,64.761692,2283.300441,0.26537
std,0.500013,0.368612,0.499748,0.45811,24.559481,0.295752,0.481742,0.496465,0.496311,0.48298,0.500022,0.491457,30.090047,2266.771362,0.441561
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,18.25,18.8,0.0
25%,0.0,0.0,0.0,0.0,9.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,35.5,401.45,0.0
50%,1.0,0.0,0.0,0.0,29.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,70.35,1397.475,0.0
75%,1.0,0.0,1.0,1.0,55.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,89.85,3794.7375,1.0
max,1.0,1.0,1.0,1.0,72.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,118.75,8684.8,1.0


# Split data
`test_size` can depend on the size of the dataset. For instance, with 1 million entries, `test_size=0.1` would work fine.

In [20]:
# Features: every column except the target.
X = data.drop(columns='y')
print('dimensions of X:', X.shape)

# Target: the `y` column, cast to float.
y = data['y'].astype('float')
print('dimensions of y:', y.shape)

dimensions of X: (7043, 19)
dimensions of y: (7043,)


In [21]:
# Hold out the last 30% of the rows as the test set. With shuffle=False the
# rows keep their original order and no random number generator is involved,
# so `random_state` is deliberately not passed: it only seeds the shuffling,
# which is disabled here, and would wrongly suggest a seeded random split.
# NOTE(review): the split is neither shuffled nor stratified on `y`. If the row
# order carries no meaning, consider shuffle=True with stratify=y and a fixed
# random_state instead -- confirm which behaviour is wanted.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, shuffle=False)
print('dimensions of X_train:', X_train.shape)
print('dimensions of y_train:', y_train.shape)
print('dimensions of X_test:', X_test.shape)
print('dimensions of y_test:', y_test.shape)

dimensions of X_train: (4930, 19)
dimensions of y_train: (4930,)
dimensions of X_test: (2113, 19)
dimensions of y_test: (2113,)


# save train and test sets

In [22]:
# Wrap the targets in single-column frames so the CSVs get a `y` header.
y_train = pd.DataFrame(y_train, columns=['y'])
y_test = pd.DataFrame(y_test, columns=['y'])

# Write each split to the outputs folder (the row index is written as well).
for csv_name, split_frame in (
    ('X_train.csv', X_train),
    ('X_test.csv', X_test),
    ('y_train.csv', y_train),
    ('y_test.csv', y_test),
):
    split_frame.to_csv(os.path.join(outputs, csv_name))