# Data Acquisition Exercises

In [1]:
import pandas as pd
import numpy as np
import os

# acquire
from env import get_connection
from pydataset import data
import seaborn as sns

#### 4. In a Jupyter notebook, classification_exercises.ipynb, use a Python module containing datasets (pydataset or seaborn datasets) as the source for the iris data. Create a pandas dataframe, df_iris, from this data.

    a. print the first 3 rows
    b. print the number of rows and columns (shape)
    c. print the column names
    d. print the data type of each column
    e. print the summary statistics for each of the numeric variables


In [2]:
# Load iris data set
df_iris = data('iris')
df_iris.head(3)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa


### a. Print the first 3 rows.

In [3]:
df_iris.head(3)

Unnamed: 0,Sepal.Length,Sepal.Width,Petal.Length,Petal.Width,Species
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa


### b. Print the number of rows and columns (shape).

In [4]:
df_iris.shape

(150, 5)

### c. Print the column names.

In [5]:
df_iris.columns

Index(['Sepal.Length', 'Sepal.Width', 'Petal.Length', 'Petal.Width',
       'Species'],
      dtype='object')

### d. Print the data type of each column.

In [6]:
# Data type of each column.
# NOTE: this bare expression is not the last statement of the cell, so Jupyter
# does not display it; only the .info() printout appears in the cell output.
df_iris.dtypes

# Print the index range, per-column non-null counts, dtypes and memory usage.
df_iris.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 150 entries, 1 to 150
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Sepal.Length  150 non-null    float64
 1   Sepal.Width   150 non-null    float64
 2   Petal.Length  150 non-null    float64
 3   Petal.Width   150 non-null    float64
 4   Species       150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 7.0+ KB


### e. Print the summary statistics for each of the numeric variables. 

In [7]:
# .T is the transpose attribute: it flips the describe() result so that each
# numeric variable becomes a row, which makes the summary easier to read.
stats = df_iris.describe().T
stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Sepal.Length,150.0,5.843333,0.828066,4.3,5.1,5.8,6.4,7.9
Sepal.Width,150.0,3.057333,0.435866,2.0,2.8,3.0,3.3,4.4
Petal.Length,150.0,3.758,1.765298,1.0,1.6,4.35,5.1,6.9
Petal.Width,150.0,1.199333,0.762238,0.1,0.3,1.3,1.8,2.5


#### 5. Read the data from a Google sheet into a dataframe, df_google.

    a. print the first 3 rows
    b. print the number of rows and columns
    c. print the column names
    d. print the data type of each column
    e. print the summary statistics for each of the numeric variables
    f. print the unique values for each of your categorical variables

[use this google sheet](https://docs.google.com/spreadsheets/d/1Uhtml8KY19LILuZsrDtlsHHDC9wuDGUSe8LTEwvdI5g/edit#gid=341089357)

In [8]:
# Grab the Google sheet url.
sheet_url = 'https://docs.google.com/spreadsheets/d/1Uhtml8KY19LILuZsrDtlsHHDC9wuDGUSe8LTEwvdI5g/edit#gid=341089357'

In [9]:
# Turn Google sheet address into a CSV export URL.
csv_export_url = sheet_url.replace('/edit#gid=', '/export?format=csv&gid=')

In [10]:
# Read in the data using the pandas `pd.read_csv()` function.
df_google = pd.read_csv(csv_export_url)

### a. Print the first 3 rows.

In [11]:
df_google.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


### b. Print the number of rows and columns.

In [12]:
df_google.shape

(891, 12)

### c. Print the column names.

In [13]:
df_google.columns.to_list()

['PassengerId',
 'Survived',
 'Pclass',
 'Name',
 'Sex',
 'Age',
 'SibSp',
 'Parch',
 'Ticket',
 'Fare',
 'Cabin',
 'Embarked']

### d. Print the data type of each column.

In [14]:
df_google.dtypes

PassengerId      int64
Survived         int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object

### e. Print the summary statistics for each of the numeric variables.

- Some of these numeric columns are really like encoded categorical values; at this stage, I'm just noting this observation.

In [15]:
df_google.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
PassengerId,891.0,446.0,257.353842,1.0,223.5,446.0,668.5,891.0
Survived,891.0,0.383838,0.486592,0.0,0.0,0.0,1.0,1.0
Pclass,891.0,2.308642,0.836071,1.0,2.0,3.0,3.0,3.0
Age,714.0,29.699118,14.526497,0.42,20.125,28.0,38.0,80.0
SibSp,891.0,0.523008,1.102743,0.0,0.0,0.0,1.0,8.0
Parch,891.0,0.381594,0.806057,0.0,0.0,0.0,0.0,6.0
Fare,891.0,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292


### f. Print the unique values for each of your categorical variables.

- Some of these categorical variable columns have a ton of unique values, so I'll check the number first. If I want to see the unique values, I can do a `.value_counts()` on individual columns.


In [16]:
# Report the number of distinct values for every column stored as object dtype;
# columns of any other dtype are skipped.
for col in df_google.columns:
    if df_google[col].dtypes != 'object':
        continue
    print(f'{col} has {df_google[col].nunique()} unique values.')

Name has 891 unique values.
Sex has 2 unique values.
Ticket has 681 unique values.
Cabin has 147 unique values.
Embarked has 3 unique values.


In [17]:
df_google.Survived.value_counts(dropna=False)

0    549
1    342
Name: Survived, dtype: int64

In [18]:
df_google.Pclass.value_counts(dropna=False)

3    491
1    216
2    184
Name: Pclass, dtype: int64

In [19]:
df_google.Sex.value_counts(dropna=False)

male      577
female    314
Name: Sex, dtype: int64

In [20]:
df_google.Embarked.value_counts(dropna=False)

S      644
C      168
Q       77
NaN      2
Name: Embarked, dtype: int64

#### 6. Download the previous exercise's file as an Excel workbook (File → Download → Microsoft Excel). Read the downloaded file into a dataframe named df_excel.

    a. assign the first 100 rows to a new dataframe, df_excel_sample
    b. print the number of rows of your original dataframe
    c. print the first 5 column names
    d. print the column names that have a data type of object
    e. compute the range for each of the numeric variables.


In [21]:
df_excel = pd.read_excel('train.xlsx', sheet_name='train')

### a. Assign the first 100 rows to a new dataframe, `df_excel_sample`.

In [22]:
df_excel_sample = df_excel.head(100)

In [23]:
df_excel_sample.shape

(100, 12)

### b. Print the number of rows of your original dataframe.

In [24]:
df_excel.shape[0]

891

### c. Print the first 5 column names.

In [25]:
df_excel.columns[:5]

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex'], dtype='object')

### d. Print the column names that have a data type of object.

In [26]:
df_excel.select_dtypes(include='object').head(0)

Unnamed: 0,Name,Sex,Ticket,Cabin,Embarked


In [27]:
df_excel.select_dtypes(include='object').columns.to_list()

['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked']

### e. Compute the range for each of the numeric variables.

In [28]:
# Some of these numeric columns are more like encoded categorical variables.

df_excel.head(3)

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Thayer)",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S


In [29]:
# The two columns I want to know the range on are 'Age' and 'Fare'
# I can select just the true numeric variables to declutter my results.

titanic_stats = df_excel[['Age', 'Fare']].describe().T
titanic_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Age,714.0,29.699118,14.526497,0.42,20.125,28.0,38.0,80.0
Fare,891.0,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292


In [30]:
titanic_stats['range'] = titanic_stats['max'] - titanic_stats['min']
titanic_stats

Unnamed: 0,count,mean,std,min,25%,50%,75%,max,range
Age,714.0,29.699118,14.526497,0.42,20.125,28.0,38.0,80.0,79.58
Fare,891.0,32.204208,49.693429,0.0,7.9104,14.4542,31.0,512.3292,512.3292


---

<div class="alert alert-block alert-warning">

# Data Preparation Exercises

<div class="alert alert-block alert-success">

#### Using the Iris Data:
1. Use the function defined in acquire.py to load the iris data.
2. Drop the species_id and measurement_id columns.
3. Rename the species_name column to just species.
4. Create dummy variables of the species name and concatenate onto the iris dataframe. (This is for practice, we don't always have to encode the target, but if we used species as a feature, we would need to encode it).
5. Create a function named prep_iris that accepts the untransformed iris data, and returns the data with the transformations above applied.

<div class="alert alert-block alert-info">

1. Use the function defined in acquire.py to load the iris data.

In [31]:
from acquire import new_iris_data

In [32]:
df_iris = new_iris_data()
df_iris.head()

Unnamed: 0,species_id,measurement_id,species_name,sepal_length,sepal_width,petal_length,petal_width
0,1,1,setosa,5.1,3.5,1.4,0.2
1,1,2,setosa,4.9,3.0,1.4,0.2
2,1,3,setosa,4.7,3.2,1.3,0.2
3,1,4,setosa,4.6,3.1,1.5,0.2
4,1,5,setosa,5.0,3.6,1.4,0.2


<div class="alert alert-block alert-info">

2. Drop the species_id and measurement_id columns.

In [33]:
new_df_iris = df_iris.drop(columns=['species_id', 'measurement_id'])

In [34]:
new_df_iris.head()

Unnamed: 0,species_name,sepal_length,sepal_width,petal_length,petal_width
0,setosa,5.1,3.5,1.4,0.2
1,setosa,4.9,3.0,1.4,0.2
2,setosa,4.7,3.2,1.3,0.2
3,setosa,4.6,3.1,1.5,0.2
4,setosa,5.0,3.6,1.4,0.2


<div class="alert alert-block alert-info">

3. Rename the species_name column to just species.

In [35]:
# reassign dataframe with renamed column
new_df_iris = new_df_iris.rename(columns = {'species_name': 'species'})
# take a look
new_df_iris.head()

Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width
0,setosa,5.1,3.5,1.4,0.2
1,setosa,4.9,3.0,1.4,0.2
2,setosa,4.7,3.2,1.3,0.2
3,setosa,4.6,3.1,1.5,0.2
4,setosa,5.0,3.6,1.4,0.2


<div class="alert alert-block alert-info">

4. Create dummy variables of the species name and concatenate onto the iris dataframe. (This is for practice, we don't always have to encode the target, but if we used species as a feature, we would need to encode it).

In [36]:
# ADAM SAID DON'T DO IT!

<div class="alert alert-block alert-info">

5. Create a function named prep_iris that accepts the untransformed iris data, and returns the data with the transformations above applied.

In [37]:
# accepts the untransformed iris dataframe and returns it with the prep steps applied
def prep_iris(df):
    """Prepare the raw iris dataframe.

    Drops the 'species_id' and 'measurement_id' columns and renames
    'species_name' to 'species'. The input dataframe is not modified;
    a new dataframe is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw iris data containing 'species_id' and 'measurement_id' columns.

    Returns
    -------
    pandas.DataFrame
        The transformed copy of ``df``.
    """
    id_columns = ['species_id', 'measurement_id']
    return (
        df
        .drop(columns=id_columns)
        .rename(columns={'species_name': 'species'})
    )

In [38]:
# try new function 'prep_iris'
prepped_iris = prep_iris(new_iris_data())

# take a look
prepped_iris.head()

Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width
0,setosa,5.1,3.5,1.4,0.2
1,setosa,4.9,3.0,1.4,0.2
2,setosa,4.7,3.2,1.3,0.2
3,setosa,4.6,3.1,1.5,0.2
4,setosa,5.0,3.6,1.4,0.2


<div class="alert alert-block alert-success">

#### Using the Titanic dataset
1. Use the function defined in acquire.py to load the Titanic data.
2. Drop any unnecessary, unhelpful, or duplicated columns.
3. Encode the categorical columns. Create dummy variables of the categorical columns and concatenate them onto the dataframe.
4. Create a function named prep_titanic that accepts the raw titanic data, and returns the data with the transformations above applied.


<div class="alert alert-block alert-info">

1. Use the function defined in acquire.py to load the Titanic data.

In [39]:
from acquire import new_titanic_data

In [40]:
#use acquire module to import titanic data and assign to variable
titanic = new_titanic_data()
titanic.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


<div class="alert alert-block alert-info">

2. Drop any unnecessary, unhelpful, or duplicated columns.

In [41]:
new_titanic = titanic.drop(columns = ['class', 'embarked', 'passenger_id', 'deck', 'age'])
new_titanic.head()

Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,embark_town,alone
0,0,3,male,1,0,7.25,Southampton,0
1,1,1,female,1,0,71.2833,Cherbourg,0
2,1,3,female,0,0,7.925,Southampton,1
3,1,1,female,1,0,53.1,Southampton,0
4,0,3,male,0,0,8.05,Southampton,1


In [42]:
# check for nulls
new_titanic.isnull().sum()

survived       0
pclass         0
sex            0
sibsp          0
parch          0
fare           0
embark_town    2
alone          0
dtype: int64

In [43]:
# Drop the rows that contain nulls and keep the result: dropna() returns a new
# frame rather than modifying new_titanic, so it has to be assigned back.
new_titanic = new_titanic.dropna()
new_titanic

Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,embark_town,alone
0,0,3,male,1,0,7.2500,Southampton,0
1,1,1,female,1,0,71.2833,Cherbourg,0
2,1,3,female,0,0,7.9250,Southampton,1
3,1,1,female,1,0,53.1000,Southampton,0
4,0,3,male,0,0,8.0500,Southampton,1
...,...,...,...,...,...,...,...,...
886,0,2,male,0,0,13.0000,Southampton,1
887,1,1,female,0,0,30.0000,Southampton,1
888,0,3,female,1,2,23.4500,Southampton,0
889,1,1,male,0,0,30.0000,Cherbourg,1


<div class="alert alert-block alert-info">

3. Encode the categorical columns. Create dummy variables of the categorical columns and concatenate them onto the dataframe.

In [44]:
# DON'T DO IT!

<div class="alert alert-block alert-info">

4. Create a function named prep_titanic that accepts the raw titanic data, and returns the data with the transformations above applied.

In [45]:
def prep_titanic(titanic):
    """Prepare the raw titanic dataframe.

    Removes the 'class', 'embarked', 'passenger_id', 'deck' and 'age'
    columns, then drops every row that still contains a null value.
    The input dataframe is not modified; a new dataframe is returned.

    Parameters
    ----------
    titanic : pandas.DataFrame
        Raw titanic data containing the five columns listed above.

    Returns
    -------
    pandas.DataFrame
        The reduced dataframe with no null values.
    """
    unwanted = ['class', 'embarked', 'passenger_id', 'deck', 'age']
    trimmed = titanic.drop(columns=unwanted)
    # Nulls are removed only after the column drop, so nulls in the dropped
    # columns do not cost any rows.
    return trimmed.dropna()

In [46]:
# take a look
prepped_titanic = prep_titanic(new_titanic_data())
prepped_titanic.head()

Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,embark_town,alone
0,0,3,male,1,0,7.25,Southampton,0
1,1,1,female,1,0,71.2833,Cherbourg,0
2,1,3,female,0,0,7.925,Southampton,1
3,1,1,female,1,0,53.1,Southampton,0
4,0,3,male,0,0,8.05,Southampton,1


<div class="alert alert-block alert-success">

#### Using the Telco dataset
1. Use the function defined in acquire.py to load the Telco data.
2. Drop any unnecessary, unhelpful, or duplicated columns. This could mean dropping foreign key columns but keeping the corresponding string values, for example.
3. Encode the categorical columns. Create dummy variables of the categorical columns and concatenate them onto the dataframe.
4. Create a function named prep_telco that accepts the raw telco data, and returns the data with the transformations above applied.

<div class="alert alert-block alert-info">

1. Use the function defined in acquire.py to load the Telco data.

In [47]:
from acquire import new_telco_data

In [48]:
telco = new_telco_data()
telco.head()

Unnamed: 0,payment_type_id,internet_service_type_id,contract_type_id,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,2,1,2,0002-ORFBO,Female,0,Yes,Yes,9,Yes,...,Yes,Yes,No,Yes,65.6,593.3,No,One year,DSL,Mailed check
1,2,1,1,0003-MKNFE,Male,0,No,No,9,Yes,...,No,No,Yes,No,59.9,542.4,No,Month-to-month,DSL,Mailed check
2,1,2,1,0004-TLHLJ,Male,0,No,No,4,Yes,...,No,No,No,Yes,73.9,280.85,Yes,Month-to-month,Fiber optic,Electronic check
3,1,2,1,0011-IGKFF,Male,1,Yes,No,13,Yes,...,No,Yes,Yes,Yes,98.0,1237.85,Yes,Month-to-month,Fiber optic,Electronic check
4,2,2,1,0013-EXCHZ,Female,1,Yes,No,3,Yes,...,Yes,Yes,No,Yes,83.9,267.4,Yes,Month-to-month,Fiber optic,Mailed check


<div class="alert alert-block alert-info">

2. Drop any unnecessary, unhelpful, or duplicated columns. This could mean dropping foreign key columns but keeping the corresponding string values, for example.

In [49]:
new_telco = telco.drop(columns=['payment_type_id', 'internet_service_type_id', 'contract_type_id'])
new_telco.head()

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,0002-ORFBO,Female,0,Yes,Yes,9,Yes,No,No,Yes,...,Yes,Yes,No,Yes,65.6,593.3,No,One year,DSL,Mailed check
1,0003-MKNFE,Male,0,No,No,9,Yes,Yes,No,No,...,No,No,Yes,No,59.9,542.4,No,Month-to-month,DSL,Mailed check
2,0004-TLHLJ,Male,0,No,No,4,Yes,No,No,No,...,No,No,No,Yes,73.9,280.85,Yes,Month-to-month,Fiber optic,Electronic check
3,0011-IGKFF,Male,1,Yes,No,13,Yes,No,No,Yes,...,No,Yes,Yes,Yes,98.0,1237.85,Yes,Month-to-month,Fiber optic,Electronic check
4,0013-EXCHZ,Female,1,Yes,No,3,Yes,No,No,No,...,Yes,Yes,No,Yes,83.9,267.4,Yes,Month-to-month,Fiber optic,Mailed check


<div class="alert alert-block alert-info">

3. Encode the categorical columns. Create dummy variables of the categorical columns and concatenate them onto the dataframe.

In [50]:
# DON'T DO IT!

<div class="alert alert-block alert-info">

4. Create a function named prep_telco that accepts the raw telco data, and returns the data with the transformations above applied.

In [51]:
def prep_telco(telco):
    """Prepare the raw telco dataframe.

    Removes the three '*_type_id' columns ('payment_type_id',
    'internet_service_type_id', 'contract_type_id'). The input dataframe
    is not modified; a new dataframe is returned.

    Parameters
    ----------
    telco : pandas.DataFrame
        Raw telco data containing the three id columns listed above.

    Returns
    -------
    pandas.DataFrame
        ``telco`` without those id columns.
    """
    type_id_columns = ['payment_type_id', 'internet_service_type_id', 'contract_type_id']
    return telco.drop(columns=type_id_columns)

In [52]:
prepped_telco = prep_telco(new_telco_data())
prepped_telco.head()

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
0,0002-ORFBO,Female,0,Yes,Yes,9,Yes,No,No,Yes,...,Yes,Yes,No,Yes,65.6,593.3,No,One year,DSL,Mailed check
1,0003-MKNFE,Male,0,No,No,9,Yes,Yes,No,No,...,No,No,Yes,No,59.9,542.4,No,Month-to-month,DSL,Mailed check
2,0004-TLHLJ,Male,0,No,No,4,Yes,No,No,No,...,No,No,No,Yes,73.9,280.85,Yes,Month-to-month,Fiber optic,Electronic check
3,0011-IGKFF,Male,1,Yes,No,13,Yes,No,No,Yes,...,No,Yes,Yes,Yes,98.0,1237.85,Yes,Month-to-month,Fiber optic,Electronic check
4,0013-EXCHZ,Female,1,Yes,No,3,Yes,No,No,No,...,Yes,Yes,No,Yes,83.9,267.4,Yes,Month-to-month,Fiber optic,Mailed check


<div class="alert alert-block alert-success">

#### Split your data
1. Write a function to split your data into train, test and validate datasets. Add this function to prepare.py.
2. Run the function in your notebook on the Iris dataset, returning 3 datasets, train_iris, validate_iris and test_iris.
3. Run the function on the Titanic dataset, returning 3 datasets, train_titanic, validate_titanic and test_titanic.
4. Run the function on the Telco dataset, returning 3 datasets, train_telco, validate_telco and test_telco.

In [53]:
from sklearn.model_selection import train_test_split

<div class="alert alert-block alert-info">

1. Write a function to split your data into train, test and validate datasets. Add this function to prepare.py.

In [54]:
def split_data(df, target):
    """Split a dataframe into stratified train, validate and test sets.

    The rows are first split 80/20 into a working set and a held-out test
    set; the working set is then split 70/30 into train and validate. The
    result is roughly 56% train, 24% validate and 20% test of the original
    rows. Both splits are stratified on ``target`` and use random_state=42,
    so repeated calls on the same data return the same partitions.

    Parameters
    ----------
    df : pandas.DataFrame
        Data to split.
    target : str
        Name of the column in ``df`` whose class proportions are preserved
        in every subset.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(train, val, test)`` in that order.
    """
    # Hold out 20% of the rows as the test set.
    train_val, test = train_test_split(
        df, train_size=0.8, random_state=42, stratify=df[target]
    )
    # Carve the validate set (30%) out of the remaining working set.
    train, val = train_test_split(
        train_val, train_size=0.7, random_state=42, stratify=train_val[target]
    )
    return train, val, test

<div class="alert alert-block alert-info">

2. Run the function in your notebook on the Iris dataset, returning 3 datasets, train_iris, validate_iris and test_iris.

In [55]:
# Split the prepped iris data, stratifying on 'species'.
# NOTE(review): the exercise asks for train_iris / validate_iris / test_iris; the
# generic names used here are reassigned by the Titanic and Telco splits below.
train, validate, test = split_data(prepped_iris, target='species')
# take a look
train.head()

Unnamed: 0,species,sepal_length,sepal_width,petal_length,petal_width
138,virginica,6.0,3.0,4.8,1.8
7,setosa,5.0,3.4,1.5,0.2
79,versicolor,5.7,2.6,3.5,1.0
74,versicolor,6.4,2.9,4.3,1.3
97,versicolor,6.2,2.9,4.3,1.3


In [56]:
train.shape, validate.shape, test.shape

((84, 5), (36, 5), (30, 5))

<div class="alert alert-block alert-info">

3. Run the function on the Titanic dataset, returning 3 datasets, train_titanic, validate_titanic and test_titanic.

In [61]:
# Split the prepped titanic data, stratifying on 'survived'.
# NOTE(review): the exercise asks for train_titanic / validate_titanic / test_titanic;
# reusing train / validate / test here replaces the iris split held in those names.
train, validate, test = split_data(prepped_titanic, target='survived')
# take a look
train.head()

Unnamed: 0,survived,pclass,sex,sibsp,parch,fare,embark_town,alone
474,0,3,female,0,0,9.8375,Southampton,1
167,0,3,female,1,4,27.9,Southampton,0
271,1,3,male,0,0,0.0,Southampton,1
887,1,1,female,0,0,30.0,Southampton,1
118,0,1,male,0,1,247.5208,Cherbourg,0


In [62]:
train.shape, validate.shape, test.shape

((497, 8), (214, 8), (178, 8))

<div class="alert alert-block alert-info">

4. Run the function on the Telco dataset, returning 3 datasets, train_telco, validate_telco and test_telco.

In [63]:
# Split the prepped telco data, stratifying on 'churn'.
# NOTE(review): the exercise asks for train_telco / validate_telco / test_telco;
# reusing train / validate / test here replaces the titanic split held in those names.
train, validate, test = split_data(prepped_telco, target='churn')
# take a look
train.head()

Unnamed: 0,customer_id,gender,senior_citizen,partner,dependents,tenure,phone_service,multiple_lines,online_security,online_backup,...,tech_support,streaming_tv,streaming_movies,paperless_billing,monthly_charges,total_charges,churn,contract_type,internet_service_type,payment_type
6832,9705-IOVQQ,Male,1,Yes,Yes,61,Yes,Yes,No,Yes,...,No,Yes,Yes,No,99.0,5969.3,No,One year,Fiber optic,Electronic check
433,0635-WKOLD,Male,0,Yes,No,40,Yes,Yes,No,Yes,...,Yes,Yes,No,No,70.75,2921.75,No,One year,DSL,Credit card (automatic)
25,0032-PGELS,Female,0,Yes,Yes,1,No,No phone service,Yes,No,...,No,No,No,No,30.5,30.5,Yes,Month-to-month,DSL,Bank transfer (automatic)
3196,4581-LNWUM,Female,0,No,No,13,No,No phone service,No,No,...,No,Yes,Yes,No,49.15,649.4,No,Month-to-month,DSL,Electronic check
4466,6297-NOOPG,Female,0,Yes,No,70,Yes,Yes,No,Yes,...,Yes,Yes,Yes,Yes,110.5,7752.05,No,Two year,Fiber optic,Electronic check


In [64]:
train.shape, validate.shape, test.shape

((3943, 21), (1691, 21), (1409, 21))