In [1]:
from math import sqrt
from scipy import stats
from pydataset import data
from datetime import datetime


from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
import sklearn.metrics

%matplotlib inline
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from env import host, user, password, sql_connect
from wrangle import wrangle_telco

## Exercise II 

#### Question 1: Acquire data from the telco_churn database into a pandas dataframe, selecting the customer_id, monthly_charges, tenure, and total_charges columns and filtering for customers on two-year contracts.

In [2]:
# Select only the four columns this exercise needs, limited to rows with
# contract_type_id = 3 (per Question 1, the two-year contract customers).
sql_query = '''
Select customer_id, monthly_charges, tenure, total_charges
from customers
where contract_type_id = 3;
'''

# sql_connect comes from env; presumably it builds the connection for the
# named database -- confirm in env.py.
df = pd.read_sql(
    sql=sql_query,
    con=sql_connect('telco_churn'),
)

In [3]:
# Preview the first five rows to check the columns returned by the query.
df.head()

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges
0,0013-SMEOE,109.7,71,7904.25
1,0014-BMAQU,84.65,63,5377.8
2,0016-QLJIS,90.45,65,5957.9
3,0017-DINOC,45.2,54,2460.55
4,0017-IUDMW,116.8,72,8456.75


In [4]:
# Row and column counts of the acquired frame.
df.shape

(1695, 4)

#### Question 2: Using your acquired Telco data, walk through the summarization and cleaning steps in your wrangle.ipynb file like we did above. You may handle the missing values however you feel is appropriate and meaningful; remember to document your process and decisions using markdown and code commenting where helpful.

In [5]:
# Summary statistics, transposed so that each column becomes a row.
# total_charges does not appear in this table: a first hint that it was not
# read in as a numeric column.
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
monthly_charges,1695.0,60.770413,34.678865,18.4,24.025,64.35,90.45,118.75
tenure,1695.0,56.735103,18.209363,0.0,48.0,64.0,71.0,72.0


In [6]:
# Inspect dtypes and non-null counts. total_charges should be numeric, so an
# object dtype here suggests some values are not parseable numbers.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1695 entries, 0 to 1694
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   customer_id      1695 non-null   object 
 1   monthly_charges  1695 non-null   float64
 2   tenure           1695 non-null   int64  
 3   total_charges    1695 non-null   object 
dtypes: float64(1), int64(1), object(2)
memory usage: 53.1+ KB


In [7]:
# total_charges is still text at this point, so the sort is a string sort;
# that brings the blank entries to the top of the display.
df.sort_values('total_charges')

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges
524,3213-VVOLG,25.35,0,
416,2520-SGTTA,20.00,0,
678,4075-WKNIU,73.35,0,
234,1371-DWPAZ,56.05,0,
941,5709-LVOEQ,80.85,0,
...,...,...,...,...
731,4526-EXKKN,24.60,40,973.95
442,2675-IJRGJ,19.65,53,978
582,3521-HTQTV,26.10,34,980.35
714,4361-FEBGN,20.15,48,982.95


In [8]:
# total_charges was read in as text (object dtype in df.info()) because some
# entries are blank. The blank rows displayed in the sort above all have
# tenure 0 -- presumably customers who have not been billed yet -- so a
# charge of 0 is used for them.
#
# Only entries that are empty or entirely whitespace are replaced. Spaces
# inside any other value are left alone, so a malformed number raises in
# astype(float) instead of being silently rewritten. On a column that is
# already float the replace matches nothing, so re-running this cell is safe.
df['total_charges'] = (
    df['total_charges']
    .replace(r'^\s*$', '0', regex=True)
    .astype(float)
)

In [9]:
# Same sort after the conversion: total_charges is float now, so the order
# is numeric and the formerly blank rows show up as 0.00.
df.sort_values('total_charges')

Unnamed: 0,customer_id,monthly_charges,tenure,total_charges
678,4075-WKNIU,73.35,0,0.00
1293,7644-OMVMY,19.85,0,0.00
716,4367-NUYAO,25.75,0,0.00
234,1371-DWPAZ,56.05,0,0.00
726,4472-LVYGI,52.55,0,0.00
...,...,...,...,...
1679,9924-JPRMC,118.20,72,8547.15
1504,8879-XUAHX,116.25,71,8564.75
1657,9788-HNGUT,116.95,72,8594.40
1649,9739-JLPQJ,117.50,72,8670.10


In [10]:
# Confirm total_charges is now float64 and that the non-null counts are unchanged.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1695 entries, 0 to 1694
Data columns (total 4 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   customer_id      1695 non-null   object 
 1   monthly_charges  1695 non-null   float64
 2   tenure           1695 non-null   int64  
 3   total_charges    1695 non-null   float64
dtypes: float64(2), int64(1), object(1)
memory usage: 53.1+ KB


In [11]:
# NOTE(review): df.info() above reports a default RangeIndex, so reset_index()
# here only copies those positions into a new `index` column (visible in the
# output below). Re-running this cell would add yet another index column.
# wrangle_telco() returns the same `index` column, so this is left as is --
# confirm whether that column is actually wanted.
df = df.reset_index()
df.head()

Unnamed: 0,index,customer_id,monthly_charges,tenure,total_charges
0,0,0013-SMEOE,109.7,71,7904.25
1,1,0014-BMAQU,84.65,63,5377.8
2,2,0016-QLJIS,90.45,65,5957.9
3,3,0017-DINOC,45.2,54,2460.55
4,4,0017-IUDMW,116.8,72,8456.75


In [13]:
# Call the wrangle_telco() helper imported from the wrangle module and preview
# its result, to compare it with the frame built step by step above.
telco_df = wrangle_telco()  ##testing function
telco_df.head()

Unnamed: 0,index,customer_id,monthly_charges,tenure,total_charges
0,0,0013-SMEOE,109.7,71,7904.25
1,1,0014-BMAQU,84.65,63,5377.8
2,2,0016-QLJIS,90.45,65,5957.9
3,3,0017-DINOC,45.2,54,2460.55
4,4,0017-IUDMW,116.8,72,8456.75


In [14]:
# Check the helper's output: row count, dtypes and non-null counts should
# match the manually cleaned df.
telco_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1695 entries, 0 to 1694
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   index            1695 non-null   int64  
 1   customer_id      1695 non-null   object 
 2   monthly_charges  1695 non-null   float64
 3   tenure           1695 non-null   int64  
 4   total_charges    1695 non-null   float64
dtypes: float64(2), int64(2), object(1)
memory usage: 66.3+ KB
