# EDA

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Importing data

In [2]:
# Directory holding the train/validation/test CSV files, relative to this notebook.
# Defined once so the location only needs to change in a single place.
DATA_DIR = '../0_sql_container/datasets'

# Load each split into its own DataFrame.
train = pd.read_csv(f'{DATA_DIR}/train.csv')
validation = pd.read_csv(f'{DATA_DIR}/validation.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')

## Data type


### String Variable Analysis

In [3]:
# Names of the object-dtype columns in the training set.
string_columns = train.select_dtypes(include=object).columns
print(string_columns)

Index(['Churn Category', 'Churn Reason', 'City', 'Contract', 'Country',
       'Customer ID', 'Customer Status', 'Gender', 'Internet Type', 'Lat Long',
       'Offer', 'Payment Method', 'Quarter', 'State'],
      dtype='object')


In [4]:
# Preview the first five rows of the object-dtype columns (head() defaults to 5 rows).
string_frame = train.select_dtypes(include=object)
string_frame.head()

Unnamed: 0,Churn Category,Churn Reason,City,Contract,Country,Customer ID,Customer Status,Gender,Internet Type,Lat Long,Offer,Payment Method,Quarter,State
0,,,San Mateo,Two Year,United States,4526-ZJJTM,Stayed,Female,Fiber Optic,"37.538309, -122.305109",,Bank Withdrawal,Q3,California
1,,,Sutter Creek,Month-to-Month,United States,5302-BDJNT,Stayed,Male,Fiber Optic,"38.432145, -120.77069",,Bank Withdrawal,Q3,California
2,,,Santa Cruz,Month-to-Month,United States,5468-BPMMO,Stayed,Male,,"37.007882, -122.065975",,Bank Withdrawal,Q3,California
3,,,Brea,One Year,United States,2212-LYASK,Stayed,Male,DSL,"33.924143, -117.79387",,Credit Card,Q3,California
4,Dissatisfaction,Network reliability,San Jose,One Year,United States,0378-XSZPU,Churned,Male,Cable,"37.311088, -121.961786",Offer B,Credit Card,Q3,California


Looking at the string variables, I have some interesting questions:
1. What is the main churn reason?
2. What are the main cities the company covers?
3. Does the company operate only in the USA?
4. Do we have locations with higher customer churn?
5. What is the most popular contract?
6. Do we have more female or male customers?
7. What is our main internet product?
8. How is the churn by product?
9. What is the most popular payment method over time?
10. Is there a relation between churn and payment method?

In [25]:
# Frequency of each top-level churn category.
category_counts = train['Churn Category'].value_counts()
print('Churn main reason: ', '\n', category_counts, '\n')

# Frequency of each detailed churn reason, counted within its churn category.
reason_counts = train.groupby('Churn Category')['Churn Reason'].value_counts()
print('Churn detailed reason: ', '\n', reason_counts)

Churn main reason:  
 Churn Category
Competitor         486
Attitude           200
Dissatisfaction    171
Other              134
Price              130
Name: count, dtype: int64 

Churn detailed reason:  
 Churn Category   Churn Reason                             
Attitude         Attitude of support person                   136
                 Attitude of service provider                  64
Competitor       Competitor had better devices                188
                 Competitor made better offer                 187
                 Competitor offered more data                  57
                 Competitor offered higher download speeds     54
Dissatisfaction  Network reliability                           46
                 Product dissatisfaction                       42
                 Service dissatisfaction                       32
                 Limited range of services                     25
                 Lack of self-service on Website               17
         

As expected, the main churn reason is competitors, specifically better devices and better offers. This makes sense for the telecom industry, where products are relatively similar. My bet was that customer service would play an important role in differentiating this company and retaining clients. However, the second most common churn category is 'Attitude' (attitude of the support person or of the service provider), so customer service currently appears among the causes of churn as well.

In [35]:
## TODO: write a SQLite query to filter cities where 'Customer Status' = 'Stayed'

In [None]:
## TODO: write a SQLite query to filter countries where 'Customer Status' = 'Stayed'

In [37]:
# Count customers for every (contract type, customer status) combination,
# most frequent combination first.
train.value_counts(subset=['Contract', 'Customer Status'])

Contract        Customer Status
Two Year        Stayed             1088
Month-to-Month  Churned             997
                Stayed              947
One Year        Stayed              797
Month-to-Month  Joined              249
One Year        Churned              97
Two Year        Churned              27
                Joined               13
One Year        Joined               10
Name: count, dtype: int64

5. What is the most popular contract?
6. Do we have more female or male customers?
7. What is our main internet product?
8. How is the churn by product?
9. What is the most popular payment method over time?
10. Is there a relation between churn and payment method?

### Numeric Variable Analysis

In [5]:
# Names of the numeric-dtype columns in the training set.
numeric_columns = train.select_dtypes(include=np.number).columns
print(numeric_columns)


Index(['Age', 'Avg Monthly GB Download', 'Avg Monthly Long Distance Charges',
       'Churn Score', 'CLTV', 'Dependents', 'Device Protection Plan',
       'Internet Service', 'Latitude', 'Longitude', 'Married',
       'Monthly Charge', 'Multiple Lines', 'Number of Dependents',
       'Number of Referrals', 'Online Backup', 'Online Security',
       'Paperless Billing', 'Partner', 'Phone Service', 'Population',
       'Premium Tech Support', 'Referred a Friend', 'Satisfaction Score',
       'Senior Citizen', 'Streaming Movies', 'Streaming Music', 'Streaming TV',
       'Tenure in Months', 'Total Charges', 'Total Extra Data Charges',
       'Total Long Distance Charges', 'Total Refunds', 'Total Revenue',
       'Under 30', 'Unlimited Data', 'Zip Code', 'Churn'],
      dtype='object')


### Numeric variable preview

In [6]:
# Preview the first five rows of the numeric columns (head() defaults to 5 rows).
numeric_frame = train.select_dtypes(include=np.number)
numeric_frame.head()


Unnamed: 0,Age,Avg Monthly GB Download,Avg Monthly Long Distance Charges,Churn Score,CLTV,Dependents,Device Protection Plan,Internet Service,Latitude,Longitude,...,Tenure in Months,Total Charges,Total Extra Data Charges,Total Long Distance Charges,Total Refunds,Total Revenue,Under 30,Unlimited Data,Zip Code,Churn
0,72,4,19.44,51,4849,0,1,1,37.538309,-122.305109,...,25,2191.15,0,486.0,0.0,2677.15,0,1,94403,0
1,27,59,45.62,27,3715,0,1,1,38.432145,-120.77069,...,35,3418.2,0,1596.7,0.0,5014.9,1,1,95685,0
2,59,0,16.07,59,5092,0,0,0,37.007882,-122.065975,...,46,851.2,0,739.22,0.0,1590.42,0,0,95064,0
3,25,27,0.0,49,2068,0,1,1,33.924143,-117.79387,...,27,1246.4,30,0.0,0.0,1276.4,1,0,92823,0
4,31,21,17.22,88,4026,0,1,1,37.311088,-121.961786,...,58,3563.8,0,998.76,0.0,4562.56,0,1,95117,1


## Null values


## Statistics

In this section I'll explore the datasets' basic statistics. This means I'll look at the distributions to see how the variables behave and to check for the presence of outliers.
- Statistics (mean, outliers, boxplot, correlation)

In [10]:
# Summary statistics for every non-object column, rounded to two decimals.
train.describe(exclude=[object]).round(2)

Unnamed: 0,Age,Avg Monthly GB Download,Avg Monthly Long Distance Charges,Churn Score,CLTV,Dependents,Device Protection Plan,Internet Service,Latitude,Longitude,...,Tenure in Months,Total Charges,Total Extra Data Charges,Total Long Distance Charges,Total Refunds,Total Revenue,Under 30,Unlimited Data,Zip Code,Churn
count,4225.0,4225.0,4225.0,4225.0,4225.0,4225.0,4225.0,4225.0,4225.0,4225.0,...,4225.0,4225.0,4225.0,4225.0,4225.0,4225.0,4225.0,4225.0,4225.0,4225.0
mean,46.45,20.74,22.77,58.28,4409.75,0.23,0.35,0.79,36.21,-119.77,...,32.68,2306.08,6.97,754.72,1.95,3065.81,0.2,0.68,93491.89,0.27
std,16.73,20.37,15.43,21.2,1170.6,0.42,0.48,0.41,2.47,2.15,...,24.62,2271.45,25.25,854.91,7.81,2884.82,0.4,0.47,1863.34,0.44
min,19.0,0.0,0.0,5.0,2003.0,0.0,0.0,0.0,32.56,-124.3,...,1.0,18.8,0.0,0.0,0.0,21.36,0.0,0.0,90001.0,0.0
25%,32.0,4.0,9.05,40.0,3493.0,0.0,0.0,1.0,33.99,-121.79,...,9.0,401.5,0.0,67.68,0.0,592.75,0.0,0.0,92102.0,0.0
50%,46.0,17.0,22.57,61.0,4531.0,0.0,0.0,1.0,36.21,-119.62,...,30.0,1424.6,0.0,396.64,0.0,2151.47,0.0,1.0,93513.0,0.0
75%,60.0,27.0,36.17,75.0,5381.0,0.0,1.0,1.0,38.2,-117.99,...,56.0,3846.75,0.0,1200.0,0.0,4845.75,0.0,1.0,95337.0,1.0
max,80.0,85.0,49.99,96.0,6500.0,1.0,1.0,1.0,41.96,-114.19,...,72.0,8672.45,150.0,3564.0,49.79,11979.34,1.0,1.0,96150.0,1.0


In [7]:
# Show the first five rows of the full training set (all column types).
train.head(n=5)

Unnamed: 0,Age,Avg Monthly GB Download,Avg Monthly Long Distance Charges,Churn Category,Churn Reason,Churn Score,City,CLTV,Contract,Country,...,Tenure in Months,Total Charges,Total Extra Data Charges,Total Long Distance Charges,Total Refunds,Total Revenue,Under 30,Unlimited Data,Zip Code,Churn
0,72,4,19.44,,,51,San Mateo,4849,Two Year,United States,...,25,2191.15,0,486.0,0.0,2677.15,0,1,94403,0
1,27,59,45.62,,,27,Sutter Creek,3715,Month-to-Month,United States,...,35,3418.2,0,1596.7,0.0,5014.9,1,1,95685,0
2,59,0,16.07,,,59,Santa Cruz,5092,Month-to-Month,United States,...,46,851.2,0,739.22,0.0,1590.42,0,0,95064,0
3,25,27,0.0,,,49,Brea,2068,One Year,United States,...,27,1246.4,30,0.0,0.0,1276.4,1,0,92823,0
4,31,21,17.22,Dissatisfaction,Network reliability,88,San Jose,4026,One Year,United States,...,58,3563.8,0,998.76,0.0,4562.56,0,1,95117,1
