In [10]:
import pandas as pd
import numpy as np

import seaborn as sns
import matplotlib.pyplot as plt
import plotly.express as px

import os
import missingno as msno

from sklearn.model_selection import train_test_split,cross_val_score,GridSearchCV,RandomizedSearchCV
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier,AdaBoostClassifier,VotingClassifier,GradientBoostingClassifier,BaggingClassifier
from xgboost import XGBClassifier

# Dataset Context : 

## Context
"Predict behavior to retain customers. You can analyze all relevant customer data and develop focused customer retention programs." [IBM Sample Data Sets]

## Content
Each row represents a customer; each column contains one of the customer’s attributes, as described in the Columns Metadata table below.

The data set includes information about:

- Customers who left within the last month – the column is called Churn
- Services that each customer has signed up for – phone, multiple lines, internet, online security, online backup, device protection, tech support, and streaming TV and movies
- Customer account information – how long they’ve been a customer, contract, payment method, paperless billing, monthly charges, and total charges
- Demographic info about customers – gender, age range, and if they have partners and dependents


## Columns Metadata  :

| Column | Description | Potential Values |
| :--- | :--- | :--- |
| **customerID** | Unique identifier for the customer | String |
| **gender** | Whether the customer is a male or a female | Male, Female |
| **SeniorCitizen** | Whether the customer is a senior citizen or not | 1, 0 |
| **Partner** | Whether the customer has a partner or not | Yes, No |
| **Dependents** | Whether the customer has dependents or not | Yes, No |
| **tenure** | Number of months the customer has stayed with the company | Numeric |
| **PhoneService** | Whether the customer has a phone service or not | Yes, No |
| **MultipleLines** | Whether the customer has multiple lines or not | Yes, No, No phone service |
| **InternetService** | Customer’s internet service provider | DSL, Fiber optic, No |
| **OnlineSecurity** | Whether the customer has online security or not | Yes, No, No internet service |
| **OnlineBackup** | Whether the customer has online backup or not | Yes, No, No internet service |
| **DeviceProtection** | Whether the customer has device protection or not | Yes, No, No internet service |
| **TechSupport** | Whether the customer has tech support or not | Yes, No, No internet service |
| **StreamingTV** | Whether the customer has streaming TV or not | Yes, No, No internet service |
| **StreamingMovies** | Whether the customer has streaming movies or not | Yes, No, No internet service |
| **Contract** | The contract term of the customer | Month-to-month, One year, Two year |
| **PaperlessBilling** | Whether the customer has paperless billing or not | Yes, No |
| **PaymentMethod** | The customer’s payment method | Electronic check, Mailed check, Bank transfer (automatic), Credit card (automatic) |
| **MonthlyCharges** | The amount charged to the customer monthly | Numeric |
| **TotalCharges** | The total amount charged to the customer | Numeric |
| **Churn** | Whether the customer churned or not | Yes, No |


In [16]:
# Load the Telco churn dataset into `df`.
# The path is relative to the notebook's working directory: '..' moves one
# level up before descending into the 'datasets' folder.
file_path = os.path.join('..', 'datasets', 'telco-churn-dataset.csv')

try:
    df = pd.read_csv(file_path)
except FileNotFoundError as e:
    # Fail loudly with the resolved path. Every later cell needs `df`, so
    # printing and continuing would only resurface as a confusing NameError.
    # Other errors (e.g. parsing problems) propagate unchanged with their
    # original traceback.
    raise FileNotFoundError(
        f'Dataset not found at {os.path.abspath(file_path)!r}. '
        'Check the working directory and the datasets folder.'
    ) from e



In [None]:
df.shape # (n_rows, n_columns): 7043 rows and 21 columns to work with

(7043, 21)

In [None]:
# Preview the first five rows to check how the columns and values were parsed
df.head() 

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,Female,0,Yes,No,1,No,No phone service,DSL,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,No
1,5575-GNVDE,Male,0,No,No,34,Yes,No,DSL,Yes,...,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,No
2,3668-QPYBK,Male,0,No,No,2,Yes,No,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,Yes
3,7795-CFOCW,Male,0,No,No,45,No,No phone service,DSL,Yes,...,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.3,1840.75,No
4,9237-HQITU,Female,0,No,No,2,Yes,No,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Electronic check,70.7,151.65,Yes


In [19]:
# Inspect the last five rows as well, e.g. to spot malformed trailing records
df.tail()

Unnamed: 0,customerID,gender,SeniorCitizen,Partner,Dependents,tenure,PhoneService,MultipleLines,InternetService,OnlineSecurity,...,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
7038,6840-RESVB,Male,0,Yes,Yes,24,Yes,Yes,DSL,Yes,...,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.8,1990.5,No
7039,2234-XADUH,Female,0,Yes,Yes,72,Yes,Yes,Fiber optic,No,...,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.2,7362.9,No
7040,4801-JZAZL,Female,0,Yes,Yes,11,No,No phone service,DSL,Yes,...,No,No,No,No,Month-to-month,Yes,Electronic check,29.6,346.45,No
7041,8361-LTMKD,Male,1,Yes,No,4,Yes,Yes,Fiber optic,No,...,No,No,No,No,Month-to-month,Yes,Mailed check,74.4,306.6,Yes
7042,3186-AJIEK,Male,0,No,No,66,Yes,No,Fiber optic,Yes,...,Yes,Yes,Yes,Yes,Two year,Yes,Bank transfer (automatic),105.65,6844.5,No


In [20]:
# No anomalies detected so far