In [32]:
# Importing the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split


In [38]:
# Read the demographics workbook from ../Dataset (relative to the working directory)
demographics_path = '../Dataset/Telco_customer_churn_demographics.xlsx'
dataset1 = pd.read_excel(demographics_path)

In [39]:
# A second Excel workbook supplies a column that will be joined onto the
# demographics data, so read it from ../Dataset as well.
churn_path = '../Dataset/Telco_customer_churn.xlsx'
dataset2 = pd.read_excel(churn_path)

In [42]:
# Rename dataset2's key column from 'CustomerID' to 'Customer ID' so that both
# frames share the same key name for the join.
# The result is re-bound instead of using inplace=True: the cell stays safe to
# re-run (rename skips labels that are already gone) and avoids mutating the
# loaded frame in place.
dataset2 = dataset2.rename(columns={'CustomerID': 'Customer ID'})

In [43]:
# Join the two datasets on the column 'Customer ID' (inner join: only IDs present
# in both frames are kept).
# Non-key columns that exist in both frames receive the default suffixes:
# _x for dataset1 and _y for dataset2 (e.g. Gender_x / Gender_y).
# validate='one_to_one' makes pandas raise a MergeError if a Customer ID is
# duplicated on either side, instead of silently multiplying rows.
dataset = pd.merge(dataset1, dataset2, on='Customer ID', how='inner',
                   validate='one_to_one')

In [44]:
# Count the missing values in every column of the merged frame
dataset.isna().sum()

Customer ID                0
Count_x                    0
Gender_x                   0
Age                        0
Under 30                   0
Senior Citizen_x           0
Married                    0
Dependents_x               0
Number of Dependents       0
Count_y                    0
Country                    0
State                      0
City                       0
Zip Code                   0
Lat Long                   0
Latitude                   0
Longitude                  0
Gender_y                   0
Senior Citizen_y           0
Partner                    0
Dependents_y               0
Tenure Months              0
Phone Service              0
Multiple Lines             0
Internet Service           0
Online Security            0
Online Backup              0
Device Protection          0
Tech Support               0
Streaming TV               0
Streaming Movies           0
Contract                   0
Paperless Billing          0
Payment Method             0
Monthly Charge

In [48]:
# Sanity-check the merge: Gender_x and Gender_y hold the same attribute from the
# two source frames, so the number of rows where they disagree should be 0.
difference = dataset['Gender_x'].ne(dataset['Gender_y'])
difference.sum()

0

In [49]:
# List the dtype of every column in the merged frame to see which columns are
# numeric and which are non-numeric (object)
dataset.dtypes

Customer ID              object
Count_x                   int64
Gender_x                 object
Age                       int64
Under 30                 object
Senior Citizen_x         object
Married                  object
Dependents_x             object
Number of Dependents      int64
Count_y                   int64
Country                  object
State                    object
City                     object
Zip Code                  int64
Lat Long                 object
Latitude                float64
Longitude               float64
Gender_y                 object
Senior Citizen_y         object
Partner                  object
Dependents_y             object
Tenure Months             int64
Phone Service            object
Multiple Lines           object
Internet Service         object
Online Security          object
Online Backup            object
Device Protection        object
Tech Support             object
Streaming TV             object
Streaming Movies         object
Contract

In [50]:
# Restrict the merged frame to the columns listed below; every other column
# is discarded from `dataset` from this point on.
my_columns = [
    'Gender_x',
    'Age',
    'Married',
    'Number of Dependents',
    'Churn Value',
]

dataset = dataset.loc[:, my_columns]

In [51]:
# turn the categorical variables into dummy (0/1 indicator) variables.
# The dtype is pinned to uint8: the get_dummies default was uint8 before
# pandas 2.0 and is bool since then, so leaving it implicit makes the encoded
# columns depend on the installed pandas version.
dataset = pd.get_dummies(dataset, dtype='uint8')

# check the data types of the columns
dataset.dtypes

Age                     int64
Number of Dependents    int64
Churn Value             int64
Gender_x_Female         uint8
Gender_x_Male           uint8
Married_No              uint8
Married_Yes             uint8
dtype: object

In [53]:
# Keep a single indicator per binary feature: the 'Female' and 'No' dummies are
# redundant with their 'Male' / 'Yes' counterparts.
# errors='ignore' treats each column independently, so the cell is safe to
# re-run and no longer assumes that both columns are present (or absent)
# together, as the previous single-column guard did.
dataset = dataset.drop(
    columns=['Gender_x_Female', 'Married_No'], errors='ignore')


# Give the remaining indicators plain names
# (Gender: 1 = male, 0 = female; Married: 1 = yes, 0 = no).
# rename() skips labels that are not present, and re-binding the result avoids
# mutating the frame in place.
dataset = dataset.rename(
    columns={'Gender_x_Male': 'Gender', 'Married_Yes': 'Married'})

# check the data types of the columns
dataset.dtypes

Age                     int64
Number of Dependents    int64
Churn Value             int64
Gender                  uint8
Married                 uint8
dtype: object

##### Gender: 1 = male, 0 = female

In [54]:
# Preview the first five rows of the encoded dataset
dataset.iloc[:5]

Unnamed: 0,Age,Number of Dependents,Churn Value,Gender,Married
0,78,0,1,1,0
1,74,1,1,0,1
2,71,3,1,1,0
3,78,1,1,0,1
4,80,1,1,0,1


In [34]:
# Both splits use the same settings: hold out 20% of the rows, fixed seed.
split_options = {'test_size': 0.2, 'random_state': 42}

# First split: 20% of the full dataset becomes the test set.
remainder, test = train_test_split(dataset, **split_options)

# Second split: 20% of the remaining rows become the validation set,
# and the rest is the training set.
train, val = train_test_split(remainder, **split_options)