# ****Telco Customer Churn Project****

In [1]:
import numpy as np 
import pandas as pd 
from sklearn.model_selection import train_test_split




# Splitting Data

In [2]:
#Dropping Unnessescary Columns
df= pd.read_csv('/kaggle/input/telco-customer-churn/WA_Fn-UseC_-Telco-Customer-Churn.csv')
df = df.drop(columns=['gender','PhoneService', 'MultipleLines'])
#Replacing Churn Yes or No with True or False which can be converted to a float value
df = df.replace({'Churn': {'Yes': True, 
                                'No': False}})
df

Unnamed: 0,customerID,SeniorCitizen,Partner,Dependents,tenure,InternetService,OnlineSecurity,OnlineBackup,DeviceProtection,TechSupport,StreamingTV,StreamingMovies,Contract,PaperlessBilling,PaymentMethod,MonthlyCharges,TotalCharges,Churn
0,7590-VHVEG,0,Yes,No,1,DSL,No,Yes,No,No,No,No,Month-to-month,Yes,Electronic check,29.85,29.85,False
1,5575-GNVDE,0,No,No,34,DSL,Yes,No,Yes,No,No,No,One year,No,Mailed check,56.95,1889.5,False
2,3668-QPYBK,0,No,No,2,DSL,Yes,Yes,No,No,No,No,Month-to-month,Yes,Mailed check,53.85,108.15,True
3,7795-CFOCW,0,No,No,45,DSL,Yes,No,Yes,Yes,No,No,One year,No,Bank transfer (automatic),42.30,1840.75,False
4,9237-HQITU,0,No,No,2,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Electronic check,70.70,151.65,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7038,6840-RESVB,0,Yes,Yes,24,DSL,Yes,No,Yes,Yes,Yes,Yes,One year,Yes,Mailed check,84.80,1990.5,False
7039,2234-XADUH,0,Yes,Yes,72,Fiber optic,No,Yes,Yes,No,Yes,Yes,One year,Yes,Credit card (automatic),103.20,7362.9,False
7040,4801-JZAZL,0,Yes,Yes,11,DSL,Yes,No,No,No,No,No,Month-to-month,Yes,Electronic check,29.60,346.45,False
7041,8361-LTMKD,1,Yes,No,4,Fiber optic,No,No,No,No,No,No,Month-to-month,Yes,Mailed check,74.40,306.6,True


In [3]:
#Features
X = df.drop(columns='Churn')
#Label
y = df['Churn']
#Changing Total Charges to a Numerical Column 


In [4]:
#Training and Validation data split
X_train, X_valid , y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2)


# Preproccessing Data

When **preprocessing data**, we have two different types of columns: **Numerical columns** and **Categorical Columns**. Seperating between them is important because we need to transform them in different ways. Numerical Columns have a dtype of **float** and **contain float values** while Categorical columns have a dtype of **Object** and usually **contain text values**.

In [5]:
#Identifying the categorical columns
categorical_cols = [cname for cname in X_train.columns if X_train[cname].nunique() < 10 and 
                        X_train[cname].dtype == "object"]
#Identifying the numerical columns
numerical_cols = [cname for cname in X_train.columns if X_train[cname].dtype in ['int64', 'float64']]
#Converting the the Total Charges column to a float column as it is represented incorrectly. 
#If you try this without getting rid of values that don't exist, you will run into an error because you cannot convert a NA value into a float
X['TotalCharges']= X['TotalCharges'].apply(lambda x: x if x!= ' ' else np.nan).astype(float)


In [6]:
X[categorical_cols].nunique()


Partner             2
Dependents          2
InternetService     3
OnlineSecurity      3
OnlineBackup        3
DeviceProtection    3
TechSupport         3
StreamingTV         3
StreamingMovies     3
Contract            3
PaperlessBilling    2
PaymentMethod       4
dtype: int64

# More Preproccessing

In [7]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder

For our **Numerical Columns**, all we need is an **Imputer** which replaces missing values with a certain value instead of dropping the row completely which may removing meaning from the dataset. In this case, we are using the **mean** of all the different possible values. 

For our **Categorical Columns** we are using a **Pipeline** which consists of an Imputer and a **One Hot Encoder**. The Imputer does the same thing as it does with the **Numerical columns** but uses the **most frequent value** in that specific column instead of the mean value. The One Hot Encoder completely transforms a dataset. More information on this here:https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.OneHotEncoder.html

In [8]:
#Creating a transformer for the Numerical Columns
numerical_transformer = SimpleImputer(strategy='mean')


#Creating a transformer for the Categorical Columns
cat_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])



Lets go ahead and use a **Column Transformer** to apply our transformers onto our columns

This approach will not directly change our data but instead will be added in a Pipeline with our model that we use to make predictions. This way, we can have data that has had nothing done to it and still have a working model that will preproccess the data for us.

In [9]:
#Applying the transformers on to our columns
preprocessor = ColumnTransformer(
       transformers=[
        ('num', numerical_transformer, numerical_cols),
        ('cat', cat_transformer, categorical_cols)
    ])

In [10]:
from xgboost import XGBRegressor
from sklearn.metrics import mean_absolute_error


# Creating A Model Pipeline

In [11]:
#Creating our model
model = XGBRegressor(n_estimators=1000, learning_rate=0.05)
#Creating our Pipeline which preprocceses our data before passing it into our model 
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                             ])
my_pipeline

In [12]:
#Training our model with our raw data  
my_pipeline.fit(X_train, y_train)


# Making Predictions and Checking Accuracy

In [13]:
preds = my_pipeline.predict(X_valid)
mae = mean_absolute_error(preds,y_valid) 
mae

0.28224635