In [12]:
# preprocessing of a dataset

import pandas as pd
import numpy as np


df=pd.read_csv("data1.csv")
# df.shape
df.columns
# df.isnull().sum()
# df.dropna(inplace=True)
# df.duplicated().sum()

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [13]:
# creating input and output data

features = df.drop(["Churn","customerID"], axis=1)

target = df["Churn"]
# features.columns

In [14]:
# null values
features.isnull().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
dtype: int64

In [15]:
# duplicate values
features.duplicated().sum()

40

In [16]:
# handling duplicate values
features.drop_duplicates(inplace=True)
features.duplicated().sum()

0

In [17]:
# replacing space in float type columns
features["TotalCharges"]=features["TotalCharges"].replace(" ",np.nan)

In [18]:
# Label Encoding the target variable
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
target = le.fit_transform(target)

In [19]:
#   datatype of features
features.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges         object
dtype: object

In [20]:
#  converting object datatype to category
categorical_columns=[]
for col in features.columns:
    if features[col].dtype != 'int64':
        categorical_columns.append(col)

# converting object datatype to category
for col in categorical_columns:
    features[col] = features[col].astype('category')

features.dtypes

gender              category
SeniorCitizen          int64
Partner             category
Dependents          category
tenure                 int64
PhoneService        category
MultipleLines       category
InternetService     category
OnlineSecurity      category
OnlineBackup        category
DeviceProtection    category
TechSupport         category
StreamingTV         category
StreamingMovies     category
Contract            category
PaperlessBilling    category
PaymentMethod       category
MonthlyCharges      category
TotalCharges        category
dtype: object

In [21]:
filter_col=list(features.select_dtypes(exclude=['category']).columns)
filter_col

['SeniorCitizen', 'tenure']

In [22]:
# one_hot encoding
features = pd.get_dummies(features)
features.dtypes

SeniorCitizen          int64
tenure                 int64
gender_Female          uint8
gender_Male            uint8
Partner_No             uint8
                       ...  
TotalCharges_997.75    uint8
TotalCharges_998.1     uint8
TotalCharges_999.45    uint8
TotalCharges_999.8     uint8
TotalCharges_999.9     uint8
Length: 8158, dtype: object

In [23]:
# scaling the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
for col in filter_col:
    features[col] = scaler.fit_transform(features[[col]])

# features.head(10)