# Data Preprocessing

In [91]:
# preprocessing of a dataset

import pandas as pd
import numpy as np


df=pd.read_csv("data1.csv")
# df.shape
df.columns
# df.isnull().sum()
# df.dropna(inplace=True)
# df.duplicated().sum()

Index(['customerID', 'gender', 'SeniorCitizen', 'Partner', 'Dependents',
       'tenure', 'PhoneService', 'MultipleLines', 'InternetService',
       'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport',
       'StreamingTV', 'StreamingMovies', 'Contract', 'PaperlessBilling',
       'PaymentMethod', 'MonthlyCharges', 'TotalCharges', 'Churn'],
      dtype='object')

In [92]:
# creating input and output data

features = df.drop(["Churn","customerID"], axis=1)

target = df["Churn"]
# features.columns

In [93]:
# null values
features.isnull().sum()

gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
dtype: int64

In [94]:
# duplicate values
features.duplicated().sum()

40

In [95]:
# handling duplicate values
features.drop_duplicates(inplace=True)
features.duplicated().sum()

0

In [96]:
# replacing space in float type columns
features["TotalCharges"]=features["TotalCharges"].replace(" ",np.nan)

# features.isnull().sum()
features["TotalCharges"]=features["TotalCharges"].astype(float)

In [97]:
features.fillna(features["TotalCharges"].mean(),inplace=True)

In [98]:
# Label Encoding the target variable
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
target = le.fit_transform(target)

In [99]:
#   datatype of features
features.dtypes

gender               object
SeniorCitizen         int64
Partner              object
Dependents           object
tenure                int64
PhoneService         object
MultipleLines        object
InternetService      object
OnlineSecurity       object
OnlineBackup         object
DeviceProtection     object
TechSupport          object
StreamingTV          object
StreamingMovies      object
Contract             object
PaperlessBilling     object
PaymentMethod        object
MonthlyCharges      float64
TotalCharges        float64
dtype: object

In [100]:
#  converting object datatype to category
categorical_columns=[]
for col in features.columns:
    if features[col].dtype == 'object':
        categorical_columns.append(col)

# converting object datatype to category
for col in categorical_columns:
    features[col] = features[col].astype('category')

features.dtypes

gender              category
SeniorCitizen          int64
Partner             category
Dependents          category
tenure                 int64
PhoneService        category
MultipleLines       category
InternetService     category
OnlineSecurity      category
OnlineBackup        category
DeviceProtection    category
TechSupport         category
StreamingTV         category
StreamingMovies     category
Contract            category
PaperlessBilling    category
PaymentMethod       category
MonthlyCharges       float64
TotalCharges         float64
dtype: object

In [101]:
filter_col=list(features.select_dtypes(exclude=['category']).columns)
filter_col

['SeniorCitizen', 'tenure', 'MonthlyCharges', 'TotalCharges']

In [102]:
# one_hot encoding
features = pd.get_dummies(features)
features.dtypes

SeniorCitizen                                int64
tenure                                       int64
MonthlyCharges                             float64
TotalCharges                               float64
gender_Female                                uint8
gender_Male                                  uint8
Partner_No                                   uint8
Partner_Yes                                  uint8
Dependents_No                                uint8
Dependents_Yes                               uint8
PhoneService_No                              uint8
PhoneService_Yes                             uint8
MultipleLines_No                             uint8
MultipleLines_No phone service               uint8
MultipleLines_Yes                            uint8
InternetService_DSL                          uint8
InternetService_Fiber optic                  uint8
InternetService_No                           uint8
OnlineSecurity_No                            uint8
OnlineSecurity_No internet serv

In [103]:
# scaling the data
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
for col in filter_col:
    features[col] = scaler.fit_transform(features[[col]])

# features.head(10)

# Bagging

In [104]:
# ## Importing Libraries
from sklearn.utils import resample
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score


In [105]:
# Ensure features and target are pandas objects
features = pd.DataFrame(features)  # if necessary
target = pd.Series(target)         # if necessary


models=[]
accuracies=[]
# Now you can use .iloc as before
for i in range(9):
    indices = resample(range(len(features)), replace=True, n_samples=int(0.8 * len(features)))

    sampled_features = features.iloc[indices]
    sampled_target = target.iloc[indices]

    X_train, X_test, y_train, y_test = train_test_split(sampled_features, sampled_target, test_size=0.2)

    model = LogisticRegression()
    model.fit(X_train, y_train)
    models.append(model)

    y_pred = model.predict(X_test)
    accuracies.append(accuracy_score(y_test, y_pred))

average_accuracy = sum(accuracies) / len(accuracies)
average_accuracy


0.7257409059371593