In [2]:
import pandas as pd  # tabular data handling

# Load the customer records from the CSV file into a DataFrame.
data_path = 'logistic_regression_input_data.csv'
df = pd.read_csv(data_path)

# Bare last expression: the notebook renders the (truncated) frame.
df

Unnamed: 0,city,phone_service,multiple_lines,internet_service,internet_type,contract_type,tenure (months),customer_status
0,Frazier Park,Yes,No,Yes,Cable,One Year,9,Stayed
1,Glendale,Yes,Yes,Yes,Cable,Month-to-Month,9,Stayed
2,Costa Mesa,Yes,No,Yes,Fiber Optic,Month-to-Month,4,Churned
3,Martinez,Yes,No,Yes,Fiber Optic,Month-to-Month,13,Churned
4,Camarillo,Yes,No,Yes,Fiber Optic,Month-to-Month,3,Churned
...,...,...,...,...,...,...,...,...
6584,Los Angeles,Yes,Yes,Yes,Fiber Optic,Month-to-Month,1,Churned
6585,Somerset,Yes,Yes,Yes,Fiber Optic,One Year,38,Stayed
6586,Downey,Yes,No,Yes,Fiber Optic,Two Year,48,Churned
6587,Santa Rosa,Yes,No,No,,One Year,13,Stayed


In [3]:
# Encode the target column as integers: 1 = 'Churned', 0 = 'Stayed'.
status_codes = {'Churned': 1, 'Stayed': 0}

# Re-run guard: Series.map() on an already-encoded column finds none of the
# string keys and would silently turn every value into NaN. Skip the mapping
# when the column already holds only the encoded values.
if not df['customer_status'].isin(list(status_codes.values())).all():
    encoded_status = df['customer_status'].map(status_codes)

    # map() yields NaN for any label missing from the mapping (or already missing
    # in the data); fail loudly here instead of passing NaN targets downstream.
    unmapped = df.loc[encoded_status.isna(), 'customer_status'].unique()
    if len(unmapped) > 0:
        raise ValueError(f"Unexpected customer_status values: {list(unmapped)}")

    df['customer_status'] = encoded_status.astype('int64')

df

Unnamed: 0,city,phone_service,multiple_lines,internet_service,internet_type,contract_type,tenure (months),customer_status
0,Frazier Park,Yes,No,Yes,Cable,One Year,9,0
1,Glendale,Yes,Yes,Yes,Cable,Month-to-Month,9,0
2,Costa Mesa,Yes,No,Yes,Fiber Optic,Month-to-Month,4,1
3,Martinez,Yes,No,Yes,Fiber Optic,Month-to-Month,13,1
4,Camarillo,Yes,No,Yes,Fiber Optic,Month-to-Month,3,1
...,...,...,...,...,...,...,...,...
6584,Los Angeles,Yes,Yes,Yes,Fiber Optic,Month-to-Month,1,1
6585,Somerset,Yes,Yes,Yes,Fiber Optic,One Year,38,0
6586,Downey,Yes,No,Yes,Fiber Optic,Two Year,48,1
6587,Santa Rosa,Yes,No,No,,One Year,13,0


In [4]:
# Target vector: the encoded customer_status column.
target_column = 'customer_status'
y = df[target_column]
y

0       0
1       0
2       1
3       1
4       1
       ..
6584    1
6585    0
6586    1
6587    0
6588    1
Name: customer_status, Length: 6589, dtype: int64

In [5]:
# Feature table: every column of df except the target.
X = df.drop(columns='customer_status')

# Column groups, split by how the preprocessing step treats them.
catCol = [
    'city',
    'phone_service',
    'multiple_lines',
    'internet_service',
    'internet_type',
    'contract_type',
]  # categorical features
numCol = ['tenure (months)']  # numerical features

X

Unnamed: 0,city,phone_service,multiple_lines,internet_service,internet_type,contract_type,tenure (months)
0,Frazier Park,Yes,No,Yes,Cable,One Year,9
1,Glendale,Yes,Yes,Yes,Cable,Month-to-Month,9
2,Costa Mesa,Yes,No,Yes,Fiber Optic,Month-to-Month,4
3,Martinez,Yes,No,Yes,Fiber Optic,Month-to-Month,13
4,Camarillo,Yes,No,Yes,Fiber Optic,Month-to-Month,3
...,...,...,...,...,...,...,...
6584,Los Angeles,Yes,Yes,Yes,Fiber Optic,Month-to-Month,1
6585,Somerset,Yes,Yes,Yes,Fiber Optic,One Year,38
6586,Downey,Yes,No,Yes,Fiber Optic,Two Year,48
6587,Santa Rosa,Yes,No,No,,One Year,13


In [6]:
# Preprocessing: one-hot encode the categorical features, keep the numerical feature as-is.
from sklearn.compose import ColumnTransformer  # applies different transformers to different column groups
from sklearn.preprocessing import OneHotEncoder  # turns each category into its own 0/1 column

# Encoder settings:
#   sparse_output=False      -> returns a dense NumPy array instead of a sparse matrix (easier to inspect)
#   handle_unknown='ignore'  -> a category not seen during fitting does not raise at transform time
one_hot = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

# 'passthrough' forwards the numerical columns unchanged.
column_steps = [
    ('cat', one_hot, catCol),
    ('num', 'passthrough', numCol),
]
preprocessor = ColumnTransformer(transformers=column_steps)


In [7]:
from sklearn.base import clone

# Encode the full feature table for inspection only.
# The fit runs on an unfitted copy so that `preprocessor` itself is left untouched:
# with caching disabled (the default) a Pipeline does not copy its steps, so fitting
# the shared object here (e.g. by re-running this cell after training) would
# overwrite the encoder state held inside the trained model.
preview_encoder = clone(preprocessor)
X_encoded = preview_encoder.fit_transform(X)  # encoded features as a NumPy array
feature_names = preview_encoder.get_feature_names_out()  # generated output column names, e.g. 'cat__city_Acampo'
X_encoded  # displayed as a raw array; it is wrapped in a DataFrame in the next cell

array([[ 0.,  0.,  0., ...,  1.,  0.,  9.],
       [ 0.,  0.,  0., ...,  0.,  0.,  9.],
       [ 0.,  0.,  0., ...,  0.,  0.,  4.],
       ...,
       [ 0.,  0.,  0., ...,  0.,  1., 48.],
       [ 0.,  0.,  0., ...,  1.,  0., 13.],
       [ 0.,  0.,  0., ...,  0.,  0., 18.]], shape=(6589, 1120))

In [8]:
# Wrap the encoded array in a DataFrame labelled with the generated feature names.
# Reusing df.index keeps every encoded row aligned with its entry in the target variable.
X_encoded_df = pd.DataFrame(data=X_encoded, index=df.index, columns=feature_names)
X_encoded_df  # displays the encoded feature DataFrame

Unnamed: 0,cat__city_Acampo,cat__city_Acton,cat__city_Adelanto,cat__city_Adin,cat__city_Agoura Hills,cat__city_Aguanga,cat__city_Ahwahnee,cat__city_Alameda,cat__city_Alamo,cat__city_Albany,...,cat__internet_service_No,cat__internet_service_Yes,cat__internet_type_Cable,cat__internet_type_DSL,cat__internet_type_Fiber Optic,cat__internet_type_nan,cat__contract_type_Month-to-Month,cat__contract_type_One Year,cat__contract_type_Two Year,num__tenure (months)
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,9.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,9.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,4.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,13.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0
6585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,38.0
6586,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,48.0
6587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,13.0


In [9]:
# Put the target (y) next to the encoded features (X) in a single DataFrame.
frames = [X_encoded_df, y]
full_encoded_df = pd.concat(frames, axis='columns')
full_encoded_df  # displays the combined DataFrame

Unnamed: 0,cat__city_Acampo,cat__city_Acton,cat__city_Adelanto,cat__city_Adin,cat__city_Agoura Hills,cat__city_Aguanga,cat__city_Ahwahnee,cat__city_Alameda,cat__city_Alamo,cat__city_Albany,...,cat__internet_service_Yes,cat__internet_type_Cable,cat__internet_type_DSL,cat__internet_type_Fiber Optic,cat__internet_type_nan,cat__contract_type_Month-to-Month,cat__contract_type_One Year,cat__contract_type_Two Year,num__tenure (months),customer_status
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,9.0,0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,9.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,4.0,1
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,13.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,3.0,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6584,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,1.0,1
6585,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,1.0,0.0,38.0,0
6586,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,1.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,48.0,1
6587,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,13.0,0


In [10]:
# Assemble the model as one pipeline: encoding followed by classification.
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

# Step 1 applies the one-hot preprocessing to X;
# step 2 trains a logistic regression model on the encoded output.
classifier = LogisticRegression(max_iter=1000)
pipeline_steps = [
    ('preprocessor', preprocessor),
    ('classifier', classifier),
]
model = Pipeline(steps=pipeline_steps)

In [11]:
# Hold out a test set, train the pipeline, and predict on the held-out rows.
from sklearn.model_selection import train_test_split

# Split the raw features and target into training and testing sets:
#   test_size=0.20   -> 20% of the rows are used for testing
#   random_state=42  -> reproducible split (the same rows every run)
#   stratify=y       -> training and testing sets keep the same proportion of y labels
split_parts = train_test_split(X, y, stratify=y, test_size=0.20, random_state=42)
X_train, X_test, y_train, y_test = split_parts

# fit() returns the fitted pipeline, so training and prediction can be chained:
# train on the training rows, then predict the target for the test rows.
y_pred = model.fit(X_train, y_train).predict(X_test)

In [12]:
from sklearn.metrics import classification_report, confusion_matrix

# Confusion matrix: counts of true negatives, false positives, false negatives, and true positives.
cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:\n", cm)

# Classification report: precision, recall, f1-score, and support for each class.
report = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report)

Confusion Matrix:
 [[821 123]
 [126 248]]

Classification Report:
               precision    recall  f1-score   support

           0       0.87      0.87      0.87       944
           1       0.67      0.66      0.67       374

    accuracy                           0.81      1318
   macro avg       0.77      0.77      0.77      1318
weighted avg       0.81      0.81      0.81      1318



In [13]:
# Hypothetical customer used to demonstrate predicting the probability of staying vs. leaving.
sample_profile = {
    'city': 'San Jose',
    'phone_service': 'Yes',
    'multiple_lines': 'No',
    'internet_service': 'Yes',
    'internet_type': 'Cable',
    'contract_type': 'One Year',
    'tenure (months)': 20,
}

# A one-element list of records yields a single-row DataFrame.
new_customer = pd.DataFrame([sample_profile])
new_customer

Unnamed: 0,city,phone_service,multiple_lines,internet_service,internet_type,contract_type,tenure (months)
0,San Jose,Yes,No,Yes,Cable,One Year,20


In [14]:
# Class probabilities for the sample customer, as a NumPy array with one row per customer.
# Column 0 is class 0 ('Stayed') and column 1 is class 1 ('Churned'), per the target encoding.
result = model.predict_proba(new_customer)
result

array([[0.88235978, 0.11764022]])

In [15]:
# Row 0 is the single sample customer; column 0 is class 0, i.e. 'Stayed'.
stay_probability = float(result[0, 0])
print("Predicted Probability of staying:", round(stay_probability, 2))

Predicted Probability of staying: 0.88


In [16]:
# Row 0 is the single sample customer; column 1 is class 1, i.e. 'Churned'.
leave_probability = float(result[0, 1])
print("Predicted Probability of leaving:", round(leave_probability, 2))

Predicted Probability of leaving: 0.12
