In [94]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import TomekLinks



In [95]:
# Path to the churn CSV, resolved relative to the notebook's working directory.
file_path = "customer_churn.csv"
# Read the whole file into a DataFrame using read_csv's default parsing options.
data = pd.read_csv(filepath_or_buffer=file_path)

In [96]:
# Plain-text preview of the first five rows (n=5 is head()'s default).
print(data.head(n=5))

   customerID  gender  SeniorCitizen Partner Dependents  tenure PhoneService   
0  7590-VHVEG  Female              0     Yes         No       1           No  \
1  5575-GNVDE    Male              0      No         No      34          Yes   
2  3668-QPYBK    Male              0      No         No       2          Yes   
3  7795-CFOCW    Male              0      No         No      45           No   
4  9237-HQITU  Female              0      No         No       2          Yes   

      MultipleLines InternetService OnlineSecurity  ... DeviceProtection   
0  No phone service             DSL             No  ...               No  \
1                No             DSL            Yes  ...              Yes   
2                No             DSL            Yes  ...               No   
3  No phone service             DSL            Yes  ...              Yes   
4                No     Fiber optic             No  ...               No   

  TechSupport StreamingTV StreamingMovies        Contract Pape

In [97]:
# DataFrame.info() writes its summary (index range, per-column non-null counts,
# dtypes) directly to stdout and returns None. Calling it bare avoids the stray
# "None" line that wrapping it in print() would append to the output.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7043 entries, 0 to 7042
Data columns (total 21 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   customerID        7043 non-null   object 
 1   gender            7043 non-null   object 
 2   SeniorCitizen     7043 non-null   int64  
 3   Partner           7043 non-null   object 
 4   Dependents        7043 non-null   object 
 5   tenure            7043 non-null   int64  
 6   PhoneService      7043 non-null   object 
 7   MultipleLines     7043 non-null   object 
 8   InternetService   7043 non-null   object 
 9   OnlineSecurity    7043 non-null   object 
 10  OnlineBackup      7043 non-null   object 
 11  DeviceProtection  7043 non-null   object 
 12  TechSupport       7043 non-null   object 
 13  StreamingTV       7043 non-null   object 
 14  StreamingMovies   7043 non-null   object 
 15  Contract          7043 non-null   object 
 16  PaperlessBilling  7043 non-null   object 


In [98]:
# Summary statistics from describe() with its default arguments, shown as plain text.
numeric_summary = data.describe()
print(numeric_summary)

       SeniorCitizen       tenure  MonthlyCharges
count    7043.000000  7043.000000     7043.000000
mean        0.162147    32.371149       64.761692
std         0.368612    24.559481       30.090047
min         0.000000     0.000000       18.250000
25%         0.000000     9.000000       35.500000
50%         0.000000    29.000000       70.350000
75%         0.000000    55.000000       89.850000
max         1.000000    72.000000      118.750000


SeniorCitizen:

This column represents whether the customer is a senior citizen (1) or not (0).
The dataset contains 7,043 entries.
On average, around 16% of customers are senior citizens.
The standard deviation is approximately 0.37, indicating a moderate amount of variability.

The minimum value is 0 (not a senior citizen), and the maximum value is 1 (senior citizen).

tenure:

This column represents the number of months the customer has been with the company.
The dataset contains 7,043 entries.
On average, customers stay for around 32 months.
The standard deviation is approximately 24.56, indicating a wide range of variation.

The minimum tenure is 0 months, and the maximum is 72 months.
25% of customers have a tenure of 9 months or less, while 75% have a tenure of 55 months or less.

MonthlyCharges:

This column represents the monthly amount charged to the customer.
The dataset contains 7,043 entries.
The average monthly charge is around 64.76.
The standard deviation is approximately 30.09, indicating a considerable amount of variability.
The minimum monthly charge is 18.25, and the maximum is 118.75.
25% of customers are charged 35.5 or less per month, while 75% are charged 89.85 or less.

These statistics give us an overview of the distribution and range of values for each of these numerical variables in the dataset.


In [99]:
# Per-column count of missing values; isna() is the method that isnull() aliases.
print(data.isna().sum(axis=0))

customerID          0
gender              0
SeniorCitizen       0
Partner             0
Dependents          0
tenure              0
PhoneService        0
MultipleLines       0
InternetService     0
OnlineSecurity      0
OnlineBackup        0
DeviceProtection    0
TechSupport         0
StreamingTV         0
StreamingMovies     0
Contract            0
PaperlessBilling    0
PaymentMethod       0
MonthlyCharges      0
TotalCharges        0
Churn               0
dtype: int64


In [100]:
# List the distinct values found in every object-dtype column.
object_columns = data.select_dtypes(include=["object"]).columns
for column in object_columns:
    distinct_values = data[column].unique()
    print(f"Unique values in {column}: {distinct_values}")

Unique values in customerID: ['7590-VHVEG' '5575-GNVDE' '3668-QPYBK' ... '4801-JZAZL' '8361-LTMKD'
 '3186-AJIEK']
Unique values in gender: ['Female' 'Male']
Unique values in Partner: ['Yes' 'No']
Unique values in Dependents: ['No' 'Yes']
Unique values in PhoneService: ['No' 'Yes']
Unique values in MultipleLines: ['No phone service' 'No' 'Yes']
Unique values in InternetService: ['DSL' 'Fiber optic' 'No']
Unique values in OnlineSecurity: ['No' 'Yes' 'No internet service']
Unique values in OnlineBackup: ['Yes' 'No' 'No internet service']
Unique values in DeviceProtection: ['No' 'Yes' 'No internet service']
Unique values in TechSupport: ['No' 'Yes' 'No internet service']
Unique values in StreamingTV: ['No' 'Yes' 'No internet service']
Unique values in StreamingMovies: ['No' 'Yes' 'No internet service']
Unique values in Contract: ['Month-to-month' 'One year' 'Two year']
Unique values in PaperlessBilling: ['Yes' 'No']
Unique values in PaymentMethod: ['Electronic check' 'Mailed check' 'Bank t

In [101]:
# Class balance of the target: number of rows per Churn label.
churn_counts = data["Churn"].value_counts()
print(churn_counts)

Churn
No     5174
Yes    1869
Name: count, dtype: int64


The "Churn" column indicates whether customers have churned or not from the service. The count shows:

5,174 customers who did not churn (labeled as "No").
1,869 customers who did churn (labeled as "Yes").
This gives an overview of the class distribution, where there are more customers who did not churn compared to those who did.


_____________

We are planning to build a logistic regression model to predict the "Churn" variable using the independent variables "tenure", "SeniorCitizen", and "MonthlyCharges". Logistic regression is commonly used for binary classification tasks like predicting whether a customer will churn ("Yes") or not ("No").

In [102]:
# Target labels: the Churn column.
y = data["Churn"]
# Predictors: the three columns the model is trained on, in this order.
independent_vars = ["tenure", "SeniorCitizen", "MonthlyCharges"]
X = data.loc[:, independent_vars]

In [103]:
# Hold out 20% of the rows for evaluation; the fixed seed makes the split
# reproducible. No `stratify` argument is passed, so the split is not stratified.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


In [104]:
# Learn the scaling statistics from the training rows only, then apply the
# same fitted scaler to both splits so the test rows never influence the fit.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [105]:
# Logistic regression built with the constructor's default hyperparameters.
logreg_model = LogisticRegression()
# Train on the scaled training split; left as the cell's final expression.
logreg_model.fit(X=X_train_scaled, y=y_train)


In [106]:
# Predicted Churn labels for the held-out (scaled) test rows.
y_pred = logreg_model.predict(X=X_test_scaled)

In [107]:
# Per-class precision/recall/F1 as a formatted text report.
classification_rep = classification_report(y_true=y_test, y_pred=y_pred)
# Fraction of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

In [108]:
# Accuracy rounded to two decimals.
print(f"Accuracy: {accuracy:.2f}")
# print() joins its two arguments with a single space (the default separator),
# which is why the report's first line starts one column in.
print("Classification Report:\n", classification_rep, sep=" ")

Accuracy: 0.81
Classification Report:
               precision    recall  f1-score   support

          No       0.83      0.92      0.88      1036
         Yes       0.70      0.49      0.57       373

    accuracy                           0.81      1409
   macro avg       0.76      0.70      0.72      1409
weighted avg       0.80      0.81      0.80      1409



Accuracy: 0.81

The model's overall accuracy is 81%, indicating the proportion of correct predictions over the total number of predictions.

Classification Report:

Precision: For customers labeled as "Yes" (churned), the model's precision is 70%. For customers labeled as "No" (not churned), the precision is 83%. Precision measures the proportion of correct positive predictions among all positive predictions.

Recall: The model's recall for customers labeled as "Yes" (churned) is 49%, and for customers labeled as "No" (not churned), it's 92%. Recall measures the proportion of actual positive cases that were correctly predicted.

F1-score: The F1-score for "Yes" (churned) is 0.57, and for "No" (not churned), it's 0.88. The F1-score is the harmonic mean of precision and recall and gives a balanced measure of a model's accuracy.

Support: The number of instances for each class in the testing set is 373 for "Yes" (churned) and 1036 for "No" (not churned).

Macro Avg: The macro average F1-score is 0.72, and the macro average considers each class equally, providing a general assessment of the model's performance.

Weighted Avg: The weighted average F1-score is 0.80. The weighted average weights each class's score by its support (number of test instances), so it is dominated by the majority class ("No") and sits closer to the overall accuracy than the macro average does.

In summary, the model has an accuracy of 81%, but it performs better at identifying customers who didn't churn ("No") compared to those who did ("Yes"). The precision, recall, and F1-score provide a more detailed view of the model's performance across different classes.

In [109]:
# Keep a separate handle on the Churn column for inspection.
target_variable = data.loc[:, "Churn"]

In [110]:
# Print the Churn target Series as plain text.
print(target_variable)

0        No
1        No
2       Yes
3        No
4       Yes
       ... 
7038     No
7039     No
7040     No
7041    Yes
7042     No
Name: Churn, Length: 7043, dtype: object


The target variable "Churn" represents whether a customer has churned from the service. The results provided are a list of binary labels for each customer in the dataset:

"No": This label indicates that the customer did not churn from the service.

"Yes": This label indicates that the customer did churn from the service.

The list of labels shows the churn status for each of the 7,043 customers in the dataset. This variable is used as the ground truth for training and evaluating the predictive model.

In [111]:
# Select the predictor columns (same three features, same order as before).
independent_vars = ["tenure", "SeniorCitizen", "MonthlyCharges"]
X = data.loc[:, independent_vars]

In [112]:
# Fit the scaler on every row of X (no train/test split is applied in this
# cell) and produce the standardized feature matrix.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [113]:
# Print the scaled feature matrix as plain text: one row per customer, one
# column per entry of independent_vars.
print(X_scaled)

[[-1.27744458 -0.43991649 -1.16032292]
 [ 0.06632742 -0.43991649 -0.25962894]
 [-1.23672422 -0.43991649 -0.36266036]
 ...
 [-0.87024095 -0.43991649 -1.1686319 ]
 [-1.15528349  2.27315869  0.32033821]
 [ 1.36937906 -0.43991649  1.35896134]]


The results show the scaled values of the independent variables for each customer:

Scaling makes the values easier for the model to work with by putting them on a similar scale and adjusting for differences between variables. Each row in the results corresponds to a customer, and each column represents one of these variables.

Second regression model

In [114]:
# Target labels: the Churn column.
y = data["Churn"]
# Predictors: the three columns the model is trained on, in this order.
independent_vars = ["tenure", "SeniorCitizen", "MonthlyCharges"]
X = data.loc[:, independent_vars]

In [115]:
# 80/20 split with a fixed seed for reproducibility (no stratification requested).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [116]:
# Fit the scaler on the training split only and reuse it for the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)


In [117]:
# Fresh logistic regression with default hyperparameters.
logreg_model = LogisticRegression()
# Train on the scaled training split; left as the cell's final expression.
logreg_model.fit(X=X_train_scaled, y=y_train)

In [118]:
# Predicted Churn labels for the scaled test rows.
y_pred = logreg_model.predict(X=X_test_scaled)

In [119]:
# Share of test rows predicted correctly, reported to two decimals.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print(f"Model Accuracy: {accuracy:.2f}")

Model Accuracy: 0.81


An accuracy of 0.81 means that the model's predictions match the actual outcomes for around 81% of the customers regarding whether they churned or not. It's a good sign that the model is performing reasonably well in predicting customer churn.

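Even a simple model can provide more than 70% accuracy here. Part of the reason is the class distribution: 5,174 of the 7,043 customers (about 73%) did not churn, so a model that always predicts "No" would already be correct about 73% of the time. The logistic regression improves on that baseline because the dataset has inherent patterns that a basic model can capture: when the classes are not extremely imbalanced and there are some distinguishing features between customers who churn and those who don't, a simple model can learn and generalize reasonably well.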

It's important to note that achieving a high accuracy doesn't necessarily mean the model is perfect. It might miss some important nuances and struggle with predicting the minority class accurately. In imbalanced datasets, it's crucial to consider other metrics like precision, recall, and the F1-score to understand the model's performance more comprehensively, especially in the context of customer churn prediction.

In [120]:
# Target labels: the Churn column.
y = data["Churn"]
# Predictors: the three columns the model is trained on, in this order.
independent_vars = ["tenure", "SeniorCitizen", "MonthlyCharges"]
X = data.loc[:, independent_vars]


In [121]:
# 80/20 split with a fixed seed for reproducibility (no stratification requested).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [122]:
# Fit the scaler on the training split only and reuse it for the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [123]:
# SMOTE (from imblearn.over_sampling) with a fixed seed. Only the scaled
# training split is resampled; the test split is not passed in.
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(
    X=X_train_scaled,
    y=y_train,
)


In [124]:
# Fresh logistic regression with default hyperparameters.
logreg_model = LogisticRegression()
# Train on the resampled training data; left as the cell's final expression.
logreg_model.fit(X=X_train_resampled, y=y_train_resampled)


In [125]:
# Predict on the scaled test rows, which were not resampled.
y_pred = logreg_model.predict(X=X_test_scaled)


In [126]:
# Per-class precision/recall/F1 as a formatted text report.
classification_rep = classification_report(y_true=y_test, y_pred=y_pred)
# Fraction of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

In [127]:
# Accuracy rounded to two decimals.
print(f"Accuracy with SMOTE: {accuracy:.2f}")
# print() joins its two arguments with a single space (the default separator),
# which is why the report's first line starts one column in.
print("Classification Report with SMOTE:\n", classification_rep, sep=" ")

Accuracy with SMOTE: 0.75
Classification Report with SMOTE:
               precision    recall  f1-score   support

          No       0.90      0.74      0.81      1036
         Yes       0.51      0.77      0.62       373

    accuracy                           0.75      1409
   macro avg       0.71      0.75      0.71      1409
weighted avg       0.80      0.75      0.76      1409



After applying SMOTE:

Accuracy: 75%
Precision: Decreased for churned customers ("Yes", 0.70 → 0.51) but increased for non-churned ("No", 0.83 → 0.90).
Recall: Improved for churned customers ("Yes", 0.49 → 0.77) and decreased for non-churned ("No", 0.92 → 0.74).
F1-score: Increased for churned customers ("Yes", 0.57 → 0.62) but decreased for non-churned ("No", 0.88 → 0.81).
Overall, SMOTE improved the model's ability to identify churned customers (higher recall), at the cost of lower precision for the "Yes" class, lower recall for the "No" class, and lower overall accuracy.

Without SMOTE (Original Model):

Accuracy: 81%
Precision for "Yes" (churned): 70%
Recall for "Yes" (churned): 49%
F1-score for "Yes" (churned): 57%


With SMOTE:

Accuracy: 75%
Precision for "Yes" (churned): 51%
Recall for "Yes" (churned): 77%
F1-score for "Yes" (churned): 62%
Comparing the two scenarios:

The original model without SMOTE has higher accuracy and higher precision for the "Yes" class.
The model with SMOTE has higher recall and a higher F1-score for the "Yes" class.
Both approaches have their trade-offs: the original model focuses more on predicting the majority class ("No" churn), while the model with SMOTE performs better in identifying churned customers ("Yes"). The choice between these approaches depends on the specific goals of your analysis and the trade-offs you're willing to make between different performance metrics.

In [128]:
# Target labels: the Churn column.
y = data["Churn"]
# Predictors: the three columns the model is trained on, in this order.
independent_vars = ["tenure", "SeniorCitizen", "MonthlyCharges"]
X = data.loc[:, independent_vars]

In [129]:
# 80/20 split with a fixed seed for reproducibility (no stratification requested).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [130]:
# Fit the scaler on the training split only and reuse it for the test split.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [131]:
# TomekLinks (from imblearn.under_sampling) with default settings. Only the
# scaled training split is resampled; the test split is not passed in.
tomek_links = TomekLinks()
X_train_resampled, y_train_resampled = tomek_links.fit_resample(
    X=X_train_scaled,
    y=y_train,
)


In [132]:
# Fresh logistic regression with default hyperparameters.
logreg_model = LogisticRegression()
# Train on the resampled training data; left as the cell's final expression.
logreg_model.fit(X=X_train_resampled, y=y_train_resampled)

In [133]:
# Predict on the scaled test rows, which were not resampled.
y_pred = logreg_model.predict(X=X_test_scaled)

In [134]:
# Per-class precision/recall/F1 as a formatted text report.
classification_rep = classification_report(y_true=y_test, y_pred=y_pred)
# Fraction of test rows whose predicted label matches the true label.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

In [135]:
# Accuracy rounded to two decimals.
print(f"Accuracy with TomekLinks: {accuracy:.2f}")
# print() joins its two arguments with a single space (the default separator),
# which is why the report's first line starts one column in.
print("Classification Report with TomekLinks:\n", classification_rep, sep=" ")

Accuracy with TomekLinks: 0.79
Classification Report with TomekLinks:
               precision    recall  f1-score   support

          No       0.84      0.88      0.86      1036
         Yes       0.62      0.55      0.58       373

    accuracy                           0.79      1409
   macro avg       0.73      0.72      0.72      1409
weighted avg       0.79      0.79      0.79      1409



Conclusion:

Applying different techniques like SMOTE and TomekLinks can have varying effects on the model's performance. In this case, using TomekLinks undersampling modestly improved the model's ability to find the minority class ("Yes" churned customers), raising its recall from 0.49 to 0.55 compared with the original model. However, it also lowered the precision for the "Yes" class (0.70 to 0.62), the recall for the majority class ("No" non-churned customers, 0.92 to 0.88), and the overall accuracy (0.81 to 0.79). The choice of technique depends on the specific goals and trade-offs in your churn prediction problem.
