In [None]:
# Import libraries
import warnings

import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns
from matplotlib.colors import Normalize

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor, AdaBoostRegressor
from sklearn.ensemble import GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso
from sklearn.svm import SVC
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, roc_curve, RocCurveDisplay, ConfusionMatrixDisplay
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Set up warning settings
warnings.filterwarnings('ignore')

# Set pandas and matplotlib settings
pd.set_option('display.max_columns', None)
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

In [None]:
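# Import data
# NOTE: `data` is only defined once the line below is uncommented and pointed at a real CSV file;
# the cells that follow read `data` and will raise NameError until then.
# data = pd.read_csv('Your_File_Name.csv')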

In [None]:
# Conduct Exploratory Data Analysis (EDA)
# Prints shape, summary statistics, null counts, dtypes and unique counts of `data`,
# then draws one bar chart per categorical column.
num_records, num_columns = data.shape
print("Number of records:", num_records)
print("Number of columns:", num_columns)

# descriptive statistics
# make sure numerical columns are typed appropriately;
# errors='coerce' turns unparsable entries into NaN, which then appear in the null counts below
numeric_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
for col in numeric_cols:
    data[col] = pd.to_numeric(data[col], errors='coerce')
# print explicitly: a bare expression in the middle of a cell is evaluated but never displayed
print("Descriptive Statistics:")
print(data.describe())

# get null counts
null_counts = data.isnull().sum()
print("Number of Null Values in Each Column:")
print(null_counts)

# show data types for each column
print("Data Types for Each Column:")
data_types = data.dtypes
print(data_types)

# Show unique counts
print("Unique Records for Each Column:")
unique_counts = data.nunique()
print(unique_counts)

# combine results into df (one row per column of `data`)
summary_df = pd.concat([data_types, unique_counts, null_counts], axis=1)
summary_df.columns = ['Data Types', 'Unique Counts', 'Null Counts']

# print combined df
print(summary_df)

# set categorical color palette
palette = sns.color_palette("Set2")

# set categorical columns
categorical_cols = ['SeniorCitizen', 'gender', 'Partner', 'Dependents', 'PhoneService', 'MultipleLines', 'InternetService',
                    'OnlineSecurity', 'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
                    'StreamingMovies', 'Contract', 'PaperlessBilling', 'PaymentMethod', 'Churn']

# plot bar chart for each categorical variable (one small figure per column)
for col in categorical_cols:
    plt.figure(figsize=(4, 3))
    data[col].value_counts().plot(kind='bar', color=palette)
    plt.title('Distribution of ' + col)
    plt.xlabel(col)
    plt.ylabel('Frequency')
    plt.xticks(rotation=0)
    plt.show()


In [None]:
# Preprocessing
# Encodes the categorical columns of `data`, derives two extra features and
# plots how every numeric column correlates with the encoded 'Churn' column.

# label encode and one hot encode categorical features
# columns to label encode
binary_cols = ['gender', 'SeniorCitizen', 'Partner', 'Dependents', 'PhoneService',
               'PaperlessBilling', 'Churn']
# initialize LabelEncoder
label_encoder = LabelEncoder()
# label encode binary columns (the encoder is re-fitted for every column)
for col in binary_cols:
    data[col] = label_encoder.fit_transform(data[col])

# columns to one-hot encode
columns_to_encode = ['MultipleLines', 'InternetService', 'OnlineSecurity', 'OnlineBackup',
                     'DeviceProtection', 'TechSupport', 'StreamingTV', 'StreamingMovies',
                     'Contract', 'PaymentMethod']
# initialize encoder; the dense-output keyword was renamed from `sparse` to
# `sparse_output` in scikit-learn 1.2 and the old name was later removed, so try the
# new name first and fall back for older releases
try:
    encoder = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
except TypeError:
    encoder = OneHotEncoder(sparse=False, handle_unknown='ignore')
# transform selected columns
encoded_data = encoder.fit_transform(data[columns_to_encode])
# convert encoded data into a df; reuse data's index so the concat below aligns
# row-for-row even when `data` does not carry a default 0..n-1 index
encoded_df = pd.DataFrame(encoded_data,
                          columns=encoder.get_feature_names_out(columns_to_encode),
                          index=data.index)
# concatenate original df (minus the raw categorical columns) with encoded df
data_encoded = pd.concat([data.drop(columns=columns_to_encode), encoded_df], axis=1)
# assign to original df name
data = data_encoded

# create new average monthly cost
data['TotalCharges'] = pd.to_numeric(data['TotalCharges'], errors='coerce')
# a tenure of 0 would make the division produce inf/NaN; mask those rows and fall
# back to MonthlyCharges wherever the ratio cannot be computed
positive_tenure = data['tenure'].where(data['tenure'] > 0)
data['average_monthly_cost'] = (data['TotalCharges'] / positive_tenure).fillna(data['MonthlyCharges'])

# create new number of services feature (row-wise sum of the one-hot service indicators)
columns_to_sum = ['MultipleLines_Yes', 'InternetService_DSL', 'InternetService_Fiber optic',
                  'OnlineSecurity_Yes', 'OnlineBackup_Yes', 'DeviceProtection_Yes',
                  'TechSupport_Yes', 'StreamingTV_Yes', 'StreamingMovies_Yes']

data['number_of_services'] = data[columns_to_sum].sum(axis=1)

# show the engineered frame explicitly (a bare mid-cell expression is never displayed)
print(data.head())

# correlation matrix; restricted to numeric columns because DataFrame.corr raises on
# non-numeric columns in pandas >= 2.0
product_correlation = data.select_dtypes(include='number').corr()
# heatmap for correlation matrix
plt.figure(figsize=(20, 16))
sns.heatmap(product_correlation, cmap='coolwarm', annot=True, fmt='.2f', linewidths=.51)
plt.title('Correlation Matrix')
plt.show()

# correlation between Churn and other columns (reuses the matrix computed above)
churn_correlations = product_correlation['Churn'].drop('Churn')
# normalize correlation values to range between -0.4 and 0.4 (so coolwarm works)
norm = Normalize(vmin=-0.4, vmax=0.4)

# Sort the churn_correlations Series in descending order based on correlation values
sorted_churn_correlations = churn_correlations.sort_values(ascending=False)

# Plot the sorted data
plt.figure(figsize=(15, 9))
plt.barh(sorted_churn_correlations.index, sorted_churn_correlations.values,
         color=plt.cm.coolwarm(norm(sorted_churn_correlations.values)), edgecolor='black')
plt.title('Correlation between Features and Churn')
plt.xlabel('Correlation with Churn')
plt.ylabel('Features')
plt.grid(axis='x')
plt.show()

In [None]:
# Split data into features and target variable
X = data.drop('Churn', axis=1)
y = data['Churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize and train machine learning models

# Classification models
clf_gradient_boosting = GradientBoostingClassifier(random_state=42)
clf_gradient_boosting.fit(X_train, y_train)
predictions_gradient_boosting = clf_gradient_boosting.predict(X_test)

clf_adaboost = AdaBoostClassifier(random_state=42)
clf_adaboost.fit(X_train, y_train)
predictions_adaboost = clf_adaboost.predict(X_test)

clf_k_neighbors = KNeighborsClassifier()
clf_k_neighbors.fit(X_train, y_train)
predictions_k_neighbors = clf_k_neighbors.predict(X_test)

clf_naive_bayes = GaussianNB()
clf_naive_bayes.fit(X_train, y_train)
predictions_naive_bayes = clf_naive_bayes.predict(X_test)

clf_decision_tree = DecisionTreeClassifier(random_state=42)
clf_decision_tree.fit(X_train, y_train)
predictions_decision_tree = clf_decision_tree.predict(X_test)

clf_random_forest = RandomForestClassifier(random_state=42)
clf_random_forest.fit(X_train, y_train)
predictions_random_forest = clf_random_forest.predict(X_test)

clf_logistic_regression = LogisticRegression(random_state=42)
clf_logistic_regression.fit(X_train, y_train)
predictions_logistic_regression = clf_logistic_regression.predict(X_test)

clf_svc = SVC(random_state=42)
clf_svc.fit(X_train, y_train)
predictions_svc = clf_svc.predict(X_test)

# Regression models

reg_ridge = Ridge()
reg_ridge.fit(X_train, y_train)
predictions_ridge = reg_ridge.predict(X_test)

reg_lasso = Lasso()
reg_lasso.fit(X_train, y_train)
predictions_lasso = reg_lasso.predict(X_test)

reg_gradient_boosting = GradientBoostingRegressor(random_state=42)
reg_gradient_boosting.fit(X_train, y_train)
predictions_gradient_boosting_reg = reg_gradient_boosting.predict(X_test)

reg_adaboost = AdaBoostRegressor(random_state=42)
reg_adaboost.fit(X_train, y_train)
predictions_adaboost_reg = reg_adaboost.predict(X_test)

reg_linear = LinearRegression()
reg_linear.fit(X_train, y_train)
predictions_linear = reg_linear.predict(X_test)

reg_random_forest = RandomForestRegressor(random_state=42)
reg_random_forest.fit(X_train, y_train)
predictions_random_forest = reg_random_forest.predict(X_test)

reg_svr = SVR()
reg_svr.fit(X_train, y_train)
predictions_svr = reg_svr.predict(X_test)

In [None]:
# Evaluate model performance

from sklearn.metrics import accuracy_score, mean_squared_error, r2_score

# Classification model evaluation
accuracy_gradient_boosting = accuracy_score(y_test, predictions_gradient_boosting)
accuracy_adaboost = accuracy_score(y_test, predictions_adaboost)
accuracy_k_neighbors = accuracy_score(y_test, predictions_k_neighbors)
accuracy_naive_bayes = accuracy_score(y_test, predictions_naive_bayes)
accuracy_decision_tree = accuracy_score(y_test, predictions_decision_tree)
accuracy_random_forest = accuracy_score(y_test, predictions_random_forest)
accuracy_logistic_regression = accuracy_score(y_test, predictions_logistic_regression)
accuracy_svc = accuracy_score(y_test, predictions_svc)

# Regression model evaluation
mse_ridge = mean_squared_error(y_test, predictions_ridge)
mse_lasso = mean_squared_error(y_test, predictions_lasso)
mse_gradient_boosting = mean_squared_error(y_test, predictions_gradient_boosting_reg)
mse_adaboost = mean_squared_error(y_test, predictions_adaboost_reg)
mse_linear = mean_squared_error(y_test, predictions_linear)
mse_random_forest = mean_squared_error(y_test, predictions_random_forest)
mse_svr = mean_squared_error(y_test, predictions_svr)

r2_ridge = r2_score(y_test, predictions_ridge)
r2_lasso = r2_score(y_test, predictions_lasso)
r2_gradient_boosting = r2_score(y_test, predictions_gradient_boosting_reg)
r2_adaboost = r2_score(y_test, predictions_adaboost_reg)
r2_linear = r2_score(y_test, predictions_linear)
r2_random_forest = r2_score(y_test, predictions_random_forest)
r2_svr = r2_score(y_test, predictions_svr)
