In [1]:
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from scipy.stats import zscore
from sklearn.metrics import mean_squared_error, accuracy_score

In [2]:
# Read in the data set
df = pd.read_csv('churn_clean.csv')

In [3]:
# Identify and drop any missing values
df = df.drop(columns=['CaseOrder', 'Customer_id', 'Interaction', 'UID',
                      'County', 'Zip', 'Lat', 'Lng', 'Population', 'Area', 'TimeZone', 'Job', 'Email', 'Contacts',
                      'City', 'State', 'Marital', 'PaymentMethod', 'PaperlessBilling'])

df = df.dropna()

df["churn"] = df["Churn"].apply(lambda x: 1 if x == "Yes" else 0)
df.drop("Churn", axis = 1, inplace = True)

In [4]:
print (df.columns)

Index(['Children', 'Age', 'Income', 'Gender', 'Outage_sec_perweek',
       'Yearly_equip_failure', 'Techie', 'Contract', 'Port_modem', 'Tablet',
       'InternetService', 'Phone', 'Multiple', 'OnlineSecurity',
       'OnlineBackup', 'DeviceProtection', 'TechSupport', 'StreamingTV',
       'StreamingMovies', 'Tenure', 'MonthlyCharge', 'Bandwidth_GB_Year',
       'Item1', 'Item2', 'Item3', 'Item4', 'Item5', 'Item6', 'Item7', 'Item8',
       'churn'],
      dtype='object')


In [5]:
# One-hot encode categorical variables
df = pd.get_dummies(df, columns=['Gender'], prefix='Gender', drop_first=True)

df = pd.get_dummies(df, columns=['Contract', 'InternetService'])

In [6]:
df.replace(to_replace={'Yes':1, 'No':0}, inplace=True)

In [7]:
# Compute the Z-scores of all columns in the dataframe
z_scores = df.apply(zscore)

In [8]:
# Set a threshold for the Z-scores
threshold = 3

In [9]:
# Identify rows with Z-scores above the threshold
outliers = np.where(np.abs(z_scores) > threshold)

In [10]:
# Get the indices of the rows with outliers
outlier_indices = list(set(outliers[0]))


In [11]:
# Drop the rows with outliers
df = df.drop(df.index[outlier_indices])

In [12]:
# Split the data into training and test data sets and provide the file(s).
train, test = train_test_split(df, test_size=0.2, random_state=42)
train.to_csv('train209p2.csv', index=False)
test.to_csv('testd209p2.csv', index=False)

In [13]:
# Split the data into training and test sets
X = df.drop(['churn'], axis=1)
y = df['churn']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
# Create the decision tree model
dt = DecisionTreeClassifier(max_depth=3, min_samples_leaf=5)
dt.fit(X_train, y_train)

DecisionTreeClassifier(max_depth=3, min_samples_leaf=5)

In [15]:
# Predict using the test set
y_pred = dt.predict(X_test)

In [16]:
# Evaluate the model's accuracy
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy: ", accuracy)

Accuracy:  0.8372093023255814


In [17]:
# Calculate mean squared error of the prediction model
mse = mean_squared_error(y_test, y_pred)
print("Mean Squared Error: ", mse)

Mean Squared Error:  0.16279069767441862


In [18]:
df.to_csv("D209p2_clean.csv", index=False)