In [1]:
# Import packages
import pandas as pd
import numpy as np
from data_util import *
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import RFECV
from sklearn import linear_model
from sklearn.metrics import make_scorer
import knn
from scipy.spatial import distance

In [2]:
# Load data
churn = pd.read_csv('churn_data.csv')
print(churn.shape)

# It was  yelling at me because data_y was catagorical
# so let's fix that
data_bi = []
for i in churn.Churn:
    if i == 'Yes':
        data_bi.append(1)
    elif i == 'No':
        data_bi.append(0)
churn['Churn'] = data_bi

# Predictor variables
data_x = churn[list(churn)[1:-1]]
print(data_x.shape)

# Response variable
data_y = churn[list(churn)[-1]]
print(data_y.shape)

(128, 9)
(128, 7)
(128,)


In [3]:
# Get some basic info
print(data_x.dtypes)
print(data_x.head())

Gender        object
Age            int64
Income        object
FamilySize     int64
Education      int64
Calls          int64
Visits         int64
dtype: object
   Gender  Age Income  FamilySize  Education  Calls  Visits
0    Male   34  Lower           4         16     14       5
1    Male   20  Lower           5         14     49       1
2  Female   30  Lower           4         20     19       4
3  Female   46  Lower           4         14     15       4
4  Female   23  Lower           4         16     18       0


In [4]:
# Check for NaN values because clasifiers can't
# work with those
print(data_x.Gender.value_counts())
print(np.isnan(data_x.Age).value_counts())
print(data_x.Income.value_counts())
print(np.isnan(data_x.FamilySize).value_counts())
print(np.isnan(data_x.Education).value_counts())
print(np.isnan(data_x.Calls).value_counts())
print(np.isnan(data_x.Visits).value_counts())

Male      72
Female    56
Name: Gender, dtype: int64
False    128
Name: Age, dtype: int64
Upper    65
Lower    63
Name: Income, dtype: int64
False    128
Name: FamilySize, dtype: int64
False    128
Name: Education, dtype: int64
False    128
Name: Calls, dtype: int64
False    128
Name: Visits, dtype: int64


In [5]:
# One hot encode Gender and Income columns so that they are
# quantitative and distance can be measured between them
data_x = pd.get_dummies(data_x, columns=cat_features(data_x))
print(data_x.head())

   Age  FamilySize  Education  Calls  Visits  Gender_Female  Gender_Male  \
0   34           4         16     14       5              0            1   
1   20           5         14     49       1              0            1   
2   30           4         20     19       4              1            0   
3   46           4         14     15       4              1            0   
4   23           4         16     18       0              1            0   

   Income_Lower  Income_Upper  
0             1             0  
1             1             0  
2             1             0  
3             1             0  
4             1             0  


In [6]:
# make sure everything looks okay
print(data_x.shape)
print(data_x.dtypes)
print(data_x.head())

(128, 9)
Age              int64
FamilySize       int64
Education        int64
Calls            int64
Visits           int64
Gender_Female    uint8
Gender_Male      uint8
Income_Lower     uint8
Income_Upper     uint8
dtype: object
   Age  FamilySize  Education  Calls  Visits  Gender_Female  Gender_Male  \
0   34           4         16     14       5              0            1   
1   20           5         14     49       1              0            1   
2   30           4         20     19       4              1            0   
3   46           4         14     15       4              1            0   
4   23           4         16     18       0              1            0   

   Income_Lower  Income_Upper  
0             1             0  
1             1             0  
2             1             0  
3             1             0  
4             1             0  


In [7]:
# Get rid of one gender column and one income column 
# because gender and income have binary values in 
# this data set and counting that information twice 
# could over train our model

data_x = data_x.drop(['Income_Upper', 'Gender_Male'], 1)

# Make sure it looks fine
print(data_x.head())
print(data_x.shape)

# ???: will i need to normalize the data?

   Age  FamilySize  Education  Calls  Visits  Gender_Female  Income_Lower
0   34           4         16     14       5              0             1
1   20           5         14     49       1              0             1
2   30           4         20     19       4              1             1
3   46           4         14     15       4              1             1
4   23           4         16     18       0              1             1
(128, 7)


In [8]:
# split train and test
x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size=.3, random_state=4)

# select features with RFECV
print('Selecting features with RFECV...\n')
selector_f = RFECV(estimator = linear_model.LinearRegression(), 
                   scoring = make_scorer(r2_score), 
                   cv=7)
selector_f.fit(x_train, y_train)

# Get cols
xt_train, xt_test = selector_f.transform(x_train), selector_f.transform(x_test)

Selecting features with RFECV...



In [9]:
# base model
base_mod = linear_model.LinearRegression()
base_mod.fit(x_train, y_train)
preds = base_mod.predict(x_test)
print_regression_error_report(preds, y_test)

MSE, MAE, R^2, EVS: [0.1889543737813036, 0.35690398453892624, -2.3060448962948263, -2.137644469567811]


In [10]:
# Base Model Notes: 
    # Mean Squared Error:        0.1889543737813036
    # Mean Absolute Error:       0.35690398453892624
    # R-squared:                -2.3060448962948263
    # Explained Variance Score: -2.137644469567811

In [11]:
# KNN
x_train, x_test, y_train, y_test = train_test_split(data_x, data_y, test_size = 0.3, random_state = 4)
knn = knn.KNN(3, distance.euclidean) # Create a 3-NN algorithm with Euclidean distance
knn.fit(x_train, y_train)
y_hat = knn.predict(x_test)
print_regression_error_report(y_hat, y_test)

Calculating distances from position1...



ValueError: Buffer has wrong number of dimensions (expected 1, got 0)

In [None]:
# one or two of the others