In [2]:
# connect to cluster
from os.path import expanduser
from IPython.parallel import Client

In [9]:
# Connect to the remote (StarCluster-launched) IPython cluster.  `url_file` is
# the connection file handed to Client and `sshkey` is the private key passed
# along with it; both paths are expanded from the current user's home directory.
# NOTE(review): `client` is re-bound to a plain Client() in a later cell, so
# this remote connection is not the one the rest of the notebook ends up using.
url_file = expanduser('~/.starcluster/ipcluster/SecurityGroup:@sc-my_cluster-us-east-1.json')
sshkey = expanduser('~/.ssh/Amazon_AWS_DataGuy.pem') 
client = Client(url_file, 
                sshkey = sshkey)

In [65]:
!ipcluster stop

2015-10-18 13:19:13.256 [IPClusterStop] Stopping cluster [pid=745] with [signal=2]


In [66]:
!ipcluster start -n=3 --daemon

In [67]:
from IPython.parallel import Client

# Re-bind `client` with no connection file or SSH key, i.e. to the cluster
# started by the `ipcluster start` cell above, replacing the earlier remote
# connection.
client = Client()

# Displayed value: number of engines the client can see.
len(client)

3

In [68]:
# Direct view on the client's engines; the last line displays how many engine
# ids the client knows about.
dview = client.direct_view()
len(client.ids)

3

In [69]:
from sklearn.externals import joblib
from sklearn.cross_validation import ShuffleSplit
import os

def persist_cv_splits(X, y, n_cv_iter=5, name='data',
    suffix="_cv_%03d.pkl", test_size=0.25, random_state=None):
    """Write randomized train/test folds of (X, y) to disk, one file per fold.

    A ShuffleSplit over the rows of X produces ``n_cv_iter`` folds.  Each fold
    is dumped with joblib as the tuple
    ``(X[train], y[train], X[test], y[test])`` to ``name + suffix % fold_index``.

    Returns the list of absolute file names, in fold order.
    """
    splitter = ShuffleSplit(X.shape[0], n_iter=n_cv_iter,
                            test_size=test_size, random_state=random_state)

    def _dump_fold(fold_index, train_rows, test_rows):
        # Persist a single fold and hand back the absolute path it was written to.
        fold_data = (X[train_rows], y[train_rows], X[test_rows], y[test_rows])
        target_path = os.path.abspath(name + suffix % fold_index)
        joblib.dump(fold_data, target_path)
        return target_path

    return [_dump_fold(fold_index, train_rows, test_rows)
            for fold_index, (train_rows, test_rows) in enumerate(splitter)]

In [70]:
# Spellings that should be parsed as boolean True when the CSV is read
# (passed to read_csv as `true_values`).
true = ["yes",
        "True",
        "True."]

# Spellings that should be parsed as boolean False (passed as `false_values`).
false = ["no",
         "False",
         "False."]

In [71]:
import pandas as pd
# Load the churn data, converting the spellings listed in `true` / `false`
# to booleans while parsing.
# NOTE(review): the path is tied to one user's home-directory layout.
df = pd.read_csv("~/DSCI6003-student/week4/exercise/data/churn.csv",
                 true_values = true, 
                 false_values = false)

In [72]:
# Columns removed from the frame before modelling.
drop_col = ["State",
            "Account Length",
            "Area Code",
            "Phone"]

# Mutates `df` in place, so this cell is not idempotent: on a second run the
# labels are already gone from the frame.
df.drop(drop_col, axis = 1, 
        inplace = True)

In [73]:
# Drop the trailing '?' from the target column's name so it can be reached by
# attribute access (df2.Churn is used further down).  Mutates `df` in place.
df.rename(columns={'Churn?':'Churn'}, 
          inplace=True)

In [74]:
from sklearn.neighbors import KNeighborsClassifier
import numpy as np
def smote(X, y, minority_weight=.5):
    '''
    Oversample the minority class with synthetic (SMOTE-style) observations.

    Each synthetic row is a random per-feature interpolation between a
    minority observation and one of its nearest minority-class neighbours.
    Rows are generated until the minority class makes up roughly
    `minority_weight` of the returned data.

    Parameters
    ----------
    X : 2-D numpy array, one row per observation.
    y : 1-D numpy array of binary labels (True/1 vs False/0) aligned with X.
    minority_weight : float strictly between 0 and 1
        Desired fraction of minority-class rows in the output.

    Returns
    -------
    (X, y) : the original rows followed by the synthetic rows and their
        labels.  When no additional rows are needed the inputs are returned
        unchanged.

    Raises
    ------
    ZeroDivisionError if `y` contains a single class (no minority rows).

    Randomness comes from NumPy's global random state; seed it beforehand for
    reproducible output.
    '''
    # Fraction of positive labels (True = 1, False = 0).  Rounding it gives
    # the majority label: 1 when positives dominate, 0 otherwise.
    class_ratio = y.sum()/float(len(y))
    majority_class_label = round(class_ratio)
    minority_class_label = (majority_class_label + 1) % 2

    # Rows and labels belonging to the minority class.
    minority_mask = y != majority_class_label
    X_minority = X[minority_mask]
    y_minority = y[minority_mask]
    min_count = len(X_minority)
    maj_count = len(X) - min_count

    # Factor by which the minority class must grow; for minority_weight = 0.5
    # this is simply maj_count / min_count, i.e. the classes end up balanced.
    scaling_factor = (maj_count/float(min_count))*(minority_weight/(1-minority_weight))
    # Cast to int: round() returns a float on Python 2, and the count is used
    # both as a sample size and as a loop bound.
    new_observations_target = int(round(scaling_factor*min_count)) - min_count
    if new_observations_target <= 0:
        # Minority class already at (or above) the requested weight.
        return X, y

    # The KNN model is fitted on the minority rows only and is used purely
    # for its neighbour lookup.
    knn_model = KNeighborsClassifier(n_neighbors=int(round(min_count**.5)))
    knn_model.fit(X_minority, y_minority)

    # Base points to interpolate from: a random subset when fewer new rows
    # than minority rows are needed, otherwise every minority row (cycled).
    if new_observations_target < min_count:
        sample_indices = np.random.choice(min_count,
                                          size=new_observations_target,
                                          replace=False)
        smote_samples = X_minority[sample_indices]
    else:
        smote_samples = X_minority
    # neighbors[i] holds row indices *into X_minority* for smote_samples[i].
    # NOTE(review): the query points are members of the fitted set, so a point
    # can presumably be returned as its own neighbour, in which case the
    # synthetic row coincides with the original.
    neighbors = knn_model.kneighbors(smote_samples)[1]

    # Generate the synthetic rows; collected in a list and stacked once.
    n_features = X.shape[1]
    synthetic_rows = []
    for k in range(new_observations_target):
        index = k % len(smote_samples)
        neighbor_index = np.random.choice(neighbors[index])
        x = smote_samples[index]
        neighbor = X_minority[neighbor_index]
        synthetic_rows.append(x + (neighbor - x)*np.random.random(size=n_features))
    new_observations = np.vstack(synthetic_rows)

    X = np.vstack((X, new_observations))
    y = np.hstack((y, np.array([minority_class_label]*len(new_observations))))

    return X, y

In [75]:
# Take every column except the first two and the last one and cast to float:
# the columns won't rescale unless dtype is float.
df2 = df[df.columns[2:-1]].astype(float)

In [76]:
from sklearn.preprocessing import scale, MinMaxScaler
# Rescale the numeric columns to the range [-1, 1].
# Columns with boolean values are not included (they are not in df2 yet).
# MinMaxScaler scales each column of a 2-D input independently, so one fit
# over the whole block gives the same per-column result as one scaler per
# column, without passing 1-D Series to the estimator.
df2[df.columns[2:-1]] = MinMaxScaler(feature_range=(-1,1)).fit_transform(
    df2[df.columns[2:-1]])

In [77]:
# Move columns with boolean values back into the dataframe.
# The first two columns of df and its last column (the target) are appended
# to the right of the rescaled columns, so the target ends up as df2's last
# column.
df2[df.columns[0]] = df[df.columns[0]].values
df2[df.columns[1]] = df[df.columns[1]].values
df2[df.columns[-1]] = df[df.columns[-1]].values

In [78]:
# target values: churn / not churn --> True / False
y = df2.Churn.values
# feature data: every column of df2 except the last one (the target)
X = df2[df2.columns[:-1]].values

In [79]:
# Ordering of the 16 feature columns used for the selection below; presumably
# a feature ranking (best first) computed elsewhere -- TODO confirm the source.
rank_ind = np.array([13,  3,  1, 14, 11,  4,  6,  7,  9,  0, 10, 12,  2,  5,  8, 15])

In [80]:
# Keep the first 12 entries of the ranking.  rank_ind holds column *positions*
# (df2's columns are labelled with names from the CSV header), so select with
# .iloc; plain df2[...] is label-based and only worked through an
# integer-fallback that newer pandas versions no longer provide.
X_top_features = df2.iloc[:, rank_ind[0:12]].values

In [81]:
# Oversample the minority class on the selected features (default
# minority_weight=.5).
# NOTE(review): smote() draws from NumPy's global random state and no seed is
# set in the cells shown, so X_smote / y_smote differ from run to run.
X_smote, y_smote = smote(X_top_features,y)

In [82]:
# Write shuffled train/test folds of the oversampled data to churn_cv_NNN.pkl
# files (function defaults: 5 folds, 25% test); random_state fixes the shuffle.
# NOTE(review): the folds are cut *after* oversampling, so the validation part
# of each fold contains synthetic rows as well.
churn_split_filenames = persist_cv_splits(X_smote, 
                                          y_smote,
                                          name='churn', 
                                          random_state=42)

## Future Note
    Train on a subset of the data set.
    This grid search should only be done on 80% of the data.
    I would have to find a way to import the held-out test set (20%)
    into another notebook, since I insist on performing grid
    searches in isolation.

In [83]:
# Display the absolute paths of the fold files returned by persist_cv_splits.
churn_split_filenames

['/Users/Alexander/DSCI6006_Business/churn_cv_000.pkl',
 '/Users/Alexander/DSCI6006_Business/churn_cv_001.pkl',
 '/Users/Alexander/DSCI6006_Business/churn_cv_002.pkl',
 '/Users/Alexander/DSCI6006_Business/churn_cv_003.pkl',
 '/Users/Alexander/DSCI6006_Business/churn_cv_004.pkl']


    Each churn_cv_00*.pkl file is saved together with 4 accompanying churn_cv_00*.pkl_0*.npy files.
    Each of them holds one array of the fold, in the order X_train, y_train, X_test, y_test.


In [84]:
ls -lh churn*

-rw-r--r--  1 Alexander  staff   276B Oct 18 13:19 churn_cv_000.pkl
-rw-r--r--  1 Alexander  staff   432K Oct 18 13:19 churn_cv_000.pkl_01.npy
-rw-r--r--  1 Alexander  staff    33K Oct 18 13:19 churn_cv_000.pkl_02.npy
-rw-r--r--  1 Alexander  staff   144K Oct 18 13:19 churn_cv_000.pkl_03.npy
-rw-r--r--  1 Alexander  staff    11K Oct 18 13:19 churn_cv_000.pkl_04.npy
-rw-r--r--  1 Alexander  staff   276B Oct 18 13:19 churn_cv_001.pkl
-rw-r--r--  1 Alexander  staff   432K Oct 18 13:19 churn_cv_001.pkl_01.npy
-rw-r--r--  1 Alexander  staff    33K Oct 18 13:19 churn_cv_001.pkl_02.npy
-rw-r--r--  1 Alexander  staff   144K Oct 18 13:19 churn_cv_001.pkl_03.npy
-rw-r--r--  1 Alexander  staff    11K Oct 18 13:19 churn_cv_001.pkl_04.npy
-rw-r--r--  1 Alexander  staff   276B Oct 18 13:19 churn_cv_002.pkl
-rw-r--r--  1 Alexander  staff   432K Oct 18 13:19 churn_cv_002.pkl_01.npy
-rw-r--r--  1 Alexander  staff    33K Oct 18 13:19 churn_cv_002.pkl_02.npy
-rw-r--r--  1 Alexander  staff   

In [85]:
# # insert cross validation data files into starcluster instance 

# ! starcluster put my_cluster --user sgeadmin churn_cv_00* /mnt/sgeadmin/

In [86]:
# %%px -t0
# %%bash
# scp /mnt/sgeadmin/churn_cv_00* node001:/mnt/sgeadmin/
# scp /mnt/sgeadmin/churn_cv_00* node002:/mnt/sgeadmin/

In [87]:
def compute_evaluation(cv_split_filename, model, params):
    """Function executed by a worker to evaluate a model on a CV split.

    Loads the fold stored in ``cv_split_filename`` (a tuple of
    X_train, y_train, X_validation, y_validation), applies ``params`` to
    ``model`` with ``set_params``, fits it on the training part and returns
    the binary F1 score of its predictions on the validation part.
    """
    # All module imports should be executed in the worker namespace
    from sklearn.externals import joblib
    from sklearn.metrics import f1_score

    X_train, y_train, X_validation, y_validation = joblib.load(
        cv_split_filename)

    # TODO: loading with mmap_mode='c' was associated with a "memory-mapped"
    # error and is left out here; look into this.  (In numpy's memmap modes
    # 'c' means copy-on-write, not "compressed".)

    model.set_params(**params)
    model.fit(X_train, y_train)
    # f1_score expects (y_true, y_pred).
    return f1_score(y_validation, model.predict(X_validation), average = "binary")

In [88]:
def grid_search(lb_view, model, cv_split_filenames, param_grid):
    """Submit one evaluation task per (parameter setting, CV split) pair.

    The grid is expanded with ParameterGrid; for every setting,
    ``compute_evaluation`` is submitted through ``lb_view.apply`` once per
    split file.

    Returns ``(all_parameters, all_tasks)`` where ``all_tasks[i]`` is the list
    of task handles submitted for ``all_parameters[i]``.
    """
    all_parameters = list(ParameterGrid(param_grid))
    all_tasks = [
        [lb_view.apply(compute_evaluation, split_file, model, setting)
         for split_file in cv_split_filenames]
        for setting in all_parameters
    ]
    return all_parameters, all_tasks

In [89]:
# remote_filenames = ['/mnt/sgeadmin/' + filename.split('/')[-1] for filename in churn_split_filenames]
# remote_filenames

In [90]:
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from sklearn.ensemble import GradientBoostingClassifier

from sklearn.grid_search import ParameterGrid
import numpy as np

In [91]:
# Load-balanced view on the cluster; grid_search submits its tasks through it.
lb_view = client.load_balanced_view()

In [92]:
# Base estimator sent with every task; the grid parameters are applied on the
# worker through set_params() inside compute_evaluation.
model = RandomForestClassifier(n_estimators = 1000,
                                random_state = 41)

In [93]:
# Hyper-parameter grid: 3 * 2 * 2 * 3 * 3 = 108 combinations, each of which is
# evaluated on every CV split file.
params = {
'max_leaf_nodes': [2,4,None],
'min_samples_leaf': [2,4],
"max_features": [6,10],
"max_depth": [3,5,7],
"min_samples_split": [2,4,8]
}

In [94]:
# Submit the whole grid.  all_tasks[i] holds the task handles for
# all_parameters[i]; they are polled later with ready() / get().
all_parameters, all_tasks = grid_search(lb_view, 
                                        model, 
                                        churn_split_filenames, 
                                        params)

In [95]:
def progress(tasks):
    """Return the fraction of tasks, over all task groups, that are ready."""
    ready_flags = []
    for group in tasks:
        for job in group:
            ready_flags.append(job.ready())
    return np.mean(ready_flags)

In [96]:
def find_bests(all_parameters, all_tasks, n_top=5):
    """Rank parameter settings by the mean score of their completed tasks.

    Parameters
    ----------
    all_parameters : list of parameter dicts.
    all_tasks : list of task groups, aligned with ``all_parameters``; each
        task exposes ``ready()`` and ``get()``.
    n_top : maximum number of entries to return.

    Returns
    -------
    List of ``(mean_score, params)`` tuples, best score first, at most
    ``n_top`` long.  Settings with no completed task are left out; settings
    with equal mean scores keep their order from ``all_parameters``.
    """
    mean_scores = []

    for param, task_group in zip(all_parameters, all_tasks):
        scores = [t.get() for t in task_group if t.ready()]
        if not scores:
            continue
        mean_scores.append((np.mean(scores), param))

    # Sort on the score only: comparing whole (score, dict) tuples falls back
    # to comparing the dicts whenever two scores tie, which raises TypeError
    # on Python 3.
    return sorted(mean_scores, key=lambda entry: entry[0], reverse=True)[:n_top]

In [102]:
# Poll the cluster: percentage of submitted tasks that have finished so far.
print("Tasks completed: {0}%".format(100 * progress(all_tasks)))

Tasks completed: 62.2222222222%


In [106]:
from pprint import pprint

# Completion percentage followed by the best (mean F1, parameters) entries
# among the tasks finished so far (find_bests default: top 5).
print("Tasks completed: {0}%".format(100 * progress(all_tasks)))
pprint(find_bests(all_parameters, all_tasks))

Tasks completed: 100.0%
[(0.90377059998803499,
  {'max_depth': 7,
   'max_features': 10,
   'max_leaf_nodes': None,
   'min_samples_leaf': 2,
   'min_samples_split': 4}),
 (0.90377059998803499,
  {'max_depth': 7,
   'max_features': 10,
   'max_leaf_nodes': None,
   'min_samples_leaf': 2,
   'min_samples_split': 2}),
 (0.90359767009072844,
  {'max_depth': 7,
   'max_features': 6,
   'max_leaf_nodes': None,
   'min_samples_leaf': 2,
   'min_samples_split': 8}),
 (0.90299533752766725,
  {'max_depth': 7,
   'max_features': 6,
   'max_leaf_nodes': None,
   'min_samples_leaf': 2,
   'min_samples_split': 4}),
 (0.90299533752766725,
  {'max_depth': 7,
   'max_features': 6,
   'max_leaf_nodes': None,
   'min_samples_leaf': 2,
   'min_samples_split': 2})]
