# Disconnect Model with RPV + GPI

In [1]:
import pandas as pd
import pickle as pk
import numpy as np

from datetime import datetime
from dateutil.parser import parse

from med_words import med_words

from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.gaussian_process import GaussianProcessClassifier
from sklearn.gaussian_process.kernels import RBF
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB


import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn import metrics

### Import WSLive Results

In [None]:
wslive_path = 'U:\\Source Files\\Data Analytics\\Derek\\SAS_DATA\\SURVEY\\wslive_results.sas7bdat'

wslive_df = pd.read_sas(wslive_path, encoding='latin')

wslive_2019 = wslive_df[wslive_df['WSLIVE_FILE_DT'] >= '2019-01-01']

# save off 2019 WSLive results for easier retrieval later
with open('wslive_2019.pk','wb') as f:
    pk.dump(wslive_2019, f)

In [28]:
# if starting new, just load 2019 wslive results here
wslive_2019 = pk.load(open('U:\\Source Files\\Data Analytics\\Data-Science\\Data\\Get_Phone_Info\\wslive_2019.pk','rb'))

In [82]:
# scope of this modeling data
wsl = wslive_2019[wslive_2019['WSLIVE_FILE_DT'] >= '2019-07-01']

wsl = wsl[['OFFICE_ADDRESS_LINE_1', 
           'OFFICE_ADDRESS_LINE_2', 
           'OFFICE_ADDRESS_VERIFIED_UPDATED',
           'COMMENTS',
           'OFFICE_TELEPHONE',
           'OFFICE_ADDRESS_CITY', 
           'OFFICE_ADDRESS_STATE',
           'OFFICE_ADDRESS_ZIP',]]

wsl.rename(columns={'OFFICE_TELEPHONE': 'phone',
                    'OFFICE_ADDRESS_CITY': 'wsl_city',
                    'OFFICE_ADDRESS_STATE':'wsl_state',
                    'OFFICE_ADDRESS_ZIP': 'wsl_zip',
                    }, inplace=True)

for col in wsl.columns:
    wsl[col] = wsl[col].astype(str)
    
    
wsl['isDisconnected'] = wsl['COMMENTS'].apply(lambda x: 1 if x == 'NOT IN SERVICE' else 0)
wsl['isWrong'] = [1 if com=='COMPLETE' and v=='1' else 0 for com, v in zip(wsl['COMMENTS'], wsl['OFFICE_ADDRESS_VERIFIED_UPDATED'])]

wsl.drop(columns=['COMMENTS','OFFICE_ADDRESS_VERIFIED_UPDATED'],
         axis=1, 
         inplace=True)


### Import GPI Results

In [83]:
# all GPI results
#gpi = pd.read_csv('U:\\Source Files\\Data Analytics\\Data-Science\\Data\\Get_Phone_Info\\gpi_all.csv', dtype=object)
gpi = pd.read_csv('gpi_all.csv', dtype=object)

keep = [
    'phone',
    'ProviderNameOut',
    'LineTypeOut',
    'ContactNameOut',
    'ContactAddressOut',
    'ContactCityOut',
    'ContactStateOut',
    'ContactZipOut',
    'ContactPhoneType',
    'ContactQualityScore',
    'DateOfPorting',
    'NoteDescriptions'
]
gpi = gpi[keep]
for col in gpi:
    gpi[col] = gpi[col].astype(str)

In [84]:
print('GPI Results:', len(gpi))

GPI Results: 1459


### Import RPV Results

In [85]:
rpv_archive=pd.read_csv('U:\\Source Files\\Data Analytics\\Data-Science\\Data\\RPV\output\\_archive\\RPV_archive.csv', 
                        dtype=object).drop_duplicates()

### Merge

In [86]:
gpi = gpi.merge(rpv_archive, on='phone', how='inner')
print(len(gpi))
gpi = gpi.merge(wsl, on='phone', how='inner')
print(len(gpi))

1412
1846


### Observing duplication

In [87]:
gpi[gpi['phone'].duplicated()].sort_values(by='phone')

Unnamed: 0,phone,ProviderNameOut,LineTypeOut,ContactNameOut,ContactAddressOut,ContactCityOut,ContactStateOut,ContactZipOut,ContactPhoneType,ContactQualityScore,...,iscell,carrier,date_checked,OFFICE_ADDRESS_LINE_1,OFFICE_ADDRESS_LINE_2,wsl_city,wsl_state,wsl_zip,isDisconnected,isWrong
1251,2014883131,"AIRUS, INC. - NJ",,ANDREW FINK MD PHYSICIANS SURGEONS,,HACKENSACK,NJ,7601,BUSINESS,,...,N,"Airus, Inc.",2019-09-04,,385 PROSPECT AVE STE 200,HACK,NJ,07601,0,0
1256,2027458000,CAVALIER TELEPHONE,,ERDULFO ANTONIO ROMERO JR MD INTERNAL MEDICINE,,WASHINGTON,DC,20422-0001,BUSINESS,,...,N,Cavalier Telephone,2019-09-04,,3800 RESERVOIR RD NW,WASHINGTON,DC,20007,0,0
1257,2027458000,CAVALIER TELEPHONE,,ERDULFO ANTONIO ROMERO JR MD INTERNAL MEDICINE,,WASHINGTON,DC,20422-0001,BUSINESS,,...,N,Cavalier Telephone,2019-09-04,,50 IRVING ST NW,WASHINGTON,DC,20422,0,0
1258,2027458000,CAVALIER TELEPHONE,,ERDULFO ANTONIO ROMERO JR MD INTERNAL MEDICINE,,WASHINGTON,DC,20422-0001,BUSINESS,,...,N,Cavalier Telephone,2019-09-04,VETERNAS AFFAIRS MEDICAL CENTER,50 IRVING ST NW,WASHINGTON,DC,20422,0,0
1259,2027458000,CAVALIER TELEPHONE,,ERDULFO ANTONIO ROMERO JR MD INTERNAL MEDICINE,,WASHINGTON,DC,20422-0001,BUSINESS,,...,N,Cavalier Telephone,2019-09-04,,50 IRVING ST NW,WASHINGTON,DC,20422,0,0
1255,2027458000,CAVALIER TELEPHONE,,ERDULFO ANTONIO ROMERO JR MD INTERNAL MEDICINE,,WASHINGTON,DC,20422-0001,BUSINESS,,...,N,Cavalier Telephone,2019-09-04,VETERANS HOSPITAL,50 IRVING ST NW,WASHINGTON,DC,20422,0,0
581,2028776429,PAETEC COMM - DC,LANDLINE,WASHINGTON HOSPITAL CENTER,,WASHINGTON,DC,20024,BUSINESS,LOW,...,N,PAETEC,2019-09-25,BA2,110 IRVING ST NW,WASHINGTON,DC,20010,0,0
361,2029943893,LEVEL 3 COMM - MD,VOIP,GWU,,WASHINGTON,DC,20036,BUSINESS,LOW,...,N,Level 3,2019-09-25,GEORGE WASHINGTON UNIV HOSP,900 23RD ST NW,WASHINGTON,DC,20037,1,0
169,2032352505,FRONTIER COMM - CT,LANDLINE,HORTON PAUL C MD,321 RESEARCH PKWY STE 106,MERIDEN,CT,06450-8341,BUSINESS,HIGH,...,N,Frontier Communications,2019-09-04,STE 106,321 RESEARCH PKWY,MERIDEN,CT,06450,1,0
456,2032842800,FRONTIER COMM - CT,LANDLINE,GAYLORD HOSPITAL HOSPITALS,50 GAYLORD FARM RD,WALLINGFORD,CT,06492-2828,BUSINESS,HIGH,...,N,Frontier Communications,2019-09-25,,50 GAYLORD FARM RD,WALLINGFORD,CT,06492,0,1


In [88]:
gpi = gpi.groupby('phone').first().reset_index() # remove duplicates (typically differing address)
len(gpi)  # should be back to value on first merge

1412

### Save Intermediate data

In [89]:
gpi.to_csv('U:\\Source Files\\Data Analytics\\Data-Science\\Data\\Get_Phone_Info\\gpi_merged.csv', index=False)
#gpi.to_csv('U:\\Source Files\\Data Analytics\\Data-Science\\Data\\Get_Phone_Info\\gpi_merged.csv', index=False)

# Data transformation

In [90]:
addr_translate = {
    'DR': 'DRIVE',
    'PKWY': 'PARKWAY',
    'RD': 'ROAD',
    'ST': 'STREET',
    'STE': 'SUITE',
    'N': 'NORTH',
    'S': 'SOUTH',
    'E': 'EAST',
    'W': 'WEST',
    'LN': 'LANE',
    'CTR': 'CENTER',
    'CT': 'COURT',
    'BLVD': 'BOULEVARD',
    'CIR': 'CIRCLE',
    'HTS': 'HEIGHTS',
    'AVE': 'AVENUE',
    'HWY': 'HIGHWAY',
    'JCT': 'JUNCTION',
    'LK': 'LAKE',
    'MTN': 'MTN',
    'APT': 'APARTMENT',
    'RM': 'ROOM',
    'PL': 'PLACE',
    'PLZ': 'PLACA',
    'RDG': 'RIDGE',
    'SQ': 'SQUARE',
    'STA': 'STATION',
    'TER': 'TERRACE',
    'TRL': 'TRAIL',
    'TPKE': 'TURNPIKE',
    'VLY': 'VALLEY',
    'IS': 'ISLAND'
}

# returns a standardized address string. Extends abbreviations to full words.
def translate_addr(addr_string):
    addr_string = addr_string.upper()
    new_addr = []
    
    tokens = addr_string.split()
    
    for t in tokens:
        if t in addr_translate:
            new_addr.append(addr_translate[t])
        else:
            new_addr.append(t)
    
    return ' '.join(new_addr)

In [91]:
def addr_potential_match(addr1, addr2):
    return addr1==addr2 or addr1 in addr2 or addr2 in addr1

In [92]:
to_del = []  # list of columns to delete after transformation


##############################################################################
############################ WSLive ########################################


# Translating WSLive address

gpi['OFFICE_ADDRESS_LINE_1'] = gpi['OFFICE_ADDRESS_LINE_1'].apply(lambda x: ' '.join([t if t not in ['nan', 'NAN'] else '' for t in str(x).split()]))
gpi['OFFICE_ADDRESS_LINE_2'] = gpi['OFFICE_ADDRESS_LINE_2'].apply(lambda x: ' '.join([t if t not in ['nan', 'NAN'] else '' for t in str(x).split()]))

gpi['OFFICE_ADDRESS_LINE_1'] = gpi['OFFICE_ADDRESS_LINE_1'].apply(translate_addr)
gpi['OFFICE_ADDRESS_LINE_2'] = gpi['OFFICE_ADDRESS_LINE_2'].apply(translate_addr)
gpi['wsl_address'] = gpi['OFFICE_ADDRESS_LINE_1'] + gpi['OFFICE_ADDRESS_LINE_2']

to_del.append('OFFICE_ADDRESS_LINE_1')
to_del.append('OFFICE_ADDRESS_LINE_2')



##############################################################################
############################ GPI ########################################


# Translating GetPhoneInfo address and name
gpi['ContactAddressOut'] = gpi['ContactAddressOut'].apply(lambda x: ' '.join([t if t not in ['nan', 'NAN'] else '' for t in str(x).split()]))
gpi['gpi_address']       = gpi['ContactAddressOut'].apply(translate_addr)
gpi['ContactNameOut']    = gpi['ContactNameOut'].apply(lambda x: ' '.join([t if t not in ['nan', 'NAN'] else '' for t in str(x).split()]))
gpi['gpi_name']          = gpi['ContactNameOut'].apply(translate_addr)
to_del.append('ContactAddressOut')
to_del.append('ContactNameOut')

# encoding GetPhoneInfo 'Notes'

gpi['gpi_isConnected'] = gpi['NoteDescriptions'].apply(lambda x: 1 if 'IsConnected' in x else 0)
gpi['gpi_isPorted']    = gpi['NoteDescriptions'].apply(lambda x: 1 if 'IsPorted' in x else 0)
gpi['gpi_isMailable']  = gpi['NoteDescriptions'].apply(lambda x: 1 if 'IsMailable' in x else 0)
gpi['gpi_isWireless']  = gpi['NoteDescriptions'].apply(lambda x: 1 if 'IsWireless' in x else 0)
gpi['gpi_isPossibleDisconnected'] = gpi['NoteDescriptions'].apply(lambda x: 1 if 'IsPossibleDisconnected' in x else 0)
gpi['gpi_isPossiblePortableVOIP'] = gpi['NoteDescriptions'].apply(lambda x: 1 if 'IsPossiblePortableVOIP' in x else 0)
gpi['gpi_INF'] = gpi['NoteDescriptions'].apply(lambda x: 1 if 'INF' in x else 0)

to_del.append('NoteDescriptions')


# encoding GetPhoneInfo 'QualityScore'
quality = {
    'HIGH': 3,
    'MED': 2,
    'LOW': 1,
    'nan': 1,
    None: 1
}
gpi['gpi_qs_null'] = gpi['ContactQualityScore'].apply(lambda x: 1 if x in ['nan', None] else 0)
gpi['gpi_qs']      = gpi['ContactQualityScore'].apply(lambda x: quality[x] if x in quality else 1)

to_del.append('ContactQualityScore')


# encoding GetPhoneInfo name
med_terms = list(set([t.upper() for t in med_words]))

gpi['ContactNameOut'] = gpi['ContactNameOut'].apply(lambda x: ' '.join([t if t not in ['nan', 'NAN'] else '' for t in str(x).split()]))
gpi['gpi_name_meddy'] = gpi['ContactNameOut'].apply(lambda x: 1 if any([t in x for t in med_terms]) else 0)



# standardizing Zipcode to first 5 digits

gpi['gpi_zip'] = gpi['ContactZipOut'].apply(lambda x: x[:5] if len(x)>=5 else x)  # reduce extended Zipcodes
gpi['gpi_zip'] = gpi['gpi_zip'].apply(lambda x: ('000' + x)[-5:] if x.isdigit() else x)

to_del.append('ContactZipOut')


# standardizing PhoneType

gpi['gpi_phonetype'] = gpi['ContactPhoneType'].apply(lambda x: x if x in ['BUSINESS', 'RESIDENTIAL'] else 'UNKNOWN')

to_del.append('ContactPhoneType')


# parse 'Date' as Datetime Date
gpi['gpi_date'] = gpi['DateOfPorting'].apply(lambda x: parse(x) if x not in ['nan', 'None', None] else x)
to_del.append('DateOfPorting')
# get date diff
gpi['gpi_date_diff']    = gpi['gpi_date'].apply(lambda x: (datetime.now() - x).days if x not in ['nan','None'] else x)
notnull_date_diffs      = [d for d in gpi['gpi_date_diff'].values if d not in ['nan', 'None', None]]
gpi['gpi_date_diff']    = gpi['gpi_date_diff'].apply(lambda x: x if x not in ['nan', 'None', None] else 2*max(notnull_date_diffs))
gpi['gpi_date_missing'] = gpi['gpi_date'].apply(lambda x: 1 if x in ['nan', 'None', None] else 0)

to_del.append('gpi_date')

# Line type
gpi['gpi_linetype_m'] = gpi['LineTypeOut'].apply(lambda x: 1 if x=='WIRELESS' else 0)  # wireless - mobile
gpi['gpi_linetype_v'] = gpi['LineTypeOut'].apply(lambda x: 1 if x=='VOIP' else 0)
gpi['gpi_linetype_l'] = gpi['LineTypeOut'].apply(lambda x: 1 if x=='LANDLINE' else 0)

to_del.append('LineTypeOut')


##############################################################################
############################ AREA CODE #######################################


gpi.drop(columns=to_del, axis=1, inplace=True)


# area and prefix features
gpi['gpi_area_code']   = gpi['phone'].apply(lambda x: x[:3])
gpi['gpi_area+prefix'] = gpi['phone'].apply(lambda x: x[:6])


# renaming address fields
gpi.rename(columns={'ContactCityOut': 'gpi_city',
                    'ContactStateOut':'gpi_state',
                    'ProviderNameOut': 'gpi_provider'}, inplace=True)


##############################################################################
############################ GPI CATEGORICALS ################################


# categorize
gpi_categoricals = [
    'gpi_city',
    'gpi_provider',
    'gpi_state',
    'gpi_zip',
    'gpi_phonetype',
    'gpi_area_code',
    'gpi_area+prefix'
]

for col in gpi_categoricals:
    gpi[col] = gpi[col].astype('category')
    
#########################################################################
############################ RPV ########################################


# encode RPV status
rpv_statuses = {
    'disconnected': 4,
    'disconnected-70': 3,
    'connected-75': 2,
    'connected': 1
}
gpi['rpv_status'] = gpi['status'].apply(lambda x: rpv_statuses[x] if x in rpv_statuses else 0)

# encode rpv iscell
gpi['rpv_iscell_V'] = gpi['iscell'].apply(lambda x: 1 if x=='V' else 0)
gpi['rpv_iscell_Y'] = gpi['iscell'].apply(lambda x: 1 if x=='Y' else 0)

# categorize carrier
gpi['carrier'] = gpi['carrier'].astype('category')

# drop unnecessary columns after transformation
gpi.drop(columns=['status','error_text','iscell','date_checked'], axis=1, inplace=True)

# rename carrier
gpi.rename(columns={'carrier': 'rpv_carrier'},inplace=True)


##############################################################################
############################ MATCHING ########################################

# finding potential provider matches (gpi_provder and rpv_carrier) -- different naming conventions so it's a little tricky
gpi['match_provider'] = [1 if any([t in g for t in r.split()]) else 0 for r, g in zip(gpi['rpv_carrier'], gpi['gpi_provider'])]
# WSLive city match GPI
gpi['match_city']     = [1 if r==g else 0 for r, g in zip(gpi['gpi_city'], gpi['wsl_city'])] 
# WSLive state match GPI
gpi['match_state']    = [1 if r==g else 0 for r, g in zip(gpi['gpi_state'], gpi['wsl_state'])] 
# WSLive ZIP match GPI
gpi['match_zip']      = [1 if r==g else 0 for r, g in zip(gpi['gpi_zip'], gpi['wsl_zip'])] 
# WSLive Address match GPI
gpi['match_address']  = [1 if any([t in g for t in r.split()]) else 0 for r, g in zip(gpi['wsl_address'], gpi['gpi_address'])]

to_del = [
    'gpi_city',
    'gpi_state',
    'gpi_zip',
    'gpi_provider',
    'wsl_city',
    'wsl_state',
    'wsl_zip',
    'rpv_carrier',
    'wsl_address',
    'gpi_address',
    'gpi_name'
]

gpi.drop(columns=to_del, axis=1, inplace=True)


##############################################################################
############################ GPI PHONE TYPE AND BUSINESS #####################


gpi['gpi_type_business']    = gpi['gpi_phonetype'].apply(lambda x: 1 if x=='BUSINESS' else 0)
gpi['gpi_type_residential'] = gpi['gpi_phonetype'].apply(lambda x: 1 if x=='RESIDENTIAL' else 0)

gpi.drop(columns='gpi_phonetype', axis=1, inplace=True)

##############################################################################
############################ MATCH GPI RPV LINE TYPE#### #####################
gpi['match_linetype_voip']   = [1 if r==g else 0 for r, g in zip(gpi['gpi_linetype_v'], gpi['rpv_iscell_V'])]
gpi['match_linetype_mobile'] = [1 if r==g else 0 for r, g in zip(gpi['gpi_linetype_m'], gpi['rpv_iscell_Y'])]


In [93]:
gpi.dtypes

phone                           object
isDisconnected                   int64
isWrong                          int64
gpi_isConnected                  int64
gpi_isPorted                     int64
gpi_isMailable                   int64
gpi_isWireless                   int64
gpi_isPossibleDisconnected       int64
gpi_isPossiblePortableVOIP       int64
gpi_INF                          int64
gpi_qs_null                      int64
gpi_qs                           int64
gpi_name_meddy                   int64
gpi_date_diff                    int64
gpi_date_missing                 int64
gpi_linetype_m                   int64
gpi_linetype_v                   int64
gpi_linetype_l                   int64
gpi_area_code                 category
gpi_area+prefix               category
rpv_status                       int64
rpv_iscell_V                     int64
rpv_iscell_Y                     int64
match_provider                   int64
match_city                       int64
match_state              

In [94]:
savepath = 'U:\\Source Files\\Data Analytics\\Data-Science\\Data\\Get_Phone_Info\\gpi_merged_processed.csv'


In [95]:
### Saving off data
gpi.to_csv(savepath, index=False)

In [96]:
### loading data above
#gpi = pk.load(open('savepath', 'rb'))

# Done transforming.

.

.

.

# Setting X and y for dataset

#### All data

In [97]:
Xy = gpi.set_index('phone')
target = gpi['isDisconnected']
#target = gpi['isWrong']

del Xy['isWrong']
del Xy['isDisconnected']

#### Only records RPV says are "disconnected"

In [99]:
Xy_dc = gpi[gpi['rpv_status'] == 4].copy().set_index('phone')
target_dc = gpi[gpi['rpv_status'] == 4]['isDisconnected']
del Xy_dc['isDisconnected']
del Xy_dc['isWrong']

#### Only records says are "disconnected" or "disconnected-70"

In [100]:
Xy_dc70 = gpi[gpi['rpv_status'] >=3].copy().set_index('phone')
target_dc70 = gpi[gpi['rpv_status'] >= 3]['isDisconnected']
del Xy_dc70['isDisconnected']
del Xy_dc70['isWrong']

In [101]:
data = {
    'all': {
        'X': Xy,
        'y': target
    },
    'dc': {
        'X': Xy_dc,
        'y': target_dc
    },
    'dc70': {
        'X': Xy_dc70,
        'y': target_dc70
    }
}

In [98]:
Xy.dtypes

gpi_isConnected                  int64
gpi_isPorted                     int64
gpi_isMailable                   int64
gpi_isWireless                   int64
gpi_isPossibleDisconnected       int64
gpi_isPossiblePortableVOIP       int64
gpi_INF                          int64
gpi_qs_null                      int64
gpi_qs                           int64
gpi_name_meddy                   int64
gpi_date_diff                    int64
gpi_date_missing                 int64
gpi_linetype_m                   int64
gpi_linetype_v                   int64
gpi_linetype_l                   int64
gpi_area_code                 category
gpi_area+prefix               category
rpv_status                       int64
rpv_iscell_V                     int64
rpv_iscell_Y                     int64
match_provider                   int64
match_city                       int64
match_state                      int64
match_zip                        int64
match_address                    int64
gpi_type_business        

In [108]:
for dataset in data:
    
    X = data[dataset]['X']
    y = data[dataset]['y']
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

    train_dataset = lgb.Dataset(X_train, y_train)
    test_dataset = lgb.Dataset(X_test, y_test)


    # T - no. of total samples
    # P - no. of positive samples
    T = len(target)
    P = sum(target)
    pos = ((1.0 * T)/P) - 1


    # experiment with different parameters
    param = {
        'num_leaves': 31, 
        'objective': 'binary',
        'metric': 'auc',
        'learning_rate': 0.1,
        'scale_pos_weight': pos
    }
    n_iter = 500

    bst = lgb.train(param, train_dataset, n_iter, valid_sets=test_dataset, early_stopping_rounds=50)

    y_pred = bst.predict(X_test)
    y_pred_c = [1 if p >= 0.5 else 0 for p in y_pred]

    print('LightGBM')
    thresholds = [0.5, 0.6, 0.7, 0.8, 0.9]
    for t in thresholds:
        y_pred_c = [1 if p >= t else 0 for p in y_pred]
        print()
        print('LightGBM', 'Threshold:', t)
        print('\t\tNumber DC: {}, Number classified as DC: {}'.format(sum(y_test), sum(y_pred_c)))
        print('\t\tNumber DC correctly classified: {}'.format(sum([1 if p==1 and t==1 else 0 for p, t in zip(y_pred_c, y_test)])))
        false_positive = [1 if p==1 and t==0 else 0 for p, t in zip(y_pred_c, y_test)]
        print('\t\tNumber False Positives: {}'.format(sum(false_positive)))
        print('\t\tFalsePositiveRate: {}'.format(
            round(
                sum(false_positive)/(len(y_test)-sum(y_test)),
                2)))
        print(metrics.classification_report(y_true=y_test, y_pred=y_pred_c))
        print('###########################################')


    classifiers = {
        "Nearest Neighbors": KNeighborsClassifier(3),
        "Gaussian Process": GaussianProcessClassifier(1.0 * RBF(1.0)),
        "Decision Tree": DecisionTreeClassifier(max_depth=8), 
        "Random Forest": RandomForestClassifier(max_depth=10, n_estimators=15, max_features='auto'), 
        "Neural Net": MLPClassifier(alpha=1, max_iter=2000), 
        "AdaBoost": AdaBoostClassifier(),
        "Naive Bayes": GaussianNB(), 
    }

    for clf_name in classifiers:

        clf = classifiers[clf_name]
        clf.fit(X_train, y_train)

        y_pred = clf.predict(X_test)
        y_pred_p = clf.predict_proba(X_test)

        print(clf_name)
        for t in thresholds:
            y_pred_c = [1 if p[1] >= t else 0 for p in y_pred_p]
            precision = round(metrics.precision_score(y_true=y_test, y_pred=y_pred_c),2)
            recall = round(metrics.recall_score(y_true=y_test, y_pred=y_pred_c),2)
            print()
            print(clf_name, 'Threshold:', t)
            print('\t\tNumber DC: {}, Number classified as DC: {}'.format(sum(y_test), sum(y_pred_c)))
            print('\t\tNumber DC correctly classified: {}'.format(sum([1 if p==1 and t==1 else 0 for p, t in zip(y_pred_c, y_test)])))
            false_positive = [1 if p==1 and t==0 else 0 for p, t in zip(y_pred_c, y_test)]
            print('\t\tNumber False Positives: {}'.format(sum(false_positive)))
            print('\t\tFalsePositiveRate: {}'.format(
                round(
                    sum(false_positive)/(len(y_test)-sum(y_test)),
                    2)))

            print(metrics.classification_report(y_true=y_test, y_pred=y_pred_c))
            print('###########################################')




[1]	valid_0's auc: 0.767916
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's auc: 0.777694
[3]	valid_0's auc: 0.776939
[4]	valid_0's auc: 0.791853
[5]	valid_0's auc: 0.791372
[6]	valid_0's auc: 0.791795
[7]	valid_0's auc: 0.79303
[8]	valid_0's auc: 0.794689
[9]	valid_0's auc: 0.795158
[10]	valid_0's auc: 0.797639
[11]	valid_0's auc: 0.795821
[12]	valid_0's auc: 0.796267
[13]	valid_0's auc: 0.799732
[14]	valid_0's auc: 0.80083
[15]	valid_0's auc: 0.80051
[16]	valid_0's auc: 0.801768
[17]	valid_0's auc: 0.802752
[18]	valid_0's auc: 0.8029
[19]	valid_0's auc: 0.800453
[20]	valid_0's auc: 0.801825
[21]	valid_0's auc: 0.799355
[22]	valid_0's auc: 0.797182
[23]	valid_0's auc: 0.798131
[24]	valid_0's auc: 0.799824
[25]	valid_0's auc: 0.800579
[26]	valid_0's auc: 0.80019
[27]	valid_0's auc: 0.801173
[28]	valid_0's auc: 0.800556
[29]	valid_0's auc: 0.799321
[30]	valid_0's auc: 0.800384
[31]	valid_0's auc: 0.799607
[32]	valid_0's auc: 0.79844
[33]	valid_0's auc: 0.7974

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)



		Number DC: 177, Number classified as DC: 132
		Number DC correctly classified: 78
		Number False Positives: 54
		FalsePositiveRate: 0.22
              precision    recall  f1-score   support

           0       0.66      0.78      0.72       247
           1       0.59      0.44      0.50       177

   micro avg       0.64      0.64      0.64       424
   macro avg       0.63      0.61      0.61       424
weighted avg       0.63      0.64      0.63       424

###########################################

Neural Net Threshold: 0.9
		Number DC: 177, Number classified as DC: 132
		Number DC correctly classified: 78
		Number False Positives: 54
		FalsePositiveRate: 0.22
              precision    recall  f1-score   support

           0       0.66      0.78      0.72       247
           1       0.59      0.44      0.50       177

   micro avg       0.64      0.64      0.64       424
   macro avg       0.63      0.61      0.61       424
weighted avg       0.63      0.64      0.63       4

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


[1]	valid_0's auc: 0.808285
Training until validation scores don't improve for 50 rounds.
[2]	valid_0's auc: 0.79961
[3]	valid_0's auc: 0.807851
[4]	valid_0's auc: 0.805465
[5]	valid_0's auc: 0.803947
[6]	valid_0's auc: 0.80026
[7]	valid_0's auc: 0.797332
[8]	valid_0's auc: 0.796465
[9]	valid_0's auc: 0.791477
[10]	valid_0's auc: 0.793971
[11]	valid_0's auc: 0.796248
[12]	valid_0's auc: 0.78779
[13]	valid_0's auc: 0.788115
[14]	valid_0's auc: 0.784862
[15]	valid_0's auc: 0.783344
[16]	valid_0's auc: 0.782477
[17]	valid_0's auc: 0.78226
[18]	valid_0's auc: 0.781609
[19]	valid_0's auc: 0.785296
[20]	valid_0's auc: 0.784103
[21]	valid_0's auc: 0.783669
[22]	valid_0's auc: 0.776946
[23]	valid_0's auc: 0.776838
[24]	valid_0's auc: 0.775862
[25]	valid_0's auc: 0.775862
[26]	valid_0's auc: 0.776079
[27]	valid_0's auc: 0.775862
[28]	valid_0's auc: 0.77391
[29]	valid_0's auc: 0.772175
[30]	valid_0's auc: 0.770657
[31]	valid_0's auc: 0.769573
[32]	valid_0's auc: 0.768488
[33]	valid_0's auc: 0.76

AdaBoost

AdaBoost Threshold: 0.5
		Number DC: 87, Number classified as DC: 99
		Number DC correctly classified: 74
		Number False Positives: 25
		FalsePositiveRate: 0.47
              precision    recall  f1-score   support

           0       0.68      0.53      0.60        53
           1       0.75      0.85      0.80        87

   micro avg       0.73      0.73      0.73       140
   macro avg       0.72      0.69      0.70       140
weighted avg       0.72      0.73      0.72       140

###########################################

AdaBoost Threshold: 0.6
		Number DC: 87, Number classified as DC: 0
		Number DC correctly classified: 0
		Number False Positives: 0
		FalsePositiveRate: 0.0
              precision    recall  f1-score   support

           0       0.38      1.00      0.55        53
           1       0.00      0.00      0.00        87

   micro avg       0.38      0.38      0.38       140
   macro avg       0.19      0.50      0.27       140
weighted avg       0.14     

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)



###########################################

LightGBM Threshold: 0.6
		Number DC: 130, Number classified as DC: 130
		Number DC correctly classified: 103
		Number False Positives: 27
		FalsePositiveRate: 0.31
              precision    recall  f1-score   support

           0       0.69      0.69      0.69        87
           1       0.79      0.79      0.79       130

   micro avg       0.75      0.75      0.75       217
   macro avg       0.74      0.74      0.74       217
weighted avg       0.75      0.75      0.75       217

###########################################

LightGBM Threshold: 0.7
		Number DC: 130, Number classified as DC: 93
		Number DC correctly classified: 78
		Number False Positives: 15
		FalsePositiveRate: 0.17
              precision    recall  f1-score   support

           0       0.58      0.83      0.68        87
           1       0.84      0.60      0.70       130

   micro avg       0.69      0.69      0.69       217
   macro avg       0.71      0.71     

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


AdaBoost

AdaBoost Threshold: 0.5
		Number DC: 130, Number classified as DC: 139
		Number DC correctly classified: 111
		Number False Positives: 28
		FalsePositiveRate: 0.32
              precision    recall  f1-score   support

           0       0.76      0.68      0.72        87
           1       0.80      0.85      0.83       130

   micro avg       0.78      0.78      0.78       217
   macro avg       0.78      0.77      0.77       217
weighted avg       0.78      0.78      0.78       217

###########################################

AdaBoost Threshold: 0.6
		Number DC: 130, Number classified as DC: 2
		Number DC correctly classified: 1
		Number False Positives: 1
		FalsePositiveRate: 0.01
              precision    recall  f1-score   support

           0       0.40      0.99      0.57        87
           1       0.50      0.01      0.02       130

   micro avg       0.40      0.40      0.40       217
   macro avg       0.45      0.50      0.29       217
weighted avg       0.46

  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


# Narrowing down to the most promising models: 

1. LightGBM (0.6 - 0.9 threshold)
    * Precision: 0.89, 0.70, 1.00,
    * Recall: 0.27, 0.98, 0.18
    * False positive rate: 0.02, 0.68, 0.00


2. Random Forest (0.9 threshold)
    * Precision: 0.81, 0.96, 0.97
    * Recall:    0.30, 0.21, 0.16
    * False positive rate: 0.11, 0.01, 0.01

### Other

Models performed best when training on only records RPV labels as "disconnected" or "disconnected-70" or only "disconnected"

# Hyperparameterization Gridsearch - Random Forest

In [109]:
from sklearn.model_selection import RandomizedSearchCV

In [162]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 120, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 7, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4, 7]
# Method of selecting samples for training each tree
bootstrap = [True, False]

In [163]:
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [164]:
def evaluate(model, test_features, test_labels):
    predictions = model.predict(test_features)
    
    precision = metrics.precision_score(y_true=test_labels, y_pred=predictions) 
    
    thresholds = [0.5, 0.6, 0.7, 0.8, 0.9]
    for t in thresholds:
        pred_p = model.predict_proba(test_features)
        pred_classes = [1 if p[1] >= t else 0 for p in pred_p]
        
        precision = round(metrics.precision_score(y_true=test_labels, y_pred=pred_classes),2)
        recall    = round(metrics.recall_score(y_true=test_labels, y_pred=pred_classes),2)
        num_false_positives = sum([1 if p==1 and t==0 else 0 for p, t in zip(pred_classes, test_labels)])
        fpr = round(1.0*num_false_positives/(len(test_labels)-sum(test_labels)),2)
        print('Threshold:', t)
        print('\tPrecision:', precision)
        print('\tRecall:', recall)
        print('\tNumFalsePositives:', num_false_positives)
        print('\tFalsePositiveRate:', fpr)

### Part 1: dc only

In [165]:
# Data for ONLY RPV DISCONNECTED

X = data['dc']['X']
y = data['dc']['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [166]:
rf = RandomForestClassifier()

rf_random = RandomizedSearchCV(estimator=rf,
                               param_distributions=random_grid, 
                               n_iter = 100, 
                               cv = 3, 
                               verbose=2, 
                               random_state=42, 
                               n_jobs = -1)

rf_random.fit(X_train, y_train)

rf_random.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   10.4s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   35.2s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  1.1min finished


{'n_estimators': 1000,
 'min_samples_split': 10,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 32,
 'bootstrap': False}

### Base RandomForest Model - DC only

In [167]:
base_model = RandomForestClassifier(n_estimators = 10, random_state = 42)
base_model.fit(X_train, y_train)
base_accuracy = evaluate(base_model, X_test, y_test)

Threshold: 0.5
	Precision: 0.78
	Recall: 0.8
	NumFalsePositives: 19
	FalsePositiveRate: 0.35
Threshold: 0.6
	Precision: 0.78
	Recall: 0.73
	NumFalsePositives: 17
	FalsePositiveRate: 0.31
Threshold: 0.7
	Precision: 0.79
	Recall: 0.67
	NumFalsePositives: 15
	FalsePositiveRate: 0.27
Threshold: 0.8
	Precision: 0.78
	Recall: 0.51
	NumFalsePositives: 12
	FalsePositiveRate: 0.22
Threshold: 0.9
	Precision: 0.83
	Recall: 0.41
	NumFalsePositives: 7
	FalsePositiveRate: 0.13


### RandomSearch RandomForest Model - DC only

In [168]:
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, X_test, y_test)

Threshold: 0.5
	Precision: 0.78
	Recall: 0.82
	NumFalsePositives: 20
	FalsePositiveRate: 0.36
Threshold: 0.6
	Precision: 0.8
	Recall: 0.75
	NumFalsePositives: 16
	FalsePositiveRate: 0.29
Threshold: 0.7
	Precision: 0.81
	Recall: 0.66
	NumFalsePositives: 13
	FalsePositiveRate: 0.24
Threshold: 0.8
	Precision: 0.9
	Recall: 0.45
	NumFalsePositives: 4
	FalsePositiveRate: 0.07
Threshold: 0.9
	Precision: 1.0
	Recall: 0.18
	NumFalsePositives: 0
	FalsePositiveRate: 0.0


### Comparison - RPV "DISCONNECTED" ONLY

The RandomSearch RandomForest Model performed much better than the base model--with multiple thresholds yielding over 90% precision and false positive rates under 15%

# Part 2: dc and dc70

In [169]:
# Data for RPV DISCONNECTED and DISCONNECTED-70

X = data['dc70']['X']
y = data['dc70']['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [170]:
rf = RandomForestClassifier()

rf_random = RandomizedSearchCV(estimator=rf,
                               param_distributions=random_grid, 
                               n_iter = 100, 
                               cv = 3, 
                               verbose=2, 
                               random_state=42, 
                               n_jobs = -1)

rf_random.fit(X_train, y_train)

rf_random.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    9.7s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   38.9s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  1.2min finished


{'n_estimators': 1600,
 'min_samples_split': 10,
 'min_samples_leaf': 1,
 'max_features': 'auto',
 'max_depth': 10,
 'bootstrap': True}

### Base RandomForest Model - DC and DC70

In [171]:
base_model = RandomForestClassifier(n_estimators = 10, random_state = 42)
base_model.fit(X_train, y_train)
base_accuracy = evaluate(base_model, X_test, y_test)

Threshold: 0.5
	Precision: 0.7
	Recall: 0.82
	NumFalsePositives: 42
	FalsePositiveRate: 0.42
Threshold: 0.6
	Precision: 0.7
	Recall: 0.75
	NumFalsePositives: 38
	FalsePositiveRate: 0.38
Threshold: 0.7
	Precision: 0.74
	Recall: 0.66
	NumFalsePositives: 28
	FalsePositiveRate: 0.28
Threshold: 0.8
	Precision: 0.76
	Recall: 0.55
	NumFalsePositives: 21
	FalsePositiveRate: 0.21
Threshold: 0.9
	Precision: 0.78
	Recall: 0.39
	NumFalsePositives: 13
	FalsePositiveRate: 0.13


### RandomSearch RandomForest Model - DC and DC70

In [172]:
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, X_test, y_test)

Threshold: 0.5
	Precision: 0.7
	Recall: 0.8
	NumFalsePositives: 40
	FalsePositiveRate: 0.4
Threshold: 0.6
	Precision: 0.74
	Recall: 0.66
	NumFalsePositives: 28
	FalsePositiveRate: 0.28
Threshold: 0.7
	Precision: 0.78
	Recall: 0.54
	NumFalsePositives: 18
	FalsePositiveRate: 0.18
Threshold: 0.8
	Precision: 0.9
	Recall: 0.4
	NumFalsePositives: 5
	FalsePositiveRate: 0.05
Threshold: 0.9
	Precision: 1.0
	Recall: 0.15
	NumFalsePositives: 0
	FalsePositiveRate: 0.0


# Part 3: Trying All Data Again

In [173]:
# All data

X = data['all']['X']
y = data['all']['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [174]:
rf = RandomForestClassifier()

rf_random = RandomizedSearchCV(estimator=rf,
                               param_distributions=random_grid, 
                               n_iter = 100, 
                               cv = 3, 
                               verbose=2, 
                               random_state=42, 
                               n_jobs = -1)

rf_random.fit(X_train, y_train)

rf_random.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:   11.9s
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed:   52.3s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:  1.6min finished


{'n_estimators': 800,
 'min_samples_split': 10,
 'min_samples_leaf': 2,
 'max_features': 'sqrt',
 'max_depth': 43,
 'bootstrap': True}

### Base RandomForest Model - all

In [175]:
base_model = RandomForestClassifier(n_estimators = 10, random_state = 42)
base_model.fit(X_train, y_train)
base_accuracy = evaluate(base_model, X_test, y_test)

Threshold: 0.5
	Precision: 0.68
	Recall: 0.65
	NumFalsePositives: 60
	FalsePositiveRate: 0.26
Threshold: 0.6
	Precision: 0.71
	Recall: 0.56
	NumFalsePositives: 45
	FalsePositiveRate: 0.2
Threshold: 0.7
	Precision: 0.76
	Recall: 0.48
	NumFalsePositives: 29
	FalsePositiveRate: 0.13
Threshold: 0.8
	Precision: 0.82
	Recall: 0.39
	NumFalsePositives: 17
	FalsePositiveRate: 0.07
Threshold: 0.9
	Precision: 0.87
	Recall: 0.28
	NumFalsePositives: 8
	FalsePositiveRate: 0.03


### RandomSearch RandomForest Model - all

In [176]:
best_random = rf_random.best_estimator_
random_accuracy = evaluate(best_random, X_test, y_test)

Threshold: 0.5
	Precision: 0.75
	Recall: 0.63
	NumFalsePositives: 42
	FalsePositiveRate: 0.18
Threshold: 0.6
	Precision: 0.84
	Recall: 0.51
	NumFalsePositives: 19
	FalsePositiveRate: 0.08
Threshold: 0.7
	Precision: 0.86
	Recall: 0.36
	NumFalsePositives: 11
	FalsePositiveRate: 0.05
Threshold: 0.8
	Precision: 0.96
	Recall: 0.23
	NumFalsePositives: 2
	FalsePositiveRate: 0.01
Threshold: 0.9
	Precision: 0.92
	Recall: 0.06
	NumFalsePositives: 1
	FalsePositiveRate: 0.0


# LIGHTGBM

In [179]:
param_grid = {
    'learning_rate': [.01, .05, .1, .15, .2, 1],
    'n_estimators': [21, 31, 51, 81, 151],
    'max_depth': [int(x) for x in np.linspace(10, 120, num = 11)],
    'bagging_fraction': [1.0, 0.9, 0.8],
    'pos_bagging_fraction': [1.0, 0.9, 0.8],
    'neg_bagging_fraction': [1.0, 0.9, 0.8],
    'bagging_freq': [0, 3, 5, 10],
    'feature_fraction': [1.0, 0.9, 0.8]
}

### Part 1 - DC only

In [180]:
# Data for ONLY RPV DISCONNECTED

X = data['dc']['X']
y = data['dc']['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [181]:
m = lgb.LGBMClassifier()

m_random = RandomizedSearchCV(estimator=m,
                               param_distributions=param_grid, 
                               n_iter = 100, 
                               cv = 3, 
                               verbose=2, 
                               random_state=42, 
                               n_jobs = -1)
m_random.fit(X_train, y_train)

m_random.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  37 tasks      | elapsed:    2.7s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    5.3s finished


{'pos_bagging_fraction': 0.8,
 'neg_bagging_fraction': 0.9,
 'n_estimators': 31,
 'max_depth': 54,
 'learning_rate': 0.05,
 'feature_fraction': 1.0,
 'bagging_freq': 5,
 'bagging_fraction': 0.8}

### Base - DC only

In [182]:
base_model = lgb.LGBMClassifier()
base_model.fit(X_train, y_train)
base_accuracy = evaluate(base_model, X_test, y_test)

Threshold: 0.5
	Precision: 0.74
	Recall: 0.87
	NumFalsePositives: 25
	FalsePositiveRate: 0.44
Threshold: 0.6
	Precision: 0.76
	Recall: 0.82
	NumFalsePositives: 21
	FalsePositiveRate: 0.37
Threshold: 0.7
	Precision: 0.81
	Recall: 0.75
	NumFalsePositives: 15
	FalsePositiveRate: 0.26
Threshold: 0.8
	Precision: 0.84
	Recall: 0.63
	NumFalsePositives: 10
	FalsePositiveRate: 0.18
Threshold: 0.9
	Precision: 0.94
	Recall: 0.36
	NumFalsePositives: 2
	FalsePositiveRate: 0.04


### LightGBM RandomSearch - DC Only

In [183]:
best_random = m_random.best_estimator_
random_accuracy = evaluate(best_random, X_test, y_test)

Threshold: 0.5
	Precision: 0.75
	Recall: 0.93
	NumFalsePositives: 26
	FalsePositiveRate: 0.46
Threshold: 0.6
	Precision: 0.77
	Recall: 0.83
	NumFalsePositives: 21
	FalsePositiveRate: 0.37
Threshold: 0.7
	Precision: 0.81
	Recall: 0.69
	NumFalsePositives: 13
	FalsePositiveRate: 0.23
Threshold: 0.8
	Precision: 0.84
	Recall: 0.37
	NumFalsePositives: 6
	FalsePositiveRate: 0.11
Threshold: 0.9
	Precision: 0.0
	Recall: 0.0
	NumFalsePositives: 0
	FalsePositiveRate: 0.0


  'precision', 'predicted', average, warn_for)


### Part 2 - DC and DC70 only

In [184]:
# Data for ONLY RPV DISCONNECTED

X = data['dc70']['X']
y = data['dc70']['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

m = lgb.LGBMClassifier()

m_random = RandomizedSearchCV(estimator=m,
                               param_distributions=param_grid, 
                               n_iter = 100, 
                               cv = 3, 
                               verbose=2, 
                               random_state=42, 
                               n_jobs = -1)
m_random.fit(X_train, y_train)

m_random.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  36 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    5.8s finished


{'pos_bagging_fraction': 0.9,
 'neg_bagging_fraction': 0.9,
 'n_estimators': 51,
 'max_depth': 43,
 'learning_rate': 0.05,
 'feature_fraction': 1.0,
 'bagging_freq': 0,
 'bagging_fraction': 0.8}

### LightGBM Base - DC + DC70 only

In [185]:
base_model = lgb.LGBMClassifier()
base_model.fit(X_train, y_train)
base_accuracy = evaluate(base_model, X_test, y_test)

Threshold: 0.5
	Precision: 0.81
	Recall: 0.8
	NumFalsePositives: 23
	FalsePositiveRate: 0.25
Threshold: 0.6
	Precision: 0.83
	Recall: 0.76
	NumFalsePositives: 19
	FalsePositiveRate: 0.2
Threshold: 0.7
	Precision: 0.85
	Recall: 0.65
	NumFalsePositives: 14
	FalsePositiveRate: 0.15
Threshold: 0.8
	Precision: 0.88
	Recall: 0.52
	NumFalsePositives: 9
	FalsePositiveRate: 0.1
Threshold: 0.9
	Precision: 0.91
	Recall: 0.31
	NumFalsePositives: 4
	FalsePositiveRate: 0.04


### LightGBM RandomSearch - DC + DC70 only

In [186]:
best_random = m_random.best_estimator_
random_accuracy = evaluate(best_random, X_test, y_test)

Threshold: 0.5
	Precision: 0.81
	Recall: 0.76
	NumFalsePositives: 22
	FalsePositiveRate: 0.24
Threshold: 0.6
	Precision: 0.81
	Recall: 0.68
	NumFalsePositives: 20
	FalsePositiveRate: 0.22
Threshold: 0.7
	Precision: 0.84
	Recall: 0.62
	NumFalsePositives: 15
	FalsePositiveRate: 0.16
Threshold: 0.8
	Precision: 0.89
	Recall: 0.4
	NumFalsePositives: 6
	FalsePositiveRate: 0.06
Threshold: 0.9
	Precision: 0.96
	Recall: 0.18
	NumFalsePositives: 1
	FalsePositiveRate: 0.01


### Part 3 - LightGBM - all

In [187]:
# Data for ONLY RPV DISCONNECTED

X = data['all']['X']
y = data['all']['y']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

m = lgb.LGBMClassifier()

m_random = RandomizedSearchCV(estimator=m,
                               param_distributions=param_grid, 
                               n_iter = 100, 
                               cv = 3, 
                               verbose=2, 
                               random_state=42, 
                               n_jobs = -1)
m_random.fit(X_train, y_train)

m_random.best_params_

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:    2.9s
[Parallel(n_jobs=-1)]: Done 293 out of 300 | elapsed:    7.1s remaining:    0.1s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    7.2s finished


{'pos_bagging_fraction': 0.9,
 'neg_bagging_fraction': 0.9,
 'n_estimators': 31,
 'max_depth': 54,
 'learning_rate': 0.1,
 'feature_fraction': 0.8,
 'bagging_freq': 3,
 'bagging_fraction': 0.8}

### LightGBM Base Model - All

In [188]:
base_model = lgb.LGBMClassifier()
base_model.fit(X_train, y_train)
base_accuracy = evaluate(base_model, X_test, y_test)

Threshold: 0.5
	Precision: 0.72
	Recall: 0.55
	NumFalsePositives: 41
	FalsePositiveRate: 0.18
Threshold: 0.6
	Precision: 0.76
	Recall: 0.51
	NumFalsePositives: 32
	FalsePositiveRate: 0.14
Threshold: 0.7
	Precision: 0.8
	Recall: 0.44
	NumFalsePositives: 21
	FalsePositiveRate: 0.09
Threshold: 0.8
	Precision: 0.83
	Recall: 0.3
	NumFalsePositives: 12
	FalsePositiveRate: 0.05
Threshold: 0.9
	Precision: 0.87
	Recall: 0.17
	NumFalsePositives: 5
	FalsePositiveRate: 0.02


### LightGBM RandomSearch - All

In [189]:
best_random = m_random.best_estimator_
random_accuracy = evaluate(best_random, X_test, y_test)

Threshold: 0.5
	Precision: 0.75
	Recall: 0.57
	NumFalsePositives: 37
	FalsePositiveRate: 0.16
Threshold: 0.6
	Precision: 0.8
	Recall: 0.49
	NumFalsePositives: 25
	FalsePositiveRate: 0.11
Threshold: 0.7
	Precision: 0.86
	Recall: 0.41
	NumFalsePositives: 13
	FalsePositiveRate: 0.06
Threshold: 0.8
	Precision: 0.87
	Recall: 0.24
	NumFalsePositives: 7
	FalsePositiveRate: 0.03
Threshold: 0.9
	Precision: 0.88
	Recall: 0.11
	NumFalsePositives: 3
	FalsePositiveRate: 0.01


In [191]:
Xy['rpv_status'].value_counts()

1    678
4    464
3    259
0      9
2      2
Name: rpv_status, dtype: int64

In [192]:
len(Xy[Xy['rpv_status']>=3])

723

In [193]:
len(Xy)

1412

In [197]:

y = data['all']['y']

n_dc = sum(y)

n_dc

632

In [198]:
rpv_archive.head()

Unnamed: 0,phone,status,error_text,iscell,carrier,date_checked
0,2012000318,connected,,N,Comcast of MD,2019-08-29
1,2012002626,connected,,N,MCImetro Former MCI,2019-08-29
2,2012040004,connected,,N,Level 3,2019-08-29
3,2012071052,connected,,Y,Verizon Wireless,2019-08-29
4,2012074846,connected,,Y,Verizon Wireless,2019-08-29


In [200]:
rpv_archive['status'].value_counts() / len(rpv_archive)

connected          0.815012
disconnected       0.107565
disconnected-70    0.064618
connected-75       0.008865
invalid-phone      0.003940
Name: status, dtype: float64