In [146]:
import os
import re
import pandas as pd
import networkx as nx
import numpy as np
import json
import matplotlib.pyplot as plt
from pandas.io.json import json_normalize

In [147]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.filterwarnings('ignore')

## Pre-processing

In [148]:
#find all JSON files in 'data_aggregation' & add name list json_files
FILE_PATH = './data_aggregation'
files = os.listdir(FILE_PATH)
json_files = [x for x in files if x[-5:]=='.json'] # regex: _\d{4}.json

In [149]:
#iterate through all JSON files, add data to master_df, drop cols, & remove duplicates

master_df = pd.DataFrame()

for i in json_files:
    with open(f"{FILE_PATH}/{i}",'r') as f:
        data = json.load(f)
        temp_df = json_normalize(data['result'])        
        master_df = master_df.append(temp_df, sort=True, ignore_index=True)
        
master_df_sub = master_df[['amount', 'client.gvkey', 'client.legal_name', 'has_amendments', 'id',
       'is_latest_amendment', 'registrant', 'specific_issues', 'year']]

master_df_sub.sort_values(by=['id','has_amendments','is_latest_amendment'],inplace=True, na_position='first')

master_df_sub.drop_duplicates(subset=['amount','client.gvkey','client.legal_name',
                                              'id','registrant','year'], keep='last', inplace=True)

master_df_sub.drop(columns=['has_amendments','is_latest_amendment'],inplace=True)

In [150]:
def get_sponsors(issues):
    """
    parameters: expects the 'specific_issues' column from master_df_sub.
    
    description: finds sponsor names in 'specific issues', if available, and adds them to a list.
    
    returns: a list of sponsor names.
    """
    spons = []
    try:
        if 'bills_by_algo' in list(issues[0].keys()):
            for sponsors in issues[0]['bills_by_algo']:
                lastname = sponsors['sponsor']['lastname']
                firstname = sponsors['sponsor']['firstname']
                name = lastname + ", " + firstname
                party = sponsors['sponsor']['party']
#                 spons.append((name, party)) #may want to switch back to nested list for processing (?)
                spons.append(name)
    except:
        pass
    
    return spons

In [151]:
master_df_sub['sponsors'] = master_df_sub.apply(lambda x: get_sponsors(x['specific_issues']), axis=1)

In [152]:
def get_lobbyists(issues):
    """
    parameters: expects the 'specific_issues' column from master_df_sub.
    
    description: finds lobbyist names in 'specific issues', if available, and adds them to a list.
    
    returns: a list of lobbyist names.
    """
    lobbs = []
    try:
        if 'lobbyists' in issues[0].keys():
            lobbs = issues[0]['lobbyists']
    except:
        pass
    
    return lobbs

In [153]:
master_df_sub['lobbyists'] = master_df_sub.apply(lambda x: get_lobbyists(x['specific_issues']), axis=1)

In [154]:
import itertools

def get_combos(sponsors, lobbyists):
    """
    parameters: expects two parameters, the 'sponsors' column from master_df_sub and the 'lobbyists' column.
    
    description: calculates the Cartesian product between the two lists.
    
    returns: a list of tuples of every possible pairing between a sponsor and a lobbyist.
    """
    combos = list(itertools.product(sponsors, lobbyists))
    
    return combos

In [155]:
master_df_sub['sponsor_lobbyist_combos'] = master_df_sub.apply(lambda x: get_combos(x['sponsors'],x['lobbyists']), axis=1)

In [156]:
master_df_sub.head()

Unnamed: 0,amount,client.gvkey,client.legal_name,id,registrant,specific_issues,year,sponsors,lobbyists,sponsor_lobbyist_combos
499,30000.0,12141,Microsoft Corp,0015DE1E-7D20-4A9F-A450-D2EE176A452F,"The Gibson Group, LLC",[{'text': 'H1-B Visas. Issues relating to high...,2015,[],"[Gibson, Joseph]",[]
500,220000.0,12141,Microsoft Corp,0016DDE7-E7C1-45F1-A73C-C98887F16EF9,"Barbour Griffith & Rogers, LLC d/b/a BGR Holding","[{'text': '', 'gov_entities': ['U.S. House of ...",2001,[],"[Barbour, Reeves Haley Sr, Griffith, Go Jr, Mo...",[]
502,40000.0,12141,Microsoft Corp,002625A4-7560-4F27-A87C-3E0791FFB55A,"Aduston Consulting, LLC",[{'text': 'H.R. 1736 Investment in America Act...,2006,"[Johnson, Nancy, Goodlatte, Bob, Thomas, Willi...","[Gallant, Karl]","[(Johnson, Nancy, Gallant, Karl), (Goodlatte, ..."
503,180000.0,12141,Microsoft Corp,0048ECB8-4194-4927-83FF-8BFBDD0F09FA,K&L GATES LLP,"[{'text': '', 'gov_entities': ['U.S. Senate'],...",2006,[],"[Carnevale, Amy, Gorton, Slade, Punke, Timothy...",[]
501,60000.0,12141,Microsoft Corp,006D68DB-897F-4DEB-A086-6AF6D9758C8B,FOLEY HOAG LLP,[{'text': 'Patient Protection and Affordable C...,2009,"[Hastert, J., Renzi, Rick, Schwarz, John, Stab...","[Childress, Kelly, Kim, Paul, Larsson, Maia]","[(Hastert, J., Childress, Kelly), (Hastert, J...."


In [157]:
#create a df with a column for (individual) sponsor and a column for (individual) lobbyist
combos = master_df_sub[master_df_sub['sponsor_lobbyist_combos'].str.len()>0]['sponsor_lobbyist_combos']

In [158]:
politician_lists = master_df_sub[master_df_sub['sponsors'].str.len()>0]['sponsors']
lobbyist_lists = master_df_sub[master_df_sub['lobbyists'].str.len()>0]['lobbyists']

In [159]:
all_politicians = [x for i in politician_lists for x in i]
all_lobbyists = [x for i in lobbyist_lists for x in i]

In [160]:
# master_df_sub[master_df_sub['client.legal_name'] == 'Time Warner Inc']

## Create Graph

In [161]:
G=nx.Graph()
G.add_edges_from([x for i in combos for x in i]) #add all nodes & edges at once

In [162]:
print(nx.info(G))

Name: 
Type: Graph
Number of nodes: 374
Number of edges: 1853
Average degree:   9.9091


In [163]:
# get all existing edges from graph
all_graph_edges = list(G.edges())

# get all non-existing edges from graph
targets = nx.non_edges(G)

In [164]:
targets = nx.non_edges(G)

In [165]:
len(list(targets))

67898

In [166]:
total_edges = all_graph_edges + list(targets)

In [167]:
len(total_edges)

1853

In [168]:
# list(targets)

In [169]:
# def get_graph_features(G, list_of_edges):
    
#     # get common neighbors for nodes
#     common_neighbors = [len(list(nx.common_neighbors(G, edge[0], edge[1]))) for edge in all_graph_edges]

In [170]:
# shortest_paths = []

# for edge in all_graph_edges:
#     path = nx.shortest_path_length(G, edge[0], edge[1])
#     shortest_paths.append(path)

In [171]:
# jaccard_coefficient = [item[2] for item in list(nx.jaccard_coefficient(G, ebunch=all_graph_edges))]

In [172]:
Time_Warner = master_df_sub[master_df_sub['client.legal_name'] == 'Time Warner Inc']

In [173]:
combos_year = Time_Warner[['year', 'sponsor_lobbyist_combos']].sort_values(by = ['year'])

In [174]:
existing_combos = combos_year[combos_year['year'] < 2013]
future_combos = combos_year[combos_year['year'] > 2012]

In [175]:
# TW_existing

In [176]:
# all existing lobbyist, politician pairs
TW_existing = [x for i in existing_combos['sponsor_lobbyist_combos'] for x in i]
# all future lobbyist, politician pairs (may include nodes not in "existing" graph)
TW_future = [x for i in future_combos['sponsor_lobbyist_combos'] for x in i]

In [177]:
TW_current_graph = nx.Graph()
TW_current_graph.add_edges_from(TW_existing) #add all nodes & edges at once

In [178]:
# TW_existing

In [179]:
TW_future_graph = nx.Graph()
TW_future_graph.add_edges_from(TW_future) #add all nodes & edges at once

In [180]:
# all possible politicians and lobbyists at time graph was made
TW_current_graph_nodes = list(TW_current_graph.nodes())

# all possible politicians and lobbyists for future time graph
TW_future_graph_nodes = list(TW_future_graph.nodes())

In [181]:
# non_existent edges at time graph is made
TW_targets = list(nx.non_edges(TW_current_graph))

In [182]:
len(TW_targets)

0

In [183]:
for pair in TW_targets:
    if pair[0] in all_politicians and pair[1] in all_politicians:
        TW_targets.remove(pair)
    if pair[0] in all_lobbyists and pair[1] in all_lobbyists:
        TW_targets.remove(pair)
    else:
        continue

In [184]:
len(TW_targets)

0

In [185]:
future_classifications = []

future = 0
not_future = 0
ignore = 0

# for lobbyist, politican not connected
for pair in TW_targets:
    #check if lobbyist and politician, independently are in future graph list
    if pair[0] in TW_future_graph_nodes or pair[1] in TW_future_graph_nodes:
        #if yes, check if lobbyist and politician are a pair in future
        if pair in TW_future:
            future +=1
            future_classifications.append([pair, 1])
        else:
            future_classifications.append([pair, 0])
            not_future +=1
    else:
        ignore +=1
        continue
        
print(future, not_future, ignore)
# print(future_classifications)

0 0 0


In [186]:
# future_classifications = []

# future = 0
# not_future = 0
# ignore = 0

# # # for lobbyist, politican connection in the future
# for pair in TW_future:
#     #check if lobbyist and politician are in original graph (past)
#     if pair[0] in TW_current_graph_nodes and pair[1] in TW_current_graph_nodes:
#         #if yes, check if lobbyist and politician were a non-exist pair in original graph (past)
#         if pair in TW_targets:
#             future +=1
#             future_classifications.append([pair, 1])
#         else:
#             not_future +=1
#             future_classifications.append([pair, 0])
#     else:
#         ignore +=1
#         continue
        
# print(future, not_future, ignore)

In [187]:
def prediction_function(cutoff_year):
    
    # create graph for entire dataset
    yearly_data = master_df_sub[['year', 'sponsor_lobbyist_combos']].sort_values(by = ['year'])
    total_existing = [x for i in yearly_data['sponsor_lobbyist_combos'] for x in i]
    total_graph = nx.Graph()
    total_graph.add_edges_from(total_existing) #add all nodes & edges at once
    
    #create two databases, one for current year only, and other for every year after
    existing_combos = yearly_data[yearly_data['year'] == cutoff_year]
    future_combos = yearly_data[yearly_data['year'] > cutoff_year]
    
    # get lists of every lobbyist and politician
    politician_lists = master_df_sub[master_df_sub['sponsors'].str.len()>0]['sponsors']
    lobbyist_lists = master_df_sub[master_df_sub['lobbyists'].str.len()>0]['lobbyists']
    
    # get final list of each unique politician and lobbyist
    all_politicians = [x for i in politician_lists for x in i]
    all_lobbyists = [x for i in lobbyist_lists for x in i]
    
    # all existing lobbyist, politician pairs
    all_existing = [x for i in existing_combos['sponsor_lobbyist_combos'] for x in i]
    # all future lobbyist, politician pairs (may include nodes not in "existing" graph)
    all_future = [x for i in future_combos['sponsor_lobbyist_combos'] for x in i]
    
    # graph for cutoff_year
    yearly_current_graph = nx.Graph()
    yearly_current_graph.add_edges_from(all_existing) #add all nodes & edges at once
    
    #graph for future years
    yearly_future_graph = nx.Graph()
    yearly_future_graph.add_edges_from(all_future) #add all nodes & edges at once
    
    # all possible politicians and lobbyists at time graph was made
    yearly_current_graph_nodes = list(yearly_current_graph.nodes())

    # all possible politicians and lobbyists for future time graph
    yearly_future_graph_nodes = list(yearly_future_graph.nodes())
    
    # non_existent edges at time graph is made
    future_targets = list(nx.non_edges(yearly_current_graph))
    
    print('number of possible edges, before editing: ' + str(len(future_targets)))
    
    # get rid of non-existent edges which contain two lobbyists or two politicians 
    for pair in future_targets:
        if pair[0] in all_politicians and pair[1] in all_politicians:
            try:
                future_targets.remove(pair)
            except:
                continue
        if pair[0] in all_lobbyists and pair[1] in all_lobbyists:
            try:
                future_targets.remove(pair)
            except:
                continue
        else:
            continue

    # create a list of node pairs with connection label, 1 for future connection, 0 for no connection
    future_classifications = []
    
    all_existing

    future = 0
    not_future = 0

    print('number of possible edges, after editing: ' + str(len(future_targets)))
    
    # for lobbyist, politican not connected
    for pair in future_targets:
        #check if lobbyist and politician, independently are in future graph nodes
        if pair[0] in yearly_future_graph_nodes or pair[1] in yearly_future_graph_nodes:
            #if yes, check if lobbyist and politician are a pair in future pairs
            if pair in all_future:
                future +=1
                future_classifications.append([pair, 1])
            else:
                future_classifications.append([pair, 0])
                not_future +=1
        else:
            future_classifications.append([pair, 0])
            not_future +=1
            continue

    print('number of succesful connections: ' + str(future) + '\n' +  'number of unsuccesful connections: ' + str(not_future))
    
#     print(future_classifications)
    
    return total_graph, future_classifications

In [188]:
predicitions_2011 = prediction_function(2011)

number of possible edges, before editing: 1224
number of possible edges, after editing: 774
number of succesful connections: 4
number of unsuccesful connections: 770


'\nnumber of possible edges, before editing: 56601\nnumber of possible edges, after editing: 36359\nnumber of succesful connections: 516\nnumber of unsuccesful connections: 35843\n'

In [189]:
def get_graph_features(all_data_graph, list_of_edges):
    pairs = []
    connections = []
    
    # create dataframe with 
    edges_df = pd.DataFrame(columns= ['pairs', 'connection'])
    for pair in list_of_edges:
        pairs.append(pair[0])
        connections.append(pair[1])
    edges_df["pairs"] = pairs
    edges_df["connection"] = connections
        
    # get common neighbors for nodes
    common_neighbors = [len(list(nx.common_neighbors(all_data_graph, edge[0], edge[1]))) for edge in edges_df.pairs]
    
    # get resource allocation index
    resource_allocation_index = [item[2] for item in list(nx.resource_allocation_index(all_data_graph, ebunch = edges_df.pairs))]
    
    # get jaccard coefficent 
    jaccard_coefficient = [item[2] for item in list(nx.jaccard_coefficient(all_data_graph, ebunch=edges_df.pairs))]
   
    # get shortest paths
    shortest_paths = []
    for edge in edges_df.pairs:
        path = nx.shortest_path_length(all_data_graph, edge[0], edge[1])
        shortest_paths.append(path)
    
    # get preferential treatment
    preferential_attachment = [item[2] for item in list(nx.preferential_attachment(all_data_graph, ebunch = edges_df.pairs))]
     
    edges_df['Common Neighbors'] = common_neighbors
    edges_df['Jaccard Coefficient'] = jaccard_coefficient
    edges_df['Resource Allocation Index'] = resource_allocation_index
    edges_df['Preferential Attachment'] = preferential_attachment
    edges_df['Shortest Paths'] = shortest_paths
    
    edges_df.set_index('pairs', inplace = True)
    
    return edges_df

In [190]:
get_graph_features(predicitions_2011[0], predicitions_2011[1])

Unnamed: 0_level_0,connection,Common Neighbors,Jaccard Coefficient,Resource Allocation Index,Preferential Attachment,Shortest Paths
pairs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"(Sampson, John, Gelman, Matt)",0,60,0.833333,4.872675,4320,2
"(Sampson, John, Moran, James)",0,0,0.000000,0.000000,180,3
"(Sampson, John, Humphries, Fred)",0,38,0.575758,2.758294,2640,2
"(Sampson, John, Grab, Francis)",0,1,0.016393,0.034483,120,2
"(Sampson, John, Garrett-Nelson, Labrenda)",0,3,0.046875,0.168870,420,2
...,...,...,...,...,...,...
"(Reid, Harry, Giordano, Nick)",0,0,0.000000,0.000000,675,3
"(Reid, Harry, Grab, Francis)",0,0,0.000000,0.000000,90,3
"(Giordano, Nick, Smith, Lamar)",0,0,0.000000,0.000000,465,1
"(Howell, Andrew, Smith, Lamar)",0,0,0.000000,0.000000,837,3


# Predictive Models

In [191]:
# predictions_2010 = prediction_function(2010)
# get_graph_features(predictions_2010[0], predictions_2010[1])
# """
# number of possible edges, before editing: 54532
# number of possible edges, after editing: 35528
# number of succesful connections: 375
# number of unsuccesful connections: 35153
# """

In [192]:
predictions_2011 = prediction_function(2011)
model_df = get_graph_features(predictions_2011[0], predictions_2011[1])
"""
number of possible edges, before editing: 56601
number of possible edges, after editing: 36359
number of succesful connections: 516
number of unsuccesful connections: 35843
"""

number of possible edges, before editing: 1224
number of possible edges, after editing: 774
number of succesful connections: 4
number of unsuccesful connections: 770


'\nnumber of possible edges, before editing: 56601\nnumber of possible edges, after editing: 36359\nnumber of succesful connections: 516\nnumber of unsuccesful connections: 35843\n'

In [193]:
# predictions_2012 = prediction_function(2012)
# get_graph_features(predictions_2012[0], predictions_2012[1])
# """
# number of possible edges, before editing: 74121
# number of possible edges, after editing: 47483
# number of succesful connections: 496
# number of unsuccesful connections: 46987
# """

In [194]:
# predictions_2013 = prediction_function(2013)
# get_graph_features(predictions_2013[0], predictions_2013[1])
# """
# number of possible edges, before editing: 55865
# number of possible edges, after editing: 35734
# number of succesful connections: 433
# number of unsuccesful connections: 35301
# """

### Pre-processing

In [195]:
model_df.head()

Unnamed: 0_level_0,connection,Common Neighbors,Jaccard Coefficient,Resource Allocation Index,Preferential Attachment,Shortest Paths
pairs,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
"(Sampson, John, Gelman, Matt)",0,60,0.833333,4.872675,4320,2
"(Sampson, John, Moran, James)",0,0,0.0,0.0,180,3
"(Sampson, John, Humphries, Fred)",0,38,0.575758,2.758294,2640,2
"(Sampson, John, Grab, Francis)",0,1,0.016393,0.034483,120,2
"(Sampson, John, Garrett-Nelson, Labrenda)",0,3,0.046875,0.16887,420,2


In [196]:
#move index to col
model_df.reset_index(inplace=True)

In [197]:
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer

trans_cols = ['Common Neighbors', 'Jaccard Coefficient',
       'Resource Allocation Index', 'Preferential Attachment',
       'Shortest Paths']

other_cols = ['pairs','connection']

ct = ColumnTransformer([("std",StandardScaler(),trans_cols)]
                       ,remainder='passthrough')

model_std = ct.fit_transform(model_df)

In [198]:
model_df_std = pd.DataFrame(data=model_std, columns=trans_cols+other_cols)
model_df_std

Unnamed: 0,Common Neighbors,Jaccard Coefficient,Resource Allocation Index,Preferential Attachment,Shortest Paths,pairs,connection
0,10.6543,5.66387,11.3785,8.02545,-0.900799,"(Sampson, John, Gelman, Matt)",0
1,-0.268846,-0.30331,-0.21843,-0.327374,0.19202,"(Sampson, John, Moran, James)",0
2,6.64918,3.81947,6.34627,4.6359,-0.900799,"(Sampson, John, Humphries, Fred)",0
3,-0.0867928,-0.185923,-0.136362,-0.448429,-0.900799,"(Sampson, John, Grab, Francis)",0
4,0.277314,0.0323436,0.183478,0.156848,-0.900799,"(Sampson, John, Garrett-Nelson, Labrenda)",0
...,...,...,...,...,...,...,...
769,-0.268846,-0.30331,-0.21843,0.671334,0.19202,"(Reid, Harry, Giordano, Nick)",0
770,-0.268846,-0.30331,-0.21843,-0.508957,0.19202,"(Reid, Harry, Grab, Francis)",0
771,-0.268846,-0.30331,-0.21843,0.24764,-1.99362,"(Giordano, Nick, Smith, Lamar)",0
772,-0.268846,-0.30331,-0.21843,0.998183,0.19202,"(Howell, Andrew, Smith, Lamar)",0


In [199]:
#move pairs back to index
model_df_std.set_index(keys='pairs', inplace=True)

In [200]:
model_df_std['connection'] = model_df_std['connection'].astype(str)

### Train test split

In [201]:
from sklearn.model_selection import train_test_split
X=model_df_std[['Common Neighbors', 'Jaccard Coefficient', 'Resource Allocation Index',
       'Preferential Attachment', 'Shortest Paths']]
# X=model_df_std[['Jaccard Coefficient','Preferential Attachment', 'Shortest Paths']] #results are generally better with all 5 features above
y=model_df_std['connection']

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=5)

### Dummy Classifier

In [202]:
from sklearn.dummy import DummyClassifier

In [203]:
# dummy = DummyClassifier(strategy='most_frequent',random_state=5) #achieves 0.99391256, 0.0
dummy = DummyClassifier(strategy='stratified',random_state=5) #achieves 0.98619696, 0.00668896
# dummy = DummyClassifier(strategy='prior',random_state=5) # achieves 0.99391256, 0.0
# dummy = DummyClassifier(strategy='constant',random_state=5, constant='1') #achieves 0.0, 0.02391304

dummy.fit(X_train, y_train)

DummyClassifier(constant=None, random_state=5, strategy='stratified')

In [204]:
from sklearn.metrics import f1_score
from sklearn.metrics import confusion_matrix

# f1_score(y_test, dummy.predict(X_test), average='weighted') #this scoring = 0.982
# f1_score(y_test, dummy.predict(X_test), average='binary', pos_label='1') #this scoring = 0.0
f1_score(y_test, dummy.predict(X_test), average=None) #this scoring = 0.0 for 1/positive class, 0.994 for 0/negative class

array([0.99352052, 0.        ])

In [205]:
# np.unique(dummy.predict(X_test),return_counts=True)

confusion_matrix(y_test, dummy.predict(X_test))

array([[230,   1],
       [  2,   0]])

> ### A baseline Dummy Classifier model achieves an f1 score of <span style="color:magenta"> 0.01 </span> for the positive label (a connection being formed) for which it predicts <span style="color:magenta"> 140 </span> such instances in the test data and <span style="color:magenta"> 0.986 </span> for the negative label (no connection being formed) for which it predicts <span style="color:magenta"> 10,864 </span> such instances in the test data.

### Logistic Regression

In [206]:
#Logistic Regression
from sklearn.linear_model import LogisticRegression

base_lr = LogisticRegression(random_state=5)

base_lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=5, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [207]:
from sklearn.metrics import f1_score

# f1_score(y_test, base_lr.predict(X_test), average='weighted') #this scoring = 0.984
# f1_score(y_test, base_lr.predict(X_test), average='binary', pos_label='1') #this scoring = 0.191
f1_score(y_test, base_lr.predict(X_test), average=None) #this scoring = 0.191 for 1/positive class, 0.994 for 0/negative class

array([0.99568966, 0.        ])

In [208]:
# np.unique(base_lr.predict(X_test),return_counts=True)

confusion_matrix(y_test, base_lr.predict(X_test))

array([[231,   0],
       [  2,   0]])

> ### A baseline Logistic Regression model achieves an f1 score of 0.024 for the positive label (a connection being formed) for which it predicts 3 such instances in the test data and 0.992 for the negative label (no connection being formed) for which it predicts 11,001 such instances in the test data.

In [209]:
from sklearn.model_selection import GridSearchCV
lr_params = {'C':[.001,.01,1,10,100],
            'class_weight':['balanced',None],
            'penalty':['l1', 'l2']}
lr_clf = GridSearchCV(estimator=base_lr, param_grid=lr_params, cv=5, n_jobs=-1, 
                      scoring='f1_weighted') #including a scoring metric helped A LOT

lr_clf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=5, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [0.001, 0.01, 1, 10, 100],
                         'class_weight': ['balanced', None],
                         'penalty': ['l1', 'l2']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1_weighted', verbose=0)

In [210]:
lr_clf.best_estimator_

LogisticRegression(C=0.001, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=5, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [211]:
f1_score(y_test, lr_clf.predict(X_test), average=None) #this scoring = 0.556 for 1/positive class, 0.990 for 0/negative class
# f1_score(y_test, lr_clf.predict(X_test), average='weighted') #this scoring = 0.985
# f1_score(y_test, base_lr.predict(X_test), average='binary', pos_label='1') #this scoring = 0.191

array([0.99568966, 0.        ])

In [212]:
# np.unique(lr_clf.predict(X_test),return_counts=True)

confusion_matrix(y_test, lr_clf.predict(X_test))

array([[231,   0],
       [  2,   0]])

> ### A Logistic Regression model improved by Grid Search achieves an f1 score of <span style="color:magenta"> 0.605 </span> for the positive label (a connection being formed) for which it predicts <span style="color:magenta"> 383 </span> such instances in the test data and <span style="color:magenta"> 0.990 </span> for the negative label (no connection being formed) for which it predicts <span style="color:magenta"> 10,621 </span> such instances in the test data.

### Support Vector Machine

In [213]:
from sklearn.svm import SVC

In [214]:
base_svm = SVC(random_state=5)

In [215]:
base_svm.fit(X_train, y_train)

SVC(C=1.0, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=5, shrinking=True, tol=0.001,
    verbose=False)

In [216]:
# f1_score(y_test, base_svm.predict(X_test), average='weighted') #this scoring = 0.983
# f1_score(y_test, base_svm.predict(X_test), average='binary', pos_label='1') #this scoring = 0.058
f1_score(y_test, base_svm.predict(X_test), average=None) #this scoring = 0.058 for 1/positive class, 0.994 for 0/negative class

array([0.99568966, 0.        ])

In [217]:
# np.unique(base_svm.predict(X_test),return_counts=True)

confusion_matrix(y_test, base_svm.predict(X_test))

array([[231,   0],
       [  2,   0]])

> ### A baseline Support Vector Machine model achieves an f1 score of 0.0 for the positive label (a connection being formed) for which it predicts 0 such instances in the test data and 0.992 for the negative label (no connection being formed) for which it predicts 11,004 such instances in the test data.

In [218]:
# svm_params = {'C':[.001,.01,1,10,100],
#               'kernel':['rbf','linear','sigmoid','poly'],
#             'class_weight':['balanced',None],
#             'decision_function_shape':['ovo','ovr']} 
# """
# best estimator for the above is:
# best estimator here is SVC(C=10, cache_size=200, class_weight='balanced', coef0=0.0,
#     decision_function_shape='ovo', degree=3, gamma='auto_deprecated',
#     kernel='rbf', max_iter=-1, probability=False, random_state=5,
#     shrinking=True, tol=0.001, verbose=False)
# """

svm_params = {'C':[1,10,100],
              'kernel':['rbf'],
            'class_weight':['balanced',None],
            'decision_function_shape':['ovo','ovr'],
             'gamma':['scale','auto']}
svm_clf = GridSearchCV(estimator=base_svm, param_grid=svm_params, cv=5, n_jobs=-1, 
                      scoring='f1_weighted')

svm_clf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=SVC(C=1.0, break_ties=False, cache_size=200,
                           class_weight=None, coef0=0.0,
                           decision_function_shape='ovr', degree=3,
                           gamma='scale', kernel='rbf', max_iter=-1,
                           probability=False, random_state=5, shrinking=True,
                           tol=0.001, verbose=False),
             iid='deprecated', n_jobs=-1,
             param_grid={'C': [1, 10, 100], 'class_weight': ['balanced', None],
                         'decision_function_shape': ['ovo', 'ovr'],
                         'gamma': ['scale', 'auto'], 'kernel': ['rbf']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring='f1_weighted', verbose=0)

In [219]:
svm_clf.best_estimator_

SVC(C=1, break_ties=False, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovo', degree=3, gamma='scale', kernel='rbf',
    max_iter=-1, probability=False, random_state=5, shrinking=True, tol=0.001,
    verbose=False)

In [220]:
# f1_score(y_test, svm_clf.predict(X_test), average='weighted') #this scoring = 0.985
# f1_score(y_test, svm_clf.predict(X_test), average='binary', pos_label='1') #this scoring = 0.551
f1_score(y_test, svm_clf.predict(X_test), average=None) #this scoring = 0.551 for 1/positive class, 0.990 for 0/negative class

array([0.99568966, 0.        ])

In [221]:
# np.unique(svm_clf.predict(X_test),return_counts=True)

confusion_matrix(y_test, svm_clf.predict(X_test))

array([[231,   0],
       [  2,   0]])

> ### A Support Vector Machine model improved by Grid Search achieves an f1 score of <span style="color:magenta"> 0.605 </span> for the positive label (a connection being formed) for which it predicts <span style="color:magenta"> 383 </span> such instances in the test data and <span style="color:magenta"> 0.990 </span> for the negative label (no connection being formed) for which it predicts <span style="color:magenta"> 10,621 </span> such instances in the test data.

### Decision Tree

In [222]:
from sklearn.tree import DecisionTreeClassifier

In [223]:
base_dt = DecisionTreeClassifier(random_state=5)

In [224]:
base_dt.fit(X_train, y_train)

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=5, splitter='best')

In [225]:
# f1_score(y_test, base_dt.predict(X_test), average='weighted') #this scoring = 0.985
# f1_score(y_test, base_dt.predict(X_test), average='binary', pos_label='1') #this scoring = 0.367
f1_score(y_test, base_dt.predict(X_test), average=None) #this scoring = 0.367 for 1/positive class, 0.993 for 0/negative class

array([0.99568966, 0.        ])

In [226]:
# np.unique(base_dt.predict(X_test),return_counts=True)

confusion_matrix(y_test, base_dt.predict(X_test))

array([[231,   0],
       [  2,   0]])

> ### A baseline Decision Tree model achieves an f1 score of 0.508 for the positive label (a connection being formed) for which it predicts 133 such instances in the test data and 0.993 for the negative label (no connection being formed) for which it predicts 10,871 such instances in the test data.

In [227]:
dt_params = {'criterion':['gini','entropy'],
             'max_depth':[2,3,5,None],
             'splitter':['best','random'],
            'class_weight':['balanced',None],
            'max_features':['auto','sqrt','log2',None]}
dt_clf = GridSearchCV(estimator=base_dt, param_grid=dt_params, cv=5, n_jobs=-1, 
                      scoring='f1_weighted')

dt_clf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features=None,
                                              max_leaf_nodes=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              presort='deprecated',
                                              random_state=5, splitter='best'),
             iid='deprecated', n_jobs=-1,
             param_grid={'class_weight': ['balanced', None],
                         'criterion': ['gini', 'entropy'],
         

In [228]:
dt_clf.best_estimator_

DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=2, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=5, splitter='best')

In [229]:
# f1_score(y_test, dt_clf.predict(X_test), average='weighted') #this scoring = 0.985
# f1_score(y_test, dt_clf.predict(X_test), average='binary', pos_label='1') #this scoring = 0.448
f1_score(y_test, dt_clf.predict(X_test), average=None) #this scoring = 0.448 for 1/positive class, 0.992 for 0/negative class

array([0.99568966, 0.        ])

In [230]:
# np.unique(dt_clf.predict(X_test),return_counts=True)

confusion_matrix(y_test, dt_clf.predict(X_test))

array([[231,   0],
       [  2,   0]])

> ### A Decision Tree model improved by Grid Search achieves an f1 score of <span style="color:magenta"> 0.508 </span> for the positive label (a connection being formed) for which it predicts <span style="color:magenta"> 133 </span> such instances in the test data and <span style="color:magenta"> 0.993 </span> for the negative label (no connection being formed) for which it predicts <span style="color:magenta"> 10,871 </span> such instances in the test data.

## Random Forest

In [231]:
from sklearn.ensemble import RandomForestClassifier

In [232]:
base_rf = RandomForestClassifier(random_state=5)

base_rf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=100,
                       n_jobs=None, oob_score=False, random_state=5, verbose=0,
                       warm_start=False)

In [233]:
# f1_score(y_test, base_rf.predict(X_test), average='weighted')
# f1_score(y_test, base_rf.predict(X_test), average='binary', pos_label='1') 
f1_score(y_test, base_rf.predict(X_test), average=None) #this scoring = 0.495 for 1/positive class, 0.993 for 0/negative class

array([0.99568966, 0.        ])

In [234]:
confusion_matrix(y_test, base_rf.predict(X_test))

array([[231,   0],
       [  2,   0]])

In [235]:
rf_params = {'criterion':['gini','entropy'],
             'max_depth':[2,3,5,None],
            'class_weight':['balanced',None],
            'max_features':['auto','sqrt','log2',None]}
rf_clf = GridSearchCV(estimator=base_rf, param_grid=rf_params, cv=5, n_jobs=-1, 
                      scoring='f1_weighted')

rf_clf.fit(X_train, y_train)

GridSearchCV(cv=5, error_score=nan,
             estimator=RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                              class_weight=None,
                                              criterion='gini', max_depth=None,
                                              max_features='auto',
                                              max_leaf_nodes=None,
                                              max_samples=None,
                                              min_impurity_decrease=0.0,
                                              min_impurity_split=None,
                                              min_samples_leaf=1,
                                              min_samples_split=2,
                                              min_weight_fraction_leaf=0.0,
                                              n_estimators=100, n_jobs=None,
                                              oob_score=False, random_state=5,
                                   

In [236]:
# f1_score(y_test, rf_clf.predict(X_test), average='weighted')
# f1_score(y_test, rf_clf.predict(X_test), average='binary', pos_label='1')
f1_score(y_test, rf_clf.predict(X_test), average=None) #this scoring = 0.495 for 1/positive class, 0.993 for 0/negative class

array([0.99568966, 0.        ])

In [237]:
confusion_matrix(y_test, rf_clf.predict(X_test))

array([[231,   0],
       [  2,   0]])

### Best RF & get feature importances

In [238]:
best_rf = RandomForestClassifier(class_weight=None,criterion='entropy',max_depth= None,max_features='auto',random_state=5)

In [239]:
best_rf.fit(X_train, y_train)
f1_score(y_test, best_rf.predict(X_test), average=None) #this scoring = 0.495 for 1/positive class, 0.993 for 0/negative class

array([0.99568966, 0.        ])

In [240]:
confusion_matrix(y_test, best_rf.predict(X_test))

array([[231,   0],
       [  2,   0]])

In [241]:
best_rf.feature_importances_

array([0.00477019, 0.01076559, 0.00925277, 0.66936835, 0.30584311])

In [242]:
# list(zip(X_train.columns, best_rf.feature_importances_))
sorted(zip(X_train.columns, best_rf.feature_importances_),key = lambda x: x[1], reverse=True)

[('Preferential Attachment', 0.6693683480176223),
 ('Shortest Paths', 0.3058431109018061),
 ('Jaccard Coefficient', 0.010765585109202391),
 ('Resource Allocation Index', 0.009252769459628229),
 ('Common Neighbors', 0.004770186511740942)]

> ### <span style="color:magenta"> Feature importances ^ </span>

### Get model scores & counts

In [243]:
#get all model scores
models = {'Decision Tree':dt_clf,
          'SVM':svm_clf,
         'Dummy':dummy,
         'Logistic Regression':lr_clf,
         'Random Forest':rf_clf}

for k,v in models.items():
    score = f1_score(y_test, v.predict(X_test), average=None)
    matrix = confusion_matrix(y_test, v.predict(X_test))
    print(f"{k}\n{score}\n{matrix}\n")

Decision Tree
[0.99568966 0.        ]
[[231   0]
 [  2   0]]

SVM
[0.99568966 0.        ]
[[231   0]
 [  2   0]]

Dummy
[0.99352052 0.        ]
[[230   1]
 [  2   0]]

Logistic Regression
[0.99568966 0.        ]
[[231   0]
 [  2   0]]

Random Forest
[0.99568966 0.        ]
[[231   0]
 [  2   0]]

