In [None]:
import pandas as pd
import numpy as np
from geopy.geocoders import Nominatim 
from geopy.distance import geodesic
import re


# Notes on Pre-Cleaning
Before this data was pulled into Python, column labels were changed from the original Qualtrics export. Column names were standardized to camel case and shortened to useable lengths.

Additionally, the data was run through the TAMU geocoding service (http://geoservices.tamu.edu/Services/Geocode/) to get the latitude and longitude for each address.

In [None]:
#load the simplified final stakeholders file - each row is one of our nodes
stakeholdersCsv = 'FinalStakeholders-simplified.csv'
df = pd.read_csv(stakeholdersCsv)

#print the column names, dtypes and non-null counts
df.info()

## One Hot Encoding
For SNA/ERGM, we need to one-hot encode all the variables that are multi-valued categories. 

In [None]:
#mapping the collectivism/individualism answers to numbers
#one shared mapping for all four questions. The previous per-column dict literals
#listed 'Strongly Disagree' twice (-2 and then 2); Python keeps the last value for a
#repeated key, so 'Strongly Disagree' scored +2 and 'Strongly Agree' was left
#unmapped (NaN).
likertScores = {'Strongly Disagree': -2,
                'Disagree': -1,
                'Neutral': 0,
                'Agree': 1,
                'Strongly Agree': 2}

#each question gets a numeric "<question>Score" column next to the original answer;
#answers that are not in likertScores (including missing ones) become NaN
for likertCol in ['PersonalSelfDependence',
                  'WorkSelfDependence',
                  'CollaborativeEconomicAdvantage',
                  'CollaborationsNotWorthIt']:
    df[likertCol + 'Score'] = df[likertCol].map(likertScores)

In [None]:
# #mapping the collectivism/individualism to smaller categories - this didn't help
# df['PersonalSelfDependence'] = df['PersonalSelfDependence'].map({'Strongly Disagree': 'Negative', 
#                                                                       'Disagree': 'Negative',
#                                                                      'Neutral': 'Neutral',
#                                                                      'Agree': 'Positive',
#                                                                      'Strongly Disagree': 'Positive'})

# df['WorkSelfDependence'] = df['WorkSelfDependence'].map({'Strongly Disagree': 'Negative', 
#                                                                       'Disagree': 'Negative',
#                                                                      'Neutral': 'Neutral',
#                                                                      'Agree': 'Positive',
#                                                                      'Strongly Disagree': 'Positive'})

# df['CollaborativeEconomicAdvantage'] = df['CollaborativeEconomicAdvantage'].map({'Strongly Disagree': 'Negative', 
#                                                                       'Disagree': 'Negative',
#                                                                      'Neutral': 'Neutral',
#                                                                      'Agree': 'Positive',
#                                                                      'Strongly Disagree': 'Positive'})

# df['CollaborationsNotWorthIt'] = df['CollaborationsNotWorthIt'].map({'Strongly Disagree': 'Negative', 
#                                                                       'Disagree': 'Negative',
#                                                                      'Neutral': 'Neutral',
#                                                                      'Agree': 'Positive',
#                                                                      'Strongly Disagree': 'Positive'})

In [None]:
#some categories we can just use get_dummies

#(source column, prefix for the generated dummy columns), encoded in this order
simpleDummies = [('Location', 'Loc'),
                 ('PersonalSelfDependence', 'PSD'),
                 ('WorkSelfDependence', 'WSD'),
                 ('CollaborativeEconomicAdvantage', 'CEA'),
                 ('CollaborationsNotWorthIt', 'CNWI')]

for sourceCol, prefix in simpleDummies:
    out = pd.get_dummies(df[sourceCol], prefix)
    df = df.join(out)


In [None]:
#one-hot encode utility
def oneHot(df, col, indexLabel):
    """One-hot encode a column whose cells hold several choices.

    df: frame containing both `col` and `indexLabel` as columns.
    col: name of the column to encode; each cell is passed to pd.Series, so a
        list cell contributes one entry per element.
    indexLabel: name of the column identifying each row; the encoded columns are
        joined back onto df through it.

    Prints `col`, then returns df joined with one "<col>_<choice>" count column
    per distinct choice plus "<col>_Combined", the row's counts concatenated
    into one string in column order.
    """
    print(col)
    #spread every row's choices over numbered columns, then stack them into one
    #long column keyed by (indexLabel value, position)
    perRow = df.set_index(indexLabel).apply(lambda row: pd.Series(row[col]), axis=1)
    longForm = perRow.stack().to_frame()
    #one indicator column per distinct choice, summed back to one row per index value
    indicators = pd.get_dummies(longForm, prefix=col, columns=[0])
    colDummies = indicators.groupby(level=0).sum()
    #capture the indicator names before the combined column is added
    dummyNames = list(colDummies.columns)
    colDummies[col + '_Combined'] = colDummies.apply(
        lambda row: ''.join(str(row[name]) for name in dummyNames), axis=1)
    return df.join(colDummies, on=indexLabel)


In [None]:
#deal with the na's - for each of the categorical questions, add a category of unknown for those that didn't answer
#the fill values are grouped by what gets filled in rather than listed column by column
blankCols = ['Address', 'City', 'State', 'Zip']
unknownCols = ['Location', 'Roles', 'SelfEmployedFlag', 'BusinessName', 'HasSecondJob', 'Employer',
               'VendorMarkets', 'ConsumerMarkets', 'GroupAffiliations', 'GroupAffiliationsOther',
               'SocialMediaFlag', 'SocialMediaChoices', 'SocialMediaOther', 'HoursOnSocialMedia',
               'WebsiteFlag', 'Website', 'OnlineMarketingFlag', 'AnswerSources', 'AnswerSourcesOther',
               'NetworkOverlap', 'PersonalSelfDependence', 'WorkSelfDependence',
               'CollaborativeEconomicAdvantage', 'CollaborationsNotWorthIt']
#put nonresponders in neutral
neutralScoreCols = ['PersonalSelfDependenceScore', 'WorkSelfDependenceScore',
                    'CollaborativeEconomicAdvantageScore', 'CollaborationsNotWorthItScore']

fillValues = dict.fromkeys(blankCols, '')
fillValues.update(dict.fromkeys(unknownCols, 'Unknown'))
fillValues.update(dict.fromkeys(neutralScoreCols, 0))
fillValues['RolesOther'] = 'None'
#people that didn't answer this are for the most part not getting any income
fillValues['IncomePercent'] = 0
#put people that didn't answer in the middle of the scale
fillValues['IntrovertExtrovertScale'] = 5

sna = df.copy().fillna(fillValues)

#these columns hold comma-separated choices: split each into a list, then one-hot encode it
listCols = ['Roles', 'VendorMarkets', 'ConsumerMarkets', 'GroupAffiliations','SocialMediaChoices', 'AnswerSources']

for col in listCols:
    sna[col] = sna[col].map(lambda answers: answers.split(','))
    sna = oneHot(sna, col, 'Label')

#the onehotencoding results in some terrible column names with spaces and punctuation. Let's clean that up.
colList = list(sna.columns)
newColNames = {name: re.sub("[- .()/']", '', name) for name in colList}

sna = sna.rename(columns=newColNames)
sna

In [None]:
#show the cleaned-up node column names
colList = sna.columns.tolist()
colList

In [None]:
#in r every node variable has to be set on the graph one at a time,
#so print one assignment line per column to paste into the R script
colList = sna.columns.tolist()
for col in colList:
    print(f"V(allPossibleEdges)${col} <- as.character(nodeData${col}[ix])")

## Geodistancing
We need to create a dataframe of all nodes and determine the distance between each node.

In [None]:
display(sna[['Label', 'Latitude', 'Longitude']])

cols = sna['Label']#[sna['ERGMFlag'] == True]

#not used below (latitude/longitude are already columns of sna); kept so the name stays defined
geolocator = Nominatim(user_agent="CheqBaySNA")

#look each node's coordinates up once, instead of re-filtering sna with a boolean
#mask (and calling float() on a one-element Series) for every pair
coords = {label: (float(lat), float(lon))
          for label, lat, lon in zip(sna['Label'], sna['Latitude'], sna['Longitude'])}

#geodesic distance in miles between every pair of nodes. The frame is built in one
#step rather than filled through chained indexing (geoMatrix[i][c] = ...), which
#does not write through to the frame under pandas copy-on-write.
colnames = list(cols)
distances = [[geodesic(coords[rowLabel], coords[colLabel]).miles for colLabel in colnames]
             for rowLabel in colnames]
geoMatrix = pd.DataFrame(distances, index=cols, columns=cols, dtype=float)

#axis has to be passed by keyword: sort_index(1) is a TypeError on pandas 2.x
geoMatrix = geoMatrix.sort_index(axis=1).sort_index(axis=0)
geoMatrix.to_csv('R_distance_matrix.csv')

geoMatrix


## SNA Edges
For SNA, we need a complete matrix.

In [None]:
#read the Qualtrics export using its second line as the header, then drop the row
#labelled 0 (presumably another Qualtrics header line - verify against the export)
#and the survey metadata columns
snaEdges = pd.read_csv('FromQualtricsNumericEdges.csv', header=1)
snaEdges = (snaEdges.drop([0], axis=0)
         .drop(['Start Date', 'End Date', 'Response Type', 'IP Address', 'Progress', 
                'Duration (in seconds)', 'Finished', 'Recorded Date', 'Response ID', 'Recipient Last Name',
               'Recipient First Name', 'Recipient Email', 'Location Latitude', 'Location Longitude',
               'Distribution Channel', 'User Language'], axis=1))

#the respondent id column becomes 'Label'; every "Choose ..." question column is
#renamed to the second ' - '-separated piece of its header
colNames = list(snaEdges.columns)
newColNames = {'External Data Reference': 'Label'}
for c in colNames:
    if "Choose" in c:
        newColNames[c] = c.split(' - ')[1]

#rename the columns
snaEdges = snaEdges.rename(columns=newColNames)

#fetch the labels of the people who responded
labels = snaEdges['Label'].copy().tolist()

#respondents' unanswered cells are filled with 1 (used here as the "unknown" value)
snaEdges = snaEdges.fillna(1)

#add the extra column for Judie Moyer and fill it with 1 (unknown) for every respondent.
#A scalar is broadcast to all rows, so this no longer depends on there being exactly 53.
snaEdges["Judie Moyer (Port Wing Market)"] = 1

#figure out the missing (non-respondent) rows: columns with no matching respondent label
missing = [c for c in snaEdges.columns if c not in labels and c != 'Label']

#one all-NaN row per non-respondent, with the same columns as snaEdges
missingDF = pd.DataFrame({'Label': missing}).reindex(columns=snaEdges.columns)

#stack the non-respondent rows under the respondents (DataFrame.append was removed
#in pandas 2.0), index by label and fill the non-respondent cells with zeros (for missing)
snaEdges = pd.concat([snaEdges, missingDF]).set_index('Label').fillna(0)

snaEdges.to_csv('R_complete_matrix.csv')
snaEdges



## Review the complete ties

Complete ties are those for which we have both incoming and outgoing tie information. We want to review these to see what our inter-rater reliability is. In other words, how often do the 2 halves of the dyad disagree with each other about the type of relationship that they have?

In [None]:
#convert the zeroes to NaN and drop all incomplete cases
snastripped = snaEdges.copy().replace(['0', 0], np.nan).dropna()
#drop the columns for nonrespondents, giving us a square matrix
colsToDrop = [c for c in snastripped.columns if c not in snastripped.index]

#sort the rows and columns so that both are in the same order
#(axis has to be passed by keyword: sort_index(1) is a TypeError on pandas 2.x)
snastripped = snastripped.drop(columns=colsToDrop).sort_index(axis=1).sort_index(axis=0)

snastripped

In [None]:
#cast to numpy matrix with integer types
npsna = snastripped.to_numpy().astype(int)
#fill the diagonal with zero
np.fill_diagonal(npsna, 0)
#transpose the matrix and see what doesn't match. The difference matrix is
#antisymmetric (a mismatch at [i, j] also appears at [j, i]), so only the strict
#upper triangle is counted - otherwise every disagreeing dyad is counted twice.
dyadDifferences = np.triu(npsna.transpose() - npsna, k=1)
print(f'The total number of relationships in which the 2 halves of the dyad disagree {np.count_nonzero(dyadDifferences)}.')

In [None]:
#get the cleaned up column names
colNames = list(snaEdges.columns)

#tie label -> numeric weight; not applied anywhere in this cell
weights = {
    'Unknown': 0,
    'Co-exist': 1,
    'Communicate': 2,
    'Coordinate': 3,
    'Collaborate': 4
}

#every (row label, column label) combination in row-major order,
#leaving out self-loops, if anyone set one
pairs = [(src, dst) for src in snaEdges.index for dst in colNames if src != dst]

#parallel lists holding the edge list columns
kFrom = [src for src, _ in pairs]
kTo = [dst for _, dst in pairs]
#the cell value at (row, column) is the tie reported for that pair
kType = [snaEdges.loc[[src], [dst]].values[0][0] for src, dst in pairs]
#never populated - the weights lookup above is not applied
kWeights = []

dfKEdgeList = pd.DataFrame({'From': kFrom, 'To': kTo, 'Weight': kType})

#make sure we didn't generate duplicates along the way
dfKEdgeList = dfKEdgeList.groupby(['From', 'To', 'Weight']).count().reset_index()
dfKEdgeList

### Add the Distance
Now that we have an edge list with all possible relationships, we need to add the distance between each pair.

In [None]:
# fetch the distance for each pair
def fetchDistanceIfAvailable(x):
    """Return the geoMatrix entry for one edge row, or NaN when it can't be read.

    x: a row with 'From' and 'To' entries. The value is looked up as
    geoMatrix[From][To] (column From, row To); a ValueError, TypeError or
    KeyError during the lookup yields np.nan instead of propagating.
    """
    try:
        distance = geoMatrix[x['From']][x['To']]
    except (ValueError, TypeError, KeyError):
        distance = np.nan
    return distance

#look up the distance for every From/To pair, row by row
#spot check: geoMatrix['Blake Gross (Law Office of Blake Gross, Ltd.)']['Brenda Halter (Shaggy Dog Farm )']
dfKEdgeList['Distance'] = dfKEdgeList.apply(fetchDistanceIfAvailable, axis='columns')



In [None]:
#sanity checking that this worked: compare one node's geoMatrix column
#against the edge-list rows that start from the same node
sanityLabel = 'Blake Gross (Law Office of Blake Gross, Ltd.)'
display(geoMatrix[sanityLabel])
dfKEdgeList.loc[dfKEdgeList['From'] == sanityLabel]

## Write to CSV
Write these out for use in R.

In [None]:
#edge list and node table for R, both written without the pandas index column
for frame, csvName in ((dfKEdgeList, 'R_edges.csv'), (sna, 'R_nodes.csv')):
    frame.to_csv(csvName, index=False)