/
ossPyFuncs.py
382 lines (297 loc) · 17 KB
/
ossPyFuncs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
Created on Mon Jun 15 12:57:42 2020
@author: dnb3k
"""
def queryToPDTable(postgreSql_selectQuery):
    """Return the output of a SDAD postgreSQL query as a pandas table.

    Keyword arguments:
    postgreSql_selectQuery -- a properly formatted postgreSQL SELECT query

    Returns a pandas DataFrame holding the query result.
    """
    import os
    import psycopg2
    import pandas as pd
    # basic query function to the database using environment variables for
    # the user name and password (never hard-coded credentials)
    conn = psycopg2.connect(host="postgis1",
                            dbname="sdad",
                            user=os.environ.get('UVA_uname'),
                            password=os.environ.get('UVA_pass'))
    try:
        # run the query and convert the result directly to a pandas dataframe
        dataOut = pd.read_sql_query(postgreSql_selectQuery, conn)
    finally:
        # BUGFIX: the connection was previously never closed, leaking one
        # server connection per call
        conn.close()
    return dataOut
def composeWorkplaceOntology():
    """Create a table featuring valid workplace institutions.

    Combines government agencies, academic institutions, and Forbes/Fortune
    company lists from the SDAD database into one deduplicated wordbank.
    """
    import ossPyFuncs
    import pandas as pd

    # (query, column-to-keep) pairs for every institution source.  Certain
    # table columns feature capital letters, which causes problems, hence
    # SELECT * for the government table rather than naming the column.
    institutionSources = [
        ("SELECT * FROM us_gov_manual.us_govman_2019 ;", 'AgencyName'),
        ("SELECT institution FROM hipolabs.universities ;", 'institution'),
        ("SELECT company FROM forbes.fortune2018_us1000;", 'company'),
        ("SELECT company FROM forbes.fortune2019_us1000;", 'company'),
        ("SELECT company FROM forbes.fortune2020_global2000;", 'company'),
    ]
    # pass each query, obtain the table, and keep only the relevant column
    combinedSeries = [ossPyFuncs.queryToPDTable(query)[column]
                      for query, column in institutionSources]
    # concatenate the institutions into a single series, then deduplicate
    # into a one-column dataframe
    fullWordbank = pd.concat(combinedSeries)
    wordbankTable = pd.DataFrame(fullWordbank.unique())
    return wordbankTable
def checkColumnMapping(inputRawColumn, targetOntology):
    """Assess which unique entries in an input column are in target ontology.

    Keyword arguments:
    inputRawColumn -- a single column from a pandas table featuring potentially
    non-unique entries
    targetOntology -- a single column from a pandas table featuring only unique
    entries. Values from [inputRawColumn] will be assessed against this column

    Returns a dataframe with columns [<input column name>, 'count', 'present'].
    """
    import pandas as pd
    import numpy as np
    # name of the (single) input column; used to label the output table
    columnName = inputRawColumn.columns[0]
    # extract counts of unique values from the input column
    columnUniqueCounts = inputRawColumn.iloc[:, 0].value_counts()
    # build the output table explicitly rather than via reset_index/rename,
    # which is fragile across pandas versions (value_counts().reset_index()
    # changed its column naming in pandas 2.0)
    tableUniqueCounts = pd.DataFrame({columnName: columnUniqueCounts.index.to_numpy(),
                                      "count": columnUniqueCounts.to_numpy()})
    # BUGFIX: previously np.in1d was applied to the table's integer RangeIndex
    # (left over from reset_index), so 'present' was always False; compare the
    # actual unique values against the (flattened) ontology instead
    presentBoolVec = np.isin(tableUniqueCounts[columnName].to_numpy(),
                             np.asarray(targetOntology).ravel())
    # set the boolean vector as a column of the output
    tableUniqueCounts['present'] = presentBoolVec
    return tableUniqueCounts
def eraseFromColumn(inputColumn, eraseList):
    """Iteratively delete regex query matches from the input column.

    Keyword arguments:
    inputColumn -- a column (Series) from a pandas dataframe; this is the set
    of target words/entries that deletions will be made from
    eraseList -- a dataframe whose first column contains strings (regex
    expressions) which will be deleted from inputColumn, iteratively

    Returns (inputColumn, eraseList) where eraseList gains 'changeNum' (match
    count per pattern) and 'changeIndexes' (positions matched per pattern).
    """
    import pandas as pd
    import re
    eraseList['changeNum'] = 0
    eraseList['changeIndexes'] = ''
    # necessary, due to escape nonsense: stray backslashes in the data would
    # break the regex handling below, so map them to '/'
    inputColumn = inputColumn.replace(regex=True, to_replace='\\\\', value='/')
    for index, row in eraseList.iterrows():
        # compile each erase term once; terms may contain regex features
        currentRegexExpression = re.compile(row.iloc[0])
        currentBoolVec = inputColumn.str.contains(currentRegexExpression, na=False)
        matchedPositions = [i for i, hit in enumerate(currentBoolVec) if hit]
        # BUGFIX: use .at instead of chained ['col'].iloc[index]= assignment,
        # which raises SettingWithCopyWarning and is a silent no-op under
        # pandas copy-on-write
        eraseList.at[index, 'changeIndexes'] = matchedPositions
        eraseList.at[index, 'changeNum'] = len(matchedPositions)
        # delete the matched text from every entry
        inputColumn.replace(regex=True, to_replace=currentRegexExpression, value='', inplace=True)
    return inputColumn, eraseList
def expandFromColumn(inputColumn, replaceList):
    """Iteratively replace regex query matches in the input column.

    Keyword arguments:
    inputColumn -- a column (Series) from a pandas dataframe; this is the set
    of target words/entries that replacements will be made to
    replaceList -- a two column (pandas) object. The first column contains
    strings (regex expressions) which will be replaced by the entry (from the
    same row) of the second column, in an iterative fashion

    Returns (inputColumn, replaceList) where replaceList gains 'changeNum'
    (match count per pattern) and 'changeIndexes' (positions matched).
    """
    import pandas as pd
    import re
    # necessary, due to escape nonsense: stray backslashes in the data would
    # break the regex handling below, so map them to '/'
    inputColumn = inputColumn.replace(regex=True, to_replace='\\\\', value='/')
    replaceList['changeNum'] = 0
    replaceList['changeIndexes'] = ''
    for index, row in replaceList.iterrows():
        # compile each search term once; terms may contain regex features
        currentRegexExpression = re.compile(row.iloc[0])
        currentBoolVec = inputColumn.str.contains(currentRegexExpression, na=False)
        matchedPositions = [i for i, hit in enumerate(currentBoolVec) if hit]
        # BUGFIX: use .at instead of chained ['col'].iloc[index]= assignment,
        # which raises SettingWithCopyWarning and is a silent no-op under
        # pandas copy-on-write
        replaceList.at[index, 'changeIndexes'] = matchedPositions
        replaceList.at[index, 'changeNum'] = len(matchedPositions)
        # substitute the replacement value (second column of the same row)
        inputColumn = inputColumn.replace(regex=True, to_replace=currentRegexExpression, value=row.iloc[1])
    return inputColumn, replaceList
def uniquePandasIndexMapping(inputColumn):
    """Quickly map the unique name entries back to the input entries.

    Keyword arguments:
    inputColumn -- a SINGLE column dataframe, presumably with duplications.
    Creates a frequency table and a mapping back to the source entries.

    Returns (sortedInputColumn, sortedTableUniqueFullNameCounts): the input
    sorted by value with its original row labels in 'userIndex', and a
    per-unique-value count table whose 'inputIndexMapping' column holds the
    original row indexes of each value's occurrences.
    """
    import numpy as np
    import pandas as pd
    # GENERALIZED: the column name 'company' was previously hard-coded even
    # though the docstring promises any single column; derive it instead
    columnName = inputColumn.columns[0]
    # sort the input so occurrences of each unique value are contiguous,
    # keeping the original row labels in a 'userIndex' column
    inputColumn.sort_values(by=[columnName], inplace=True)
    sortedInputColumn = inputColumn.reset_index()
    sortedInputColumn.rename(columns={"index": "userIndex"}, inplace=True)
    # frequency table, built explicitly: value_counts().reset_index() changed
    # its column naming in pandas 2.0, so avoid the fragile rename dance
    uniqueCounts = inputColumn.iloc[:, 0].value_counts()
    tableUniqueFullNameCounts = pd.DataFrame({columnName: uniqueCounts.index.to_numpy(),
                                              "count": uniqueCounts.to_numpy()})
    tableUniqueFullNameCounts.sort_values(by=[columnName], inplace=True)
    sortedTableUniqueFullNameCounts = tableUniqueFullNameCounts.reset_index()
    sortedTableUniqueFullNameCounts['inputIndexMapping'] = ''
    # walk the sorted input in contiguous count-sized chunks; each chunk's
    # original row labels belong to the current unique entry
    currentSum = 0
    for index in sortedTableUniqueFullNameCounts.index:
        currentCount = sortedTableUniqueFullNameCounts['count'].iloc[index]
        currentRange = np.arange(currentSum, currentSum + currentCount)
        # .at avoids the chained-assignment pitfall when storing an array cell
        sortedTableUniqueFullNameCounts.at[index, 'inputIndexMapping'] = sortedInputColumn['userIndex'].iloc[currentRange].array
        currentSum = currentSum + currentCount
    return sortedInputColumn, sortedTableUniqueFullNameCounts
def addBooleanColumnFromCriteria(inputDataToAssess, assessItems, newColumnName):
    """Iteratively determine if input column contains a member of another column.

    Keyword arguments:
    inputDataToAssess -- a dataframe whose FIRST column holds the target
    words/entries that will be searched
    assessItems -- a series or dataframe containing strings (regex
    expressions) which will be searched for (as substrings) in the first
    column of inputDataToAssess, in an iterative fashion
    newColumnName -- name of the new column (i.e. 'government', 'academic',
    etc.); it will be True for rows matched by any assessItems pattern

    Returns inputDataToAssess with the new boolean column appended.
    """
    import pandas as pd
    import re
    inputDataToAssess[newColumnName] = False
    # necessary, due to escape nonsense: stray backslashes in the data would
    # break the regex handling below, so map them to '/'
    inputDataToAssess = inputDataToAssess.replace(regex=True, to_replace='\\\\', value='/')
    for index, row in assessItems.iterrows():
        # compile each criterion once; criteria may contain regex features
        currentRegexExpression = re.compile(row.iloc[0])
        currentBoolVec = inputDataToAssess[inputDataToAssess.columns[0]].str.contains(currentRegexExpression, na=False)
        # BUGFIX: assign via .loc[rows, col] rather than the chained
        # ['col'].loc[rows]= form, which warns and is a silent no-op under
        # pandas copy-on-write
        inputDataToAssess.loc[currentBoolVec, newColumnName] = True
    return inputDataToAssess
def iterativeFullFuzzyMatch(inputColumn):
    """Iteratively perform a fuzzy match on the entire input column.

    Keyword arguments:
    inputColumn -- a SINGLE column dataframe; each unique entry is matched
    against all OTHER unique entries (self excluded) seeking close matches

    Returns the unique-value frequency table with a 'guesses' column listing
    close matches (difflib.get_close_matches, cutoff 0.8) per entry.
    """
    import difflib
    import numpy as np
    import pandas as pd
    # get the input column name
    inputColumnName = inputColumn.columns
    # unique values and their counts, built explicitly: the previous
    # reset_index/rename dance produced duplicate 'count' columns (and then
    # crashed) on pandas >= 2.0, where reset_index naming changed
    uniqueCounts = inputColumn[inputColumnName[0]].value_counts()
    tableUniqueFullNameCounts = pd.DataFrame({inputColumnName[0]: uniqueCounts.index.to_numpy(),
                                              "count": uniqueCounts.to_numpy()})
    # create blank column to hold the fuzzy-match guesses
    tableUniqueFullNameCounts['guesses'] = ''
    for index in tableUniqueFullNameCounts.index:
        # boolean mask selecting every entry except the current one
        curBool = np.full((len(tableUniqueFullNameCounts.index)), True)
        curBool[index] = False
        currentChecklist = tableUniqueFullNameCounts[inputColumnName[0]].loc[curBool]
        # BUGFIX: .at instead of chained ['guesses'].loc[index]= assignment
        # (SettingWithCopyWarning / no-op under pandas copy-on-write)
        tableUniqueFullNameCounts.at[index, 'guesses'] = difflib.get_close_matches(tableUniqueFullNameCounts[inputColumnName[0]].loc[index], currentChecklist, cutoff=0.8)
    return tableUniqueFullNameCounts
def createSubstringMatrix(inputColumn):
    """Iteratively perform substring assessment across all entries and map
    these relations into a directed matrix.

    Keyword arguments:
    inputColumn -- a SINGLE column dataframe; the target words/entries will be
    iterated over when performing the substring matching

    NOTE: the returned matrix is in a sparse (lil) format and may require
    conversion to interact with.

    Returns (tableUniqueFullNameCounts, accessibleMatrix) where matrix entry
    [i, j] is True when unique entry i occurs as a whole-word, case-insensitive
    substring of unique entry j.
    """
    import re
    import pandas as pd
    import scipy.sparse as sparse
    # get the input column name
    inputColumnName = inputColumn.columns
    # unique values and counts, built explicitly: the previous
    # reset_index/rename dance produced duplicate 'count' columns (and then
    # crashed) on pandas >= 2.0, where reset_index naming changed
    uniqueCounts = inputColumn[inputColumnName[0]].value_counts()
    tableUniqueFullNameCounts = pd.DataFrame({inputColumnName[0]: uniqueCounts.index.to_numpy(),
                                              "count": uniqueCounts.to_numpy()})
    # order by descending count, then descending name, for a stable layout
    tableUniqueFullNameCounts = tableUniqueFullNameCounts.sort_values(by=['count', inputColumnName[0]], ascending=[False, False])
    tableUniqueFullNameCounts = tableUniqueFullNameCounts.reset_index(drop=True)
    # sparse boolean matrix of appropriate dimensions, converted to a format
    # that permits row-wise modification
    entryCount = len(tableUniqueFullNameCounts.index)
    accessibleMatrix = sparse.coo_matrix((entryCount, entryCount), dtype=bool).tolil()
    # iterate across unique listings
    for index in tableUniqueFullNameCounts.index:
        # whole-word, case-insensitive regex for the current entry; escaped
        # because company names can contain regex metacharacters
        currentRegex = re.compile('(?i)\\b' + re.escape(tableUniqueFullNameCounts[inputColumnName[0]].loc[index]) + '\\b')
        # get all listings that feature the current string
        currentBool = tableUniqueFullNameCounts[inputColumnName[0]].str.contains(currentRegex)
        # fill in the boolean values for this row
        accessibleMatrix[index, :] = currentBool
    return tableUniqueFullNameCounts, accessibleMatrix
def spaceSymbolRemap(inputColumn):
    """Remap entries with the same space- and symbol-free string to the most
    common element.

    Keyword arguments:
    inputColumn -- a column from a pandas dataframe, presumably with duplicate
    entries, as frequency will guide this process.
    Space/symbol/case variants of the same string will be remapped to the most
    common element.

    NOTE(review): this routine mutates tables while iterating over them and
    reuses loop variable names in the nested loop; behavior is preserved
    as-is and only documented here.
    """
    import pandas as pd
    import re
    import numpy as np
    import ossPyFuncs
    #get the input column names
    inputColumnName=inputColumn.columns
    #get the unique values (and counts)
    tableUniqueFullNameCounts=inputColumn[inputColumnName[0]].value_counts()
    #convert that output to a proper table
    tableUniqueFullNameCounts=tableUniqueFullNameCounts.reset_index()
    #rename the columns
    #NOTE(review): this rename assumes pre-2.0 pandas reset_index naming
    #('index' + value column); on pandas >= 2.0 it produces duplicate
    #'count' columns — confirm the pinned pandas version
    tableUniqueFullNameCounts.rename(columns={inputColumnName[0]:"count","index":inputColumnName[0]},inplace=True)
    #sort by descending frequency (most common variants come first)
    tableUniqueFullNameCounts=tableUniqueFullNameCounts.sort_values(by=['count',inputColumnName[0]],ascending=[False,False])
    tableUniqueFullNameCounts=tableUniqueFullNameCounts.reset_index(drop=True)
    #space/symbol-free versions of the unique entries and of every raw entry
    uniqueNoSpaceSymbol=pd.DataFrame(tableUniqueFullNameCounts[inputColumnName[0]].str.replace('[^a-zA-Z0-9]',''))
    allNoSpaceSymbol=pd.DataFrame(inputColumn[inputColumnName[0]].str.replace('[^a-zA-Z0-9]',''))
    tableUniqueFullNameCounts['remapping']=''
    #get the remapping to users
    sortedInputColumn, sortedTableUniqueFullNameCounts=ossPyFuncs.uniquePandasIndexMapping(inputColumn)
    #iterate across entries with guesses
    for index, row in tableUniqueFullNameCounts.iterrows():
        #because our entries are sorted by frequency we can safely assume
        #that the first time we encounter a nospacesymbol variant of a substring
        #we are also encountering its most frequent form. As such, it is safe
        #for us to use this most frequent version to replace less frequent versions.
        #Furthermore, we can remove less frequent substring variants from consideration.
        #This ideally speeds up our computations as there are fewer entries to check against
        #set current entry number
        currentEntry=tableUniqueFullNameCounts[inputColumnName[0]].loc[index]
        #get the lowercase form of it
        #currentLower=currentEntry.lower()
        #extract current string from company vector
        #DID YOU KNOW that underscore counts as a letter character with \W ?
        currentNoSpaceOrSymbol=re.sub('[^a-zA-Z0-9]','',currentEntry)
        #if that's not empty
        if len(currentNoSpaceOrSymbol)>0:
            #remove leading white space, just in case this is an entry that has been deleted from
            currentEntry=re.sub('^[ \\t]+','',currentEntry)
            #make replacement to the noSpaceSymbol vector
            #NOTE(review): the pattern ends in '(?i)\\b' — the second inline
            #flag looks like a typo for a plain '\\b'; confirm intent
            allNoSpaceSymbol=pd.DataFrame(allNoSpaceSymbol[inputColumnName[0]].str.replace('(?i)\\b'+currentNoSpaceOrSymbol+'(?i)\\b',currentEntry))
            #find the unique name indexes these correspond to
            noSpaceSymbolMatches=uniqueNoSpaceSymbol[inputColumnName[0]].str.contains('(?i)\\b'+currentNoSpaceOrSymbol+'\\b')
            #get the names this corresponds to
            toRemoveEntries=pd.DataFrame(tableUniqueFullNameCounts[inputColumnName[0]].loc[noSpaceSymbolMatches])
            #iteratively remove it from the unique list
            #NOTE(review): this inner loop shadows the outer 'index'/'row'
            #variables and rebuilds tableUniqueFullNameCounts (dropping the
            #'count' and 'remapping' columns) while the outer loop iterates
            #over it — confirm this is the intended behavior
            for index, row in toRemoveEntries.iterrows():
                #some of the to-remove terms have regex features. We need to compile them (escaped) to prevent this from causing problems
                currentToRemove=re.compile('(?i)\\b'+re.escape(toRemoveEntries[inputColumnName[0]].loc[index])+'\\b')
                #perform the deletion
                tableUniqueFullNameCounts=pd.DataFrame(tableUniqueFullNameCounts[inputColumnName[0]].str.replace(currentToRemove,''))
    #create an unchanged noSpaceSymbol frame
    #I guess we dont need this?
    #noSpaceSymbolCheckFrame=pd.DataFrame(tableUniqueFullNameCounts[inputColumnName[0]].str.replace('[^a-zA-Z0-9]',''))
    #check and see if the entry has been changed from this form. In such cases
    #we can assume that its original form was not equivalent to its No Space Form
    #I guess we dont need this?
    #NoSpaceUnchanged=allNoSpaceSymbol.eq(noSpaceSymbolCheckFrame)
    #check and see if the entry has been changed from its original input
    #In such cases that it has changed, we can assume the new form is the correct one,
    #or that the original form was the correct one.
    inputUnchanged=allNoSpaceSymbol.eq(inputColumn)
    #also identify empty listings
    emptyListings=allNoSpaceSymbol[inputColumnName[0]].str.len().fillna(value=0)==0
    #now we alter the input column to reflect these new values
    #NOTE(review): indexing .loc[] with a whole DataFrame looks suspect —
    #presumably a boolean mask (e.g. ~inputUnchanged) was intended; confirm
    inputColumn[inputColumnName[0]].loc[inputColumn]=allNoSpaceSymbol.loc[inputColumn]
    #we have to use sum, because empty is not zero, and is thus non zero, and so np.count_nonzero won't work
    print(str(np.sum(np.logical_and(~inputUnchanged[inputColumnName[0]], ~emptyListings)))+' entries changed')
    return inputColumn