In [9]:
import pandas as pd
import pyodbc
from fuzzywuzzy import fuzz
import datetime as dt
from nameparser import HumanName

In [11]:
#PPD Directory
PPD = pd.read_csv('../../Data/PPD/ppd_data_20200404.csv')

#Out Directory
medscape = pd.read_csv('../../Data/Medscape/2020-07-01/Memorium_USA_2020-07-01.csv')
twitter = pd.read_csv('../../Data/Medscape/2020-07-01/Twitter_Doctors_2020-07-01.csv')
heroes = pd.read_excel('../../Data/Medscape/2020-07-01/Heroes_Physicians_2020-07-01.xlsx')

#Specialty Table
SPEC_FILE = pd.read_csv('../../Data/PPD/speciality_id.csv')

In [13]:
def get_processed_list():
    '''Get latest processed ME list from EDW'''
    username = 'vigrose'
    password = 'Hufflepuff10946'
    sql_query = \
        """
        SELECT DISTINCT
        M.MED_EDU_NBR
        FROM
        AMAEDW.PERSON M
        WHERE
        M.MORTALITY_STS_CD ='C'
        OR
        M.MORTALITY_STS_CD ='P';
        """
    s = "DSN=PRDDW; UID={}; PWD={}".format(username, password)
    AMAEDW = pyodbc.connect(s)
    known_mortality_status = pd.read_sql(con=AMAEDW, sql=sql_query)

    processed_mes = list(known_mortality_status.MED_EDU_NBR)

    return processed_mes

def remove_processed_mes(physicians):
    '''Remove physicians whose ME numbers are already processed'''
    processed_list = get_processed_list()
    unprocessed_physicians = physicians[physicians.ME.isin(processed_list) == False]

    return unprocessed_physicians

def match_spec(ppd_df, specialty, spec_table):
    '''Checks fuzzy matching on specialty'''
    mf_spec_cd = ppd_df.iloc[0]['PRIM_SPEC_CD']
    specialty_og = specialty.title()
    specialty = specialty_og.replace('Doctor ', '')
    mf_spec = spec_table[spec_table.SPEC_CD == mf_spec_cd].iloc[0]['DESC'].title()
    specialty = specialty.replace('Primary Care', 'Family Medicine')
    specialty = specialty.replace('Ob-Gyn', 'Obstetrics & Gynecology')
    specialty = specialty.replace('Ob/Gyn', 'Obstetrics & Gynecology')
    specialty = specialty.replace('trist', 'try').replace('gist', 'gy')
    specialty = specialty.replace('eon', 'ery').replace('cian', 's')
    if mf_spec in specialty:
        return True
    elif fuzz.ratio(mf_spec, specialty) > 40:
        return True
    elif mf_spec == 'Unspecified' and 'Resident' in specialty:
        return True
    elif mf_spec == 'Unspecified':
        return True
    elif 'Physician' in specialty_og or specialty_og == 'Doctor':
        return True
    elif 'Family Medicine' in specialty_og and mf_spec == 'Pediatrics':
        return True
    else:
        return False

In [17]:
def fix_me(me_list):
    '''Add leading zeroes to ME numbers'''
    nums = []
    for num in me_list:
        num = str(num)
        num = num.replace('.0', '')
        if len(num) == 10:
            num = '0' + num
        elif len(num) == 9:
            num = '00' + num
        elif len(num) == 8:
            num = '000' + num
        nums.append(num)
    return nums

In [115]:
def split_names(roster_df):
    '''Splits name column into components'''
    roster_df = roster_df.drop_duplicates()
    dict_list = []
    for row in roster_df.itertuples():
        name_parsed = HumanName(row.NAME)
        name_dict = {
            'NAME': row.NAME,
            'FIRST_NAME': name_parsed.first.upper(),
            'LAST_NAME': name_parsed.last.upper(),
            'MIDDLE_NAME': name_parsed.middle.upper(),
            'SUFFIX': name_parsed.suffix.upper(),
            'NICKNAME': name_parsed.nickname.upper(),
            'TITLE': name_parsed.title.upper(),
        }
        dict_list.append(name_dict)
    name_df = pd.DataFrame(dict_list)
    new_df = pd.merge(name_df, roster_df, on='NAME')
    return new_df

def append_me(roster_df, spec_df):
    potential = []
    '''Matches to PPD and appends ME'''
    from_twitter = False
    if 'DATE' in roster_df.columns or 'Date' in roster_df.columns:
        from_twitter = True
        data_split = roster_df
    else:
        data_split = split_names(roster_df)

    bad_spec_words = [
        'NURS',
        'VET',
        'TRANSPORT',
        'ASSISTANT',
        'RECEPTIONIST',
        'TECHNICIAN',
        'PARAMEDIC',
        'AIDE',
        'SOCIAL WORKER',
        'ENTREPRENEUR',
        'SERVICES',
        'GROUPHOME',
        'SECURITY',
        'PHARMACIST',
        'FIRE',
        'EMPLOYEE',
        'DEVELOPER',
        'ADMINISTRATOR',
        'LEADER',
        'LPN',
        'THERAPIST',
        'CLERK',
        'COUNSELOR',
        'ATTENDANT',
        'ADMIN',
        'SUPPLY',
        'CLEAN',
        'PRIEST',
        'STAFF',
        'INVESTIGATOR',
        'MRI',
        'EDUCATOR',
        'OFFICER',
        'MAINTENANCE',
        'CNA',
        'SUPERVISOR',
        'COORDINATOR',
        'SUPERVISOR',
        'TECHNOLOGIST',
        'MECHANIC',
        'EMT'
    ]
    mes = []
    for row in data_split.itertuples():
        physician_me = 'None'
        keep = True
        if from_twitter == False:
            for word in bad_spec_words:
                if word in row.SPECIALTY.upper():
                    keep = False
#         if keep == False:
#             print('---')
#             print(row.NAME)
#             print(row.SPECIALTY)
        if keep:
            print('---')
            print(row.NAME)
            new_df = PPD[(PPD.FIRST_NAME == row.FIRST_NAME) & (PPD.LAST_NAME == row.LAST_NAME)]
            try:
                years = [2019.0 - int(row.AGE), 2020.0 - int(row.AGE)]
            except ValueError:
                years = []
                if len(new_df)>1:
                    if row.STATE == "New York":
                        new_df = new_df[new_df.POLO_STATE == 'NY']
                    elif from_twitter == False:
                        new_df = new_df[new_df.POLO_CITY == row.CITY.upper()]
                    else:
                        new_df = new_df[new_df.STATE == row.STATE]
            if len(new_df) == 0 and len(years) > 0:
                if '-' in row.LAST_NAME:
                    last = row.LAST_NAME.replace('-', ' ')
                elif ' ' in row.LAST_NAME:
                    last = row.LAST_NAME.replace(' ', '')
                else:
                    last = row.LAST_NAME.replace('J', 'G')
                    print(last)
                new_df = PPD[(PPD.LAST_NAME == last) & (PPD.BIRTH_YEAR.isin(years))]
                if len(new_df) == 0:
                    pass
                if len(new_df) > 1:
                    if from_twitter:
                        print(f'{row.NAME} potentially matched to multiple ME numbers.')
                    else:
                        new_df = new_df[new_df.CITY == row.CITY.upper()]
            elif len(new_df) > 1 and len(years) > 0:
                new_df = new_df[new_df.BIRTH_YEAR.isin(years)]
                if len(new_df) > 1 and not from_twitter:
                    new_df = new_df[new_df.CITY == row.CITY.upper()]   
            if len(new_df) == 1:
                if from_twitter:
                    physician_me = new_df.iloc[0]['ME']
                else:
                    if match_spec(new_df, row.SPECIALTY, spec_df):
                        physician_me = new_df.iloc[0]['ME']
            elif len(new_df) > 1:
                print(f'{row.NAME} potentially matched to multiple ME numbers.')
        mes.append(physician_me)
        if keep==True:
            print(physician_me)
            potential.append(row.NAME)

    data_split['ME'] = fix_me(mes)
    data_me = data_split[data_split.ME != 'None']
    return data_split, data_me, potential

In [110]:
ALL_MED, MED_ME, name_list = append_me(medscape, SPEC_FILE)

---
Ashraf Abdo
General Practitioner
91502920371
---
Nancy Ajemian
Family Medicine
42201850562
---
Niaz Ali
Pediatrics
30810840192
---
Nerissa Armesto
Nephrology Renal Dietician Hemodialysis
ARMESTO
None
---
Glenn Barquet
Physician Invasive Cardiologist
01103970403
---
Doug Bass
Medical Director
BASS
None
---
Joseph A. Bonjiorno
Psychiatrist
BONGIORNO
05605670093
---
James Boudwin
Family Practice Physician
03306800069
---
Kenneth Bradshaw
Director of Facilities (retired)
None
---
Irving Buterman
OB-GYN
66001710037
---
Luis Caldera-Nieves
OB-GYN
04201831399
---
John P. Careccia
Chief and Training Director
None
---
Evelyn Caro
Holy Cross Hospital
None
---
Ricardo Castaneda
Psychiatrist
42901790064
---
Sudheer Chauhan
Internal Medicine Chief of Medicine
49541720081
---
Charlie Chin Song Chen
Family Medicine/General Practitioner
02878920972
---
Reza Chowdhury
Internist and Infectious Diseases Physician,Westchester Medical Healthcare
16004870089
---
Kenneth Conte
Family Medicine/General pra

In [55]:
me_name = list(MED_ME['NAME'])

In [46]:
MED_ME[60:]

Unnamed: 0,NAME,FIRST_NAME,LAST_NAME,MIDDLE_NAME,SUFFIX,NICKNAME,TITLE,AGE,SPECIALTY,CITY,STATE,COUNTRY,LOCATION,LINK,ME
278,Ronald Verrier,RONALD,VERRIER,,,,,59,Trauma Surgeon,Bronx,New York,USA,,https://twitter.com/DoctorChrys/status/1248015...,44001860078
287,Barry Webber,BARRY,WEBBER,,,,,67,General Surgery,Queens,New York,USA,Mount Sinai Hospital,https://www.facebook.com/1053125492/posts/1021...,1002821971
291,David Wolin,DAVID,WOLIN,,,,,74,Radiologist Mammography and Breast Imaging,Piermont,New York,USA,Brooklyn Hospital Center,https://www.legacy.com/obituaries/nytimes/obit...,3005731398
294,Sina Zaim,SINA,ZAIM,,,,,68,"Physician,Electrophysiology,Hackensack Univers...",Hackensack,New Jersey,USA,,https://www.legacy.com/amp/obituaries/nytimes/...,3520811361
295,Jesus Zambrano,JESUS,ZAMBRANO,,,,,54,"Pediatrician,Mt. Sinai South NassauOceanside,",Oceanside,New York,USA,,https://www.liherald.com/stories/three-employe...,30801900221
297,Jack Zoller,JACK,ZOLLER,,,,,91,OB-GYN,New Orleans,Louisiana,USA,,https://obits.nola.com/obituaries/nola/obituar...,2105500664


In [36]:
bad_spec_words = ['NURS',
        'VET',
        'TRANSPORT',
        'ASSISTANT',
        'RECEPTIONIST',
        'TECHNICIAN',
        'PARAMEDIC',
        'AIDE',
        'SOCIAL WORKER',
        'ENTREPRENEUR',
        'SERVICES',
        'GROUPHOME',
        'RN',
        'SECURITY',
        'PHARMACIST',
        'FIRE',
        'EMPLOYEE',
        'DEVELOPER',
        'ADMINISTRATOR',
        'LEADER',
        'LPN',
        'THERAPIST',
        'CLERK',
        'COUNSELOR',
        'ATTENDANT',
        'ADMIN',
        'SUPPLY',
        'CLEAN',
        'PRIEST',
        'STAFF',
        'INVESTIGATOR',
        'MRI',
        'EDUCATOR',
        'OFFICER',
        'MAINTENANCE',
        'CNA',
        'SUPERVISOR',
        'COORDINATOR',
        'SUPERVISOR',
        'TECHNOLOGIST',
        'MECHANIC',
        'EMT'
    ]

In [37]:
sp = 'Internal Medicine Chief of Medicine'
for word in bad_spec_words:
    if word in sp.upper():
        print(word)

RN


In [50]:
len(name_list)

95

In [58]:
beep = medscape[(medscape.NAME.isin(name_list))&(medscape.NAME.isin(me_name)==False)]

In [67]:
len(MED_ME)

66

In [68]:
len(beep)

33

In [71]:
zz = pd.merge(beep, MED_ME, on='NAME', how='left')

In [73]:
PPD['ME']=fix_me(PPD['ME'])

In [80]:
ppd = PPD[['ME','POLO_CITY','POLO_STATE','MAILING_NAME','LAST_NAME','FIRST_NAME','MIDDLE_NAME','BIRTH_YEAR','PRIM_SPEC_CD']]

In [82]:
ppd = pd.merge(ppd, SPEC_FILE, left_on='PRIM_SPEC_CD', right_on='SPEC_CD')

In [83]:
pd.merge(zz, ppd, on='ME', how='left')

Unnamed: 0,NAME,AGE_x,SPECIALTY_x,CITY_x,STATE_x,COUNTRY_x,LOCATION_x,LINK_x,FIRST_NAME_x,LAST_NAME_x,...,POLO_STATE,MAILING_NAME,LAST_NAME_y,FIRST_NAME_y,MIDDLE_NAME_y,BIRTH_YEAR,PRIM_SPEC_CD,SPEC_CD,SPEC_ID,DESC
0,Nerissa Armesto,64,Nephrology Renal Dietician Hemodialysis,Brooklyn,New York,USA,Rogosin Institute NFB Brooklyn East,https://www.gofundme.com/f/nerissa-armesto-mem...,NERISSA,ARMESTO,...,PA,DAVID M ARMESTO MD,ARMESTO,DAVID,MICHAEL,1955.0,OPH,OPH,1783.0,OPHTHALMOLOGY
1,Doug Bass,64,Medical Director,New York City,New York,USA,,https://abc7ny.com/doctor-dead-doug-bass-kille...,,,...,,,,,,,,,,
2,Kenneth Bradshaw,64,Director of Facilities (retired),Memphis,Tennessee,USA,University of Health Science Center,https://www.legacy.com/obituaries/commercialap...,KENNETH,BRADSHAW,...,,KENNETH G BRADSHAW DO,BRADSHAW,KENNETH,G,1955.0,FM,FM,1691.0,FAMILY MEDICINE
3,John P. Careccia,age unknown,Chief and Training Director,Woodbridge,New Jersey,USA,Woodbridge Township Ambulance & Rescue Squad,https://www.firehouse.com/safety-health/news/2...,,,...,,,,,,,,,,
4,Evelyn Caro,69,Holy Cross Hospital,Silver Spring,Maryland,USA,,https://www.baltimoresun.com/coronavirus/bs-md...,EVELYN,CARO,...,CA,EVELYN A CARO MD,CARO,EVELYN,A,1937.0,P,P,1793.0,PSYCHIATRY
5,Kenneth Conte,75,Family Medicine/General practitioner,Garfield,New Jersey,USA,,https://www.northjersey.com/story/news/bergen/...,KENNETH,CONTE,...,NJ,KENNETH S CONTE DO,CONTE,KENNETH,S,1941.0,R,R,1853.0,RADIOLOGY
6,Jeannie Danker,60,Director of Radiology,Columbus,Ohio,USA,OSU Wexner Medical Center,https://www.dispatch.com/news/20200331/coronav...,,,...,,,,,,,,,,
7,"Ernesto ""Audie"" DeLeon",61,ADN,New York City,New York,USA,,https://www.dignitymemorial.com/obituaries/new...,ERNESTO,DELEON,...,,ERNESTO DELEON MD,DELEON,ERNESTO,,1969.0,EM,EM,1682.0,EMERGENCY MEDICINE
8,Elizabeth Edwards,age unknown,Phlebotomist,New Bern,North Carolina,USA,CarolinaEast Medical Center,https://www.newbernsj.com/news/20200416/craven...,,,...,,,,,,,,,,
9,Norman Einhorn,69,Other Optometrist Neuro Optometry,Belmar,New Jersey,USA,Center for Visual Rehabilitation,https://www.dignitymemorial.com/obituaries/oce...,,,...,,,,,,,,,,


In [91]:
ppd[(ppd.LAST_NAME=='BASS')&(ppd.FIRST_NAME=='DOUGLAS')]

Unnamed: 0,ME,POLO_CITY,POLO_STATE,MAILING_NAME,LAST_NAME,FIRST_NAME,MIDDLE_NAME,BIRTH_YEAR,PRIM_SPEC_CD,SPEC_CD,SPEC_ID,DESC
820920,3503830077,ASTORIA,NY,DOUGLAS H BASS MD,BASS,DOUGLAS,HOWARD,1956.0,AN,AN,1639,ANESTHESIOLOGY


In [108]:
ppd[(ppd.LAST_NAME=='MBAH')]

Unnamed: 0,ME,POLO_CITY,POLO_STATE,MAILING_NAME,LAST_NAME,FIRST_NAME,MIDDLE_NAME,BIRTH_YEAR,PRIM_SPEC_CD,SPEC_CD,SPEC_ID,DESC
16073,2002141259,PITTSBURGH,PA,NSEHNIITOOH A MBAH MD,MBAH,NSEHNIITOOH,APPOLINARISE,1980.0,EM,EM,1682,EMERGENCY MEDICINE
44836,4802182151,HOUSTON,TX,BRIAN N MBAH MD,MBAH,BRIAN,NNAMDI,1991.0,EM,EM,1682,EMERGENCY MEDICINE
278671,69012970013,NEWARK,OH,MAY U MBAH MD,MBAH,MAY,UZOAMAKA,1971.0,IM,IM,1724,INTERNAL MEDICINE
348943,69007940092,GREEN BAY,WI,NGOZI N MBAH MD,MBAH,NGOZI,NWAMAKA,1966.0,OBG,OBG,1769,OBSTETRICS & GYNECOLOGY
573770,4814100109,PLANO,TX,LYNDA E MBAH MD,MBAH,LYNDA,EBERE,1975.0,FM,FM,1691,FAMILY MEDICINE
1147637,69014980041,,,ENJONG M MBAH MD,MBAH,ENJONG,MARY,1970.0,US,US,1883,UNSPECIFIED


In [87]:
ppd[(ppd.LAST_NAME=='CONTE')&(ppd.FIRST_NAME=='KENNETH')]

Unnamed: 0,ME,POLO_CITY,POLO_STATE,MAILING_NAME,LAST_NAME,FIRST_NAME,MIDDLE_NAME,BIRTH_YEAR,PRIM_SPEC_CD,SPEC_CD,SPEC_ID,DESC
945129,2878660733,GARFIELD,NJ,KENNETH S CONTE DO,CONTE,KENNETH,S,1941.0,R,R,1853,RADIOLOGY


In [111]:
MED_ME

Unnamed: 0,NAME,FIRST_NAME,LAST_NAME,MIDDLE_NAME,SUFFIX,NICKNAME,TITLE,AGE,SPECIALTY,CITY,STATE,COUNTRY,LOCATION,LINK,ME
0,Ashraf Abdo,ASHRAF,ABDO,,,,,60,General Practitioner,New York City,New York,USA,,https://www.egypttoday.com/Article/1/83243/Egy...,91502920371
5,Nancy Ajemian,NANCY,AJEMIAN,,,,,60,Family Medicine,Grosse Pointe,Michigan,USA,Beaumont Hospital,https://twitter.com/DexterCMason/status/124988...,42201850562
9,Niaz Ali,NIAZ,ALI,,,,,71,Pediatrics,Red Bank,New Jersey,USA,Riverview Medical Center,,30810840192
21,Glenn Barquet,GLENN,BARQUET,,,,,50,Physician Invasive Cardiologist,Miami,Florida,USA,Mercy Hospital and South Miami Hospital,https://www.cardiovascularbusiness.com/topics/...,1103970403
38,Joseph A. Bonjiorno,JOSEPH,BONJIORNO,A.,,,,78,Psychiatrist,Chicago,Illinois,USA,St. Joseph Hospital (affiliated),https://www.cremation-society.com/obituaries/D...,5605670093
39,James Boudwin,JAMES,BOUDWIN,,,,,67,Family Practice Physician,New Brunswick,New Jersey,USA,Robert Wood Johnson University Hospital,https://www.legacy.com/obituaries/name/james-b...,3306800069
51,Irving Buterman,IRVING,BUTERMAN,,,,,78,OB-GYN,New York City,New York,USA,,https://www.google.com/amp/s/www.legacy.com/ob...,66001710037
55,Luis Caldera-Nieves,LUIS,CALDERA-NIEVES,,,,,63,OB-GYN,Miami,Florida,USA,,https://www.local10.com/news/local/2020/04/09/...,4201831399
63,Ricardo Castaneda,RICARDO,CASTANEDA,,,,,64,Psychiatrist,New York City,New York,USA,,,42901790064
66,Sudheer Chauhan,SUDHEER,CHAUHAN,,,,,age unknown,Internal Medicine Chief of Medicine,Queens,New York,USA,Jamaica Hospital Medical Center,https://www.moloneyfh.com/obituary/sudheer-sin...,49541720081


In [116]:
ALL_T, T_ME, T_name_list = append_me(twitter, SPEC_FILE)

---
Dr. Barry Fisher
Dr. Barry Fisher potentially matched to multiple ME numbers.
None
---
Dr. Guido Volcovici
78101620041
---
Dr. Hooshang Behroozi
51701560764
---
Dr. Robert Oglesbee
OGLESBEE
03901680727
---
Dr. John Makley
03841610490
---
Dr. Albasha Hume
HUME
None
---
Dr. Craig Smallwood
None
---
Dr. Michael Medici
01002670662
---
Dr. Stephan Kamholz
03509720559
---
Dr. Robert “Ray” Hull
04706711649
---
Dr. Herbert Henderson Jr
04501940441
---
Dr. Jean Lau Chin
None
---
Dr. Norman Einhorn
EINHORN
Dr. Norman Einhorn potentially matched to multiple ME numbers.
Dr. Norman Einhorn potentially matched to multiple ME numbers.
None
---
Dr. Chaihan Korn
89104710094
---
Dr. Franklin Sequeira
49552730029
---
Dr. Edgar Ednalino
74801722028
---
Dr. Niaz Ali
30810840192
---
Dr. Jay Kavet
64914753544
---
Dr. Ronald Brisman
02401650141
---
Dr. Ivan Rodriguez
64935850884
---
Dr. Earline Austin
AUSTIN
Dr. Earline Austin potentially matched to multiple ME numbers.
Dr. Earline Austin potentially matc

In [117]:
T_ME

Unnamed: 0,NAME,FIRST_NAME,LAST_NAME,MIDDLE_NAME,SUFFIX,NICKNAME,TITLE,AGE,DATE,STATE,LINK,ME
1,Dr. Guido Volcovici,GUIDO,VOLCOVICI,,,,DR.,79,2020-06-27 00:39:00+00:00,,https://www.pleasantmanorfh.com/obituary/Guido...,78101620041
2,Dr. Hooshang Behroozi,HOOSHANG,BEHROOZI,,,,DR.,91,2020-06-26 23:29:00+00:00,,https://www.legacy.com/obituaries/nytimes/obit...,51701560764
3,Dr. Robert Oglesbee,ROBERT,OGLESBEE,,,,DR.,80,2020-06-25 00:37:00+00:00,OK,http://totallytahlequah.blogspot.com/2020/04/o...,03901680727
4,Dr. John Makley,JOHN,MAKLEY,,,,DR.,84,2020-06-24 23:01:00+00:00,,https://ryortho.com/2020/06/renowned-orthopedi...,03841610490
7,Dr. Michael Medici,MICHAEL,MEDICI,,,,DR.,78,2020-06-13 13:40:09+00:00,CA,https://www.google.com/amp/s/www.legacy.com/ob...,01002670662
...,...,...,...,...,...,...,...,...,...,...,...,...
106,Dr. Ashraf Metwally,ASHRAF,METWALLY,,,,DR.,,2020-04-05 11:59:10+00:00,,,91503870156
108,Dr. Mark Allen Respler,MARK,RESPLER,ALLEN,,,DR.,66,2020-04-03 02:58:13+00:00,,https://hamodia.com/2020/03/27/bde-dr-mark-all...,03508801512
109,Dr. Frank Gabrin,FRANK,GABRIN,,,,DR.,,2020-04-01 13:19:36+00:00,,,04177851693
110,Dr. Stephen Schwartz,STEPHEN,SCHWARTZ,,,,DR.,78,2020-04-01 01:38:07+00:00,,,02405670502


In [123]:
xx = pd.merge(T_ME, heroes, on='ME', how='outer', suffixes=('_twitter','_hero'))

In [119]:
heroes['ME']=fix_me(heroes['ME'])

In [125]:
remove_processed_mes(xx)

Unnamed: 0,NAME_twitter,FIRST_NAME_twitter,LAST_NAME_twitter,MIDDLE_NAME_twitter,SUFFIX_twitter,NICKNAME_twitter,TITLE_twitter,AGE_twitter,DATE,STATE_twitter,...,SUFFIX_hero,NICKNAME_hero,TITLE_hero,AGE_hero,SPECIALTY,CITY,STATE_hero,COUNTRY,LOCATION,LINK_hero
0,Dr. Guido Volcovici,GUIDO,VOLCOVICI,,,,DR.,79.0,2020-06-27 00:39:00+00:00,,...,,,,,,,,,,
1,Dr. Hooshang Behroozi,HOOSHANG,BEHROOZI,,,,DR.,91.0,2020-06-26 23:29:00+00:00,,...,,,,,,,,,,
2,Dr. Robert Oglesbee,ROBERT,OGLESBEE,,,,DR.,80.0,2020-06-25 00:37:00+00:00,OK,...,,,,,,,,,,
3,Dr. John Makley,JOHN,MAKLEY,,,,DR.,84.0,2020-06-24 23:01:00+00:00,,...,,,,,,,,,,
27,Dr. Richard Miles,RICHARD,MILES,,,,DR.,,2020-05-20 19:08:35+00:00,MI,...,,,,,,,,,,
32,Dr. Richard Mills,RICHARD,MILLS,,,,DR.,,2020-05-12 14:28:13+00:00,WA,...,,,,,,,,,,
42,Dr. Sol Dan,SOL,DAN,,,,DR.,70.0,2020-05-02 17:55:29+00:00,NY,...,,,,,,,,,,
56,Dr. Jacob Zeiger Slepian,JACOB,SLEPIAN,ZEIGER,,,DR.,82.0,2020-04-18 12:46:48+00:00,MA,...,,,,,,,,,,
72,Dr. Gregory V. Miller,GREGORY,MILLER,V.,,,DR.,,2020-04-05 19:34:48+00:00,GA,...,,,,,,,,,,
73,Dr. Ashraf Metwally,ASHRAF,METWALLY,,,,DR.,,2020-04-05 11:59:10+00:00,,...,,,,,,,,,,


In [127]:
xx = xx.drop_duplicates()

In [161]:
med_me['ME']=fix_me(med_me['ME'])

In [162]:
remove_processed_mes(med_me)

Unnamed: 0,NAME,FIRST_NAME,LAST_NAME,MIDDLE_NAME,SUFFIX,NICKNAME,TITLE,AGE,SPECIALTY,CITY,STATE,COUNTRY,LOCATION,LINK,ME


In [157]:
remove_processed_mes(VV).to_excel('../../Data/Medscape/2020-07-01/Other_Unprocessed_2020-07-01.xlsx')

In [156]:
ppd[ppd['ME']=='56101680474']

Unnamed: 0,ME,POLO_CITY,POLO_STATE,MAILING_NAME,LAST_NAME,FIRST_NAME,MIDDLE_NAME,BIRTH_YEAR,PRIM_SPEC_CD,SPEC_CD,SPEC_ID,DESC
444246,56101680474,AKRON,OH,JACOB Z SLEPIAN MD,SLEPIAN,JACOB,ZEIGER,1938.0,OTO,OTO,1791,OTOLARYNGOLOGY-HEAD AND NECK SURGERY


In [130]:
MED_ME.to_excel('../../Data/Medscape/2020-07-01/Memorium_USA_Physicians_2020-07-01.xlsx')

In [159]:
med_me = pd.read_excel('../../Data/Medscape/2020-07-01/Memorium_USA_Physicians_2020-07-01.xlsx')

In [133]:
medscape[medscape['NAME']=='Kenneth Conte']

Unnamed: 0,NAME,AGE,SPECIALTY,CITY,STATE,COUNTRY,LOCATION,LINK
74,Kenneth Conte,75,Family Medicine/General practitioner,Garfield,New Jersey,USA,,https://www.northjersey.com/story/news/bergen/...


In [137]:
xx = xx.fillna('None')

In [144]:
def clean_other(xx):
    xx = xx.fillna('None')
    xx = xx.drop_duplicates()
    YY = xx[['ME','DATE']]
    dict__list = []
    for row in xx.itertuples():
        me=row.ME
        if row.NAME_twitter=='None':
            name = row.NAME_hero
            first_name = row.FIRST_NAME_hero
            middle_name = row.MIDDLE_NAME_hero
            title = row.TITLE_hero
            nickname = row.NICKNAME_hero
            last_name = row.LAST_NAME_hero
            age = row.AGE_hero
            state = row.STATE_hero
        else:
            name = row.NAME_twitter
            first_name = row.FIRST_NAME_twitter
            middle_name = row.MIDDLE_NAME_twitter
            title = row.TITLE_twitter
            nickname = row.NICKNAME_twitter
            last_name = row.LAST_NAME_twitter
            age = row.AGE_twitter
            state = row.STATE_twitter
        if row.LINK_twitter =='None':
            link = row.LINK_hero
        else:
            link = row.LINK_twitter
        new_dict = {
            'NAME':name,
            'FIRST_NAME':first_name,
            'MIDDLE_NAME': middle_name,
            'LAST_NAME': last_name,
            'NICKNAME': nickname,
            'TITLE': title,
            'AGE': age,
            'STATE': state,
            'LINK': link,
            'ME':me
        }
        dict__list.append(new_dict)
    WW = pd.DataFrame(dict__list)
    VV = pd.merge(YY, WW, on='ME')
    return('VV')

In [148]:
WW = pd.DataFrame(dict__list)

In [142]:
xx.columns

Index(['NAME_twitter', 'FIRST_NAME_twitter', 'LAST_NAME_twitter',
       'MIDDLE_NAME_twitter', 'SUFFIX_twitter', 'NICKNAME_twitter',
       'TITLE_twitter', 'AGE_twitter', 'DATE', 'STATE_twitter', 'LINK_twitter',
       'ME', 'NAME_hero', 'FIRST_NAME_hero', 'LAST_NAME_hero',
       'MIDDLE_NAME_hero', 'SUFFIX_hero', 'NICKNAME_hero', 'TITLE_hero',
       'AGE_hero', 'SPECIALTY', 'CITY', 'STATE_hero', 'COUNTRY', 'LOCATION',
       'LINK_hero'],
      dtype='object')

In [147]:
YY = xx[['ME','CITY','COUNTRY','LOCATION','DATE']]

In [150]:
VV = pd.merge(YY, WW, on='ME')

In [166]:
VV[['ME', 'DATE', 'NAME', 'FIRST_NAME',
       'MIDDLE_NAME', 'LAST_NAME', 'NICKNAME', 'TITLE', 'AGE', 'STATE',
       'LINK']].to_excel('../../Data/Medscape/2020-07-01/Other_Physicians_2020-07-01.xlsx')

In [170]:
VV['DATE'][0].strftime("%m-%d-%Y")

AttributeError: 'str' object has no attribute 'strftime'

In [175]:
ppd[ppd.ME=='00801680518']

Unnamed: 0,ME,POLO_CITY,POLO_STATE,MAILING_NAME,LAST_NAME,FIRST_NAME,MIDDLE_NAME,BIRTH_YEAR,PRIM_SPEC_CD,SPEC_CD,SPEC_ID,DESC
637880,801680518,,,RICHARD P MILLS MD,MILLS,RICHARD,PENCE,1943.0,OPH,OPH,1783,OPHTHALMOLOGY
