In [1]:
import pandas as pd
import numpy as np
import dask as dd
from typing import Tuple
import shortuuid

In [2]:
# Workbook sheets that start_single_process loads with read_excel.
relevant_sheets = ['Actors', 'Organizations', 'Positions' , 'Project']
# Columns of the 'Actors' sheet that are lower-cased before matching.
to_lower_lists = ['Last Name',	'MI', 'First Name'	,'Name'	,'Short Name']
# Columns that are never lower-cased, in either the 'Actors' or 'Organizations' sheet.
ignore_list = ["XML"]
# Accumulators for records created during this run: one list per column,
# appended to by create_actor / create_organization.
# NOTE(review): the key 'mnemnoic' is misspelled, but create_organization
# writes to this exact key, so it must be renamed in both places together.
new_actors = {'full_name': [] , 'first_name':[] , 'last_name':[] , 'middle_initial':[] , 'act_id': []}
new_organizations = {'name': [] , 'short_name':[] ,'mnemnoic':[] ,'category': [] , 'org_id': [], 'sector':[] , 'second_sector':[]}
# Previously saved records that incoming rows are matched against.
# Paths are relative to the notebook's working directory.
existing_actors = pd.read_csv('actors.csv')
existing_organizations = pd.read_csv('organizations.csv')

In [3]:
def create_actor(full_name:str, **kwargs)-> str:
    """Append a new actor to the ``new_actors`` accumulator and return its id.

    Args:
        full_name: Full name of the actor; stored exactly as given.
        **kwargs: Optional ``first_name``, ``last_name`` and ``middle_initial``.
            Missing values (``None`` or NaN) are stored as ``None`` instead of
            the strings ``'None'`` / ``'nan'``; anything else is stored as
            ``str(value)``. Leading/trailing ``.`` and ``,`` are stripped from
            the middle initial.

    Returns:
        The newly generated 12-character id (a string), which is also appended
        to ``new_actors['act_id']``.
    """
    def _as_text(value):
        # Spreadsheet blanks arrive as NaN; keep them (and None) as None so the
        # accumulator never holds the literal text 'None' or 'nan'.
        if value is None or pd.isna(value):
            return None
        return str(value)

    first_name = _as_text(kwargs.get('first_name', None))
    last_name = _as_text(kwargs.get('last_name', None))
    middle_initial = _as_text(kwargs.get('middle_initial', None))
    if middle_initial is not None:
        middle_initial = middle_initial.strip('.,')

    # Named act_id rather than id so the builtin id() is not shadowed.
    act_id = shortuuid.ShortUUID().random(length=12)
    new_actors['full_name'].append(full_name)
    new_actors['first_name'].append(first_name)
    new_actors['last_name'].append(last_name)
    new_actors['middle_initial'].append(middle_initial)
    new_actors['act_id'].append(act_id)
    return act_id


def check_actor_exist(full_name:str, **kwargs)-> Tuple[bool , int]:
    """Find an actor in ``existing_actors``, creating a new one if unknown.

    Matching order:
      1. exact match on the ``full_name`` column;
      2. otherwise, match on ``last_name`` and ``first_name`` together
         (only when both are supplied);
      3. otherwise a new actor is registered through ``create_actor``.

    Args:
        full_name: Full name to look up.
        **kwargs: Optional ``first_name``, ``last_name``, ``middle_initial``.
            ``None`` and NaN (blank spreadsheet cells) both count as missing.

    Returns:
        ``(True, act_id)`` when a single existing actor matched or a new one
        was created; ``(False, None)`` when no name was supplied at all or
        when ``full_name`` matches more than one existing row. ``act_id`` is
        the value from the ``act_id`` column or the id returned by
        ``create_actor``.
    """
    def _given(value):
        # Blank cells come through as NaN, which must not count as a name.
        return value is not None and not pd.isna(value)

    first_name = kwargs.get('first_name' , None)
    last_name = kwargs.get('last_name' , None)
    middle_initial = kwargs.get('middle_initial' , None)

    if not (_given(full_name) or _given(first_name) or _given(last_name)):
        return (False , None)

    by_full_name = existing_actors.loc[existing_actors['full_name'] == full_name]
    if by_full_name.shape[0] > 1:
        # Ambiguous: several stored actors share this full name.
        return (False , None)
    if not by_full_name.empty:
        return (True , by_full_name['act_id'].to_numpy()[0])

    if _given(last_name) and _given(first_name):
        same_last_name = existing_actors.loc[existing_actors['last_name'] == last_name]
        same_person = same_last_name.loc[same_last_name['first_name'] == first_name]
        if not same_person.empty:
            return (True , same_person['act_id'].to_numpy()[0])

    # Every not-found path ends here, including the case where only a full name
    # was supplied (which used to fall through and return None).
    new_id = create_actor(full_name , first_name=first_name , last_name=last_name , middle_initial=middle_initial)
    return (True , new_id)


In [4]:
def check_all_actors(actors):
    """Run ``check_actor_exist`` on every row of ``actors`` and print the results.

    Each row supplies the 'Name', 'First Name', 'Last Name' and 'MI' columns.

    Returns:
        ``(True, 1)`` when every row was processed, or ``(False, -1, error)``
        with the caught exception when any step raised.
    """
    def _lookup(row):
        return check_actor_exist(
            row['Name'],
            first_name=row['First Name'],
            last_name=row['Last Name'],
            middle_initial=row['MI'],
        )

    try:
        per_row_results = actors.apply(_lookup, axis=1)
        print(per_row_results)
    except Exception as error:
        return (False , -1 , error)
    return (True , 1)




In [5]:

def create_organization(name:str , **kwargs)-> str:
    """Append a new organization to ``new_organizations`` and return its id.

    Args:
        name: Organization name; stored as given.
        **kwargs: Optional ``short_name``, ``category``, ``sector``,
            ``second_sector`` and ``mnemonic``. The misspelled keyword
            ``mnemoic`` is still accepted as a fallback for ``mnemonic``.

    Returns:
        The newly generated 12-character id (a string), which is also appended
        to ``new_organizations['org_id']``.
    """
    short_name = kwargs.get('short_name', None)
    category = kwargs.get('category', None)
    sector = kwargs.get('sector', None)
    second_sector = kwargs.get('second_sector', None)
    # Callers pass ``mnemonic``; reading only the misspelled key silently
    # dropped the value, so the correct spelling is consulted first.
    mnemonic = kwargs.get('mnemonic', kwargs.get('mnemoic', None))
    org_id = shortuuid.ShortUUID().random(length=12)
    new_organizations['name'].append(name)
    new_organizations['short_name'].append(short_name)
    new_organizations['category'].append(category)
    new_organizations['sector'].append(sector)
    new_organizations['second_sector'].append(second_sector)
    # The accumulator key itself is spelled 'mnemnoic' where it is defined.
    new_organizations['mnemnoic'].append(mnemonic)
    new_organizations['org_id'].append(org_id)
    # Return the generated id, not the builtin ``id`` function.
    return org_id


def check_organization_exists(name:str , **kwargs)-> Tuple[bool , int]:
    """Find an organization in ``existing_organizations``, creating it if unknown.

    Matching order: exact ``name``, then ``short_name``, then mnemonic. If
    nothing matches, a new organization is registered through
    ``create_organization``.

    Args:
        name: Organization name to look up.
        **kwargs: Optional ``short_name``, ``category``, ``sector``,
            ``second_sector`` and ``mnemonic``. The misspelled keyword
            ``mnemoic`` is still accepted as a fallback for ``mnemonic``.

    Returns:
        ``(True, org_id)`` where ``org_id`` comes from the first matching row's
        ``org_id`` column or from ``create_organization``.
    """
    short_name = kwargs.get('short_name', None)
    category = kwargs.get('category', None)
    sector = kwargs.get('sector', None)
    second_sector = kwargs.get('second_sector', None)
    mnemonic = kwargs.get('mnemonic', kwargs.get('mnemoic', None))

    by_name = existing_organizations.loc[existing_organizations['name'] == name]
    if not by_name.empty:
        return (True , by_name['org_id'].to_numpy()[0])

    if short_name is not None:
        by_short_name = existing_organizations.loc[existing_organizations['short_name'] == short_name]
        if not by_short_name.empty:
            return (True , by_short_name['org_id'].to_numpy()[0])

    if mnemonic is not None:
        # The stored table may carry the column under either spelling (the
        # accumulator for new organizations uses 'mnemnoic'), so check
        # whichever is present.
        for column in ('mnemonic', 'mnemnoic'):
            if column not in existing_organizations.columns:
                continue
            by_mnemonic = existing_organizations.loc[existing_organizations[column] == mnemonic]
            if not by_mnemonic.empty:
                return (True , by_mnemonic['org_id'].to_numpy()[0])

    new_id = create_organization(name , short_name=short_name , category=category , sector=sector ,second_sector=second_sector , mnemonic=mnemonic)
    return (True , new_id)

def check_all_orgs(organizations):
    """Run ``check_organization_exists`` on every row of ``organizations``.

    Each row supplies the 'Name', 'Short name', 'Category', 'Sector' and
    'Sector2' columns. A 'Mnemnoic' column is forwarded as ``mnemonic`` when
    present (``None`` otherwise), as start_single_process does.

    Returns:
        ``(True, 1)`` once every row has been processed.

    Raises:
        Any exception raised while processing a row propagates unchanged.
    """
    def _lookup(row):
        # The original called a non-existent name (missing trailing "s"),
        # which raised NameError on the first row.
        return check_organization_exists(
            row['Name'],
            short_name=row['Short name'],
            category=row['Category'],
            sector=row['Sector'],
            second_sector=row['Sector2'],
            mnemonic=row.get('Mnemnoic'),
        )

    organizations.apply(_lookup, axis=1)
    return (True , 1)
        

In [6]:
def tryConvertStr(val):
    """Return ``str(val)`` lower-cased, or ``None`` if the conversion fails.

    On failure the value's ``repr`` is printed so the offending input can be
    identified.
    """
    try:
        return str(val).lower()
    except Exception:
        # Narrowed from a bare ``except`` so KeyboardInterrupt/SystemExit still
        # propagate. repr() is printed because print(val) would call the same
        # failing str(val) again and raise from inside the handler.
        print(repr(val))
        return None

def start_single_process(filename: str , combined_actor_dataframe , combined_organization_dataframe) -> Tuple[bool, dict]:
    """Import one workbook and merge its actors and organizations into the given tables.

    Every sheet named in ``relevant_sheets`` is read. Each row of the 'Actors'
    and 'Organizations' sheets is resolved to an existing or newly created id.
    Records created so far (the ``new_actors`` / ``new_organizations``
    accumulators) are concatenated onto the supplied dataframes.

    Args:
        filename: Path of the Excel workbook to read.
        combined_actor_dataframe: Actor table the new actors are appended to.
        combined_organization_dataframe: Organization table the new
            organizations are appended to.

    Returns:
        ``(True, payload)`` where ``payload`` has the keys ``"new_actors"`` and
        ``"new_organizations"`` (the concatenated dataframes) and
        ``"act_id_map"`` and ``"org_id_maps"`` (sheet ACTID/ORGID -> resolved id).

    Raises:
        Any exception from reading the workbook or resolving a row propagates.
    """
    act_id_maps = {}
    org_id_maps = {}

    # ExcelFile is a context manager; closing it releases the workbook handle.
    with pd.ExcelFile(filename) as workbook:
        # ``squeeze`` is no longer accepted by read_excel (removed in pandas
        # 2.0); the value passed before (False) was its default anyway.
        dataframe_lists = {
            sheet_name: pd.read_excel(workbook, sheet_name=sheet_name)
            for sheet_name in relevant_sheets
        }

    dataframe_lists['Actors'] = dataframe_lists['Actors'].apply(lambda x : x.str.lower() if x.name in to_lower_lists and x.name not in ignore_list else x)
    dataframe_lists['Organizations'] = dataframe_lists['Organizations'].apply(lambda x: x.str.lower() if x.name not in ignore_list else x)
    print("finished conversion")

    # Actors: map each sheet ACTID to an existing or newly created id.
    for _, row in dataframe_lists['Actors'].iterrows():
        response = check_actor_exist(row['Name'], first_name=row['First Name'] , last_name=row['Last Name'] , middle_initial=row['MI'])
        if not response[0]:
            print("check_actor_exist returned false")
            continue
        act_id_maps[row["ACTID"]] = response[1]
    # Build a local frame instead of rebinding the module-level accumulator:
    # turning the global dict into a DataFrame broke later ``.append`` calls
    # in create_actor when this function or cell was run again.
    new_actor_frame = pd.DataFrame(new_actors)
    new_combined_actor_dataframe = pd.concat([combined_actor_dataframe , new_actor_frame])
    print("finished the actors")

    # Organizations: same procedure, keyed by the sheet ORGID.
    for _, row in dataframe_lists['Organizations'].iterrows():
        response = check_organization_exists(row["Name"], short_name=row["Short name"], category=row["Category"], sector=row["Sector"], second_sector=row["Sector2"], mnemonic=row["Mnemnoic"])
        if not response[0]:
            print("check_organization_exist returned false")
            # Skip the mapping on failure, consistent with the actor loop.
            continue
        org_id_maps[row["ORGID"]] = response[1]
    new_organization_frame = pd.DataFrame(new_organizations)
    new_combined_organization_dataframe = pd.concat([combined_organization_dataframe , new_organization_frame])
    print("function finished")

    return (True , {"new_actors":new_combined_actor_dataframe , "act_id_map":act_id_maps , "new_organizations":new_combined_organization_dataframe , "org_id_maps":org_id_maps})
# Run the import for a single workbook and persist the merged tables.
# NOTE(review): the CSVs written here are the same relative paths that were
# loaded into existing_actors / existing_organizations earlier, so each run
# overwrites its own inputs.
try:
    response = start_single_process('../Malcom Feb 17 2020.xlsx', existing_actors , existing_organizations)
    # response[1] is the payload dict returned by start_single_process.
    response[1]["new_actors"].to_csv("./actors.csv" , index=False)
    response[1]["new_organizations"].to_csv("./organizations.csv",index=False)
except Exception as e:
    # Failures are only printed (message without traceback); nothing is re-raised.
    print("error")
    print(e)


finished conversion
finished the actors
function finished
