In [1]:
#import packages
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

#####-----Read all input data file name and path into dictionary-----#####  
def read_all_inputs(path_to_conf):
    """Load every CSV listed in a config file into a dictionary of DataFrames.

    Each non-blank line of the config file has the form ``name:path``. Only
    the first colon separates the name from the path, so a path that itself
    contains a colon (for example a Windows drive letter) is kept intact.
    Surrounding whitespace on the name and on the path is ignored, and blank
    lines are skipped.

    Parameters
    ----------
    path_to_conf : str
        Path of the config file to read.

    Returns
    -------
    dict
        Maps each name to the DataFrame produced by
        ``pd.read_csv(path, index_col=0)``.

    Raises
    ------
    ValueError
        If a non-blank line contains no colon.
    """
    df_dic = {}
    with open(path_to_conf, 'r') as conf:
        for line_number, line in enumerate(conf, start=1):
            entry = line.strip()
            if not entry:
                # tolerate blank lines (e.g. a trailing newline at end of file)
                continue
            # partition splits on the FIRST colon only, unlike split(':')
            name, separator, csv_path = entry.partition(':')
            if not separator:
                raise ValueError(
                    "line %d of %s is not of the form 'name:path': %r"
                    % (line_number, path_to_conf, entry)
                )
            df_dic[name.strip()] = pd.read_csv(csv_path.strip(), index_col=0)
    return df_dic

#####-----Prepare Organization's External Data-----#####  
def external_data(df_organization,df_external_data):
    """Append placeholder external metrics for organizations that have none.

    Every distinct ``uid`` in ``df_organization`` that does not occur in
    ``df_external_data['uid']`` gets one extra row filled with fixed default
    values. The ``oname`` of such a row is taken from the ``website`` column
    of the first ``df_organization`` row carrying that uid.

    Parameters
    ----------
    df_organization : pandas.DataFrame
        Must provide the columns ``uid`` and ``website``.
    df_external_data : pandas.DataFrame
        Must provide the column ``uid``.

    Returns
    -------
    pandas.DataFrame
        ``df_external_data`` followed by the placeholder rows, re-indexed
        from zero.
    """
    # (column name, default value) pairs, in the column order of the result
    placeholder_defaults = (
        ('active_regions', 1),
        ('age', 1),
        ('community_outreach', 1),
        ('international_student_employee_rate', 0),
        ('consumer_rating', 1),
        ('employee_diversity', 1),
        ('financial_situation', 1),
        ('employee_average_salary', 30000),  # default salary of $30,000
        ('retention_rate', 0.5),             # default retention rate
        ('employee_benefits', 2.5),          # default employee benefits
        ('media_coverage', 1),
        ('number_of_employee', 1),
        ('office_locations', 1),
        ('revenue', 1),
        ('social_media_activity', 1),
        ('stock_performance', 0),
        ('total_assets', 1),
        ('website', 1),
    )

    # organizations that appear in df_organization but have no external row
    known_uids = df_external_data['uid'].values
    missing_uids = [org_uid for org_uid in df_organization['uid'].unique()
                    if org_uid not in known_uids]
    n_missing = len(missing_uids)

    placeholder_columns = {
        'uid': missing_uids,
        'oname': [
            df_organization.loc[df_organization['uid'] == org_uid, 'website'].values[0]
            for org_uid in missing_uids
        ],
    }
    for metric, default in placeholder_defaults:
        placeholder_columns[metric] = [default] * n_missing

    df_default_data = pd.DataFrame(placeholder_columns).fillna(0)

    # stack the real rows and the placeholder rows
    return pd.concat([df_external_data, df_default_data],
                     axis=0, sort=False, ignore_index=True)

#####-----Prepare Organization's Internal Data-----#####                                        
def internal_data(df_external_data, df_organization, df_users,df_users_rank):
    """Build per-organization internal metrics.

    For every ``uid`` in ``df_external_data`` the organization name
    (``oname``) is looked up and used as a literal substring to find users
    (by ``email``) and organization records (by ``website``).

    Parameters
    ----------
    df_external_data : pandas.DataFrame
        Provides ``uid``, ``oname`` and ``number_of_employee``; one output
        row is produced per row, in the same order.
    df_organization : pandas.DataFrame
        Provides ``service_type`` and ``website``.
    df_users : pandas.DataFrame
        Provides ``uid`` and ``email``.
    df_users_rank : pandas.DataFrame
        Provides ``uid`` and ``score``.

    Returns
    -------
    pandas.DataFrame
        One row per organization with member counts, score sum/mean, upload
        and follower counts, placeholder metrics (set to 1) and the derived
        means and proportion.
    """
    uid_list = []
    oname_list = []
    num_individ_from_company_list = []
    sum_individ_impact_score_list = []
    mean_individ_impact_score_list = []
    number_of_upload_list = []
    sum_of_the_number_of_followers_list = []

    # loop-invariant pieces, computed once instead of once per organization
    emails = df_users['email'].astype(str)
    websites = df_organization['website'].astype(str)
    is_upload = df_organization['service_type'].isin(['job_posting', 'marketing_posting'])
    is_follower = df_organization['service_type'].isin(['company_member'])

    for ids in df_external_data['uid']:
        name = df_external_data.loc[df_external_data['uid']==ids, 'oname'].values[0]
        # match employee emails against the organization's name; regex=False
        # so that characters such as '.' in the name are matched literally
        uids = df_users.loc[emails.str.contains(name, regex=False), 'uid']
        # individual impact scores of the matched employees
        member_info = df_users_rank.loc[df_users_rank['uid'].isin(uids), :]
        member_count = member_info['uid'].nunique()
        score_sum = member_info['score'].sum()
        website_matches = websites.str.contains(name, regex=False)

        uid_list.append(ids)
        oname_list.append(name)
        num_individ_from_company_list.append(member_count)
        sum_individ_impact_score_list.append(round(score_sum))
        # an organization without matched members has a mean of 0 rather
        # than the result of a division by zero
        mean_individ_impact_score_list.append(
            round(score_sum / member_count, 2) if member_count else 0.0)
        number_of_upload_list.append(int((is_upload & website_matches).sum()))
        sum_of_the_number_of_followers_list.append(int((is_follower & website_matches).sum()))

    # cast lists of metrics to the internal_data dataframe
    df_internal_data = pd.DataFrame({'uid': uid_list,'oname': oname_list,
                                  'number_of_individual_from_the_company': num_individ_from_company_list,
                                  'sum_of_the_individuals_impact_scores': sum_individ_impact_score_list,
                                  'mean_of_the_individuals_impact_scores': mean_individ_impact_score_list,
                                  'number_of_upload': number_of_upload_list,
                                  'sum_of_the_number_of_followers': sum_of_the_number_of_followers_list}).fillna(0)

    # default value for metrics whose data is not available yet
    columns_list = ['number_of_self_organization_comment', 'number_of_self_organization_follower',
           'number_of_self_organization_rating', 'sum_of_the_number_of_comment',
           'sum_of_the_number_of_competition_completed', 'sum_of_the_number_of_views',
           'sum_of_the_rating_score']
    for i in columns_list:
        if i not in df_internal_data:
            df_internal_data[i] = 1

    # means use number_of_upload + 1 as the denominator
    uploads_plus_one = df_internal_data['number_of_upload'] + 1
    df_internal_data['mean_of_the_number_of_views'] = (df_internal_data['sum_of_the_number_of_views'] / uploads_plus_one).round(2)
    df_internal_data['mean_of_the_rating_score'] = (df_internal_data['sum_of_the_rating_score'] / uploads_plus_one).round(2)
    df_internal_data['mean_of_the_number_of_comment'] = (df_internal_data['sum_of_the_number_of_comment'] / uploads_plus_one).round(2)
    # rows were produced in df_external_data order, so divide positionally;
    # dividing Series-by-Series would align on index labels and yield NaN
    # whenever df_external_data does not carry a 0..n-1 index
    df_internal_data['proportion_of_the_company_employees_involved'] = (
        df_internal_data['number_of_individual_from_the_company']
        / df_external_data['number_of_employee'].values
    ).round(3)

    # output final internal_data
    return df_internal_data

#####-----Merge Organization's Internal And External Data-----#####
def merge_internal_external(df_internal_data,df_external_data):
    """Join external and internal organization data on ``uid``.

    The ``oname`` column of ``df_internal_data`` is dropped before an inner
    merge with ``df_external_data`` (external columns first). Column labels
    of the result are lower-cased and normalized: space, ``-``, ``/`` and
    backslash become ``_``; ``$`` and ``%`` are removed; ``#`` becomes
    ``number``. Finally ``international_student_employee_rate`` is cast to
    float.

    Returns
    -------
    pandas.DataFrame
        The merged, normalized frame.
    """
    # single-pass equivalent of the label clean-up rules described above
    label_cleanup = str.maketrans({
        ' ': '_',
        '-': '_',
        '/': '_',
        '\\': '_',
        '$': '',
        '%': '',
        '#': 'number',
    })

    internal_without_name = df_internal_data.drop(columns=['oname'])
    df_organ_all = df_external_data.merge(internal_without_name, on='uid', how='inner')

    df_organ_all.columns = [label.lower().translate(label_cleanup)
                            for label in df_organ_all.columns]

    rate_column = 'international_student_employee_rate'
    df_organ_all[rate_column] = df_organ_all[rate_column].astype(float)

    return df_organ_all

#####-----Calculate Organizational Impact Score-----#####
def score_calculation(df_organ_all,scoring_cols,weights,output_file_path=None):
    """Compute per-organization dimension scores, total score and rank.

    Step 1 turns each metric listed in ``scoring_cols`` into a
    ``score_<metric>`` column according to its ``function``:

    * ``Log``     -> ``200 / (1 + exp(-k * x)) - 100``
    * ``Linear``  -> ``100 * x / upper_bound``, capped at 100
    * ``Boolean`` -> 0 when ``x == 0``, otherwise 100

    Rows with any other ``function`` value are ignored.

    Step 2 reads ``weights`` positionally (column 0: metric score name,
    column 1: dimension, column 2: dimension weight, column 3: metric
    weight), sums the weighted metric scores into one column per dimension
    and the weighted dimensions into ``total_score``. Metrics named in
    ``weights`` for which no score column exists are skipped.

    Parameters
    ----------
    df_organ_all : pandas.DataFrame
        Provides ``uid`` and every column named in ``scoring_cols['metrics']``.
    scoring_cols : pandas.DataFrame
        Provides ``metrics``, ``function``, ``a``, ``c``, ``k`` and
        ``upper_bound``. Not modified.
    weights : pandas.DataFrame
        Provides ``category_weight`` and ``metric_weight`` and the positional
        layout described above. Not modified.
    output_file_path : str, optional
        Where to write the result as CSV (without index). When omitted, a
        module-level ``output_file_path`` is used if one is defined;
        otherwise nothing is written.

    Returns
    -------
    pandas.DataFrame
        ``uid``, the seven dimension columns, ``total_score`` and ``rank``,
        sorted by ``total_score`` descending.

    Raises
    ------
    KeyError
        If ``weights`` does not define all seven expected dimensions.
    """
    # work on copies so the caller's configuration tables keep their dtypes
    scoring_cols = scoring_cols.copy()
    weights = weights.copy()
    for numeric_col in ('a', 'c', 'k', 'upper_bound'):
        scoring_cols[numeric_col] = scoring_cols[numeric_col].astype('float')
    for numeric_col in ('category_weight', 'metric_weight'):
        weights[numeric_col] = weights[numeric_col].astype('float')

    ############ Calculation part 1 (uses the scoring_cols table) ############
    pieces = [df_organ_all['uid']]
    for _, row in scoring_cols.iterrows():
        column = row['metrics']
        calc_type = row['function']
        upper = row['upper_bound']
        k = row['k']
        output = 'score_' + str(column)
        isc_col = df_organ_all[column]

        if calc_type == 'Log':
            z = (-k * isc_col).astype(float)
            metric_score = 200 / (1 + np.exp(z)) - 100
        elif calc_type == 'Linear':
            metric_score = isc_col.apply(
                lambda x, upper=upper: (100*x/upper) if (100*x/upper) < 100 else 100)
        elif calc_type == 'Boolean':
            metric_score = isc_col.apply(lambda x: 0 if x == 0 else 100)
        else:
            # unknown function: contributes neither a score nor a raw column
            continue

        # the score column is followed by the raw metric column
        pieces.append(metric_score.rename(output))
        pieces.append(isc_col)

    user_score = pd.concat(pieces, axis=1, sort=False)

    ############ Calculation part 2 (uses the weights table) ############
    score_cols = ['uid'] + [col for col in user_score.columns if 'score_' in col]
    df_user = user_score[score_cols].fillna(0)
    available_metrics = set(score_cols)

    dimension_to_metric_dict = dict()
    dimension_weight_dict = dict()
    metric_weight_dict = dict()
    for row in weights.values:
        metric, dimension = row[0], row[1]
        dimension_weight_dict[dimension] = float(row[2])
        metric_weight_dict[metric] = float(row[3])
        dimension_to_metric_dict.setdefault(dimension, []).append(metric)

    for dimension in dimension_to_metric_dict:
        df_user[dimension] = np.zeros(len(df_user))

    for dimension, metrics in dimension_to_metric_dict.items():
        for metric in metrics:
            # only metrics that actually have a column in df_user can be
            # added; checking user_score instead would let raw metric
            # columns through and fail on the lookup below
            if metric in available_metrics:
                df_user[dimension] += df_user[metric] * metric_weight_dict[metric]

    df_user["total_score"] = np.zeros(len(df_user))
    for dimension in dimension_to_metric_dict:
        df_user["total_score"] += df_user[dimension] * dimension_weight_dict[dimension]

    ##### main calculation is done, now compute the rank #####
    # normalize column names (note: spaces are removed, not replaced)
    df_user.columns = [x.lower().replace(" ", "").replace("-","_").replace(r"/","_").replace("\\","_") \
              .replace("$","").replace("%","").replace("#","number") for x in df_user.columns]

    # rank results by total score; ties share the lowest rank
    df_user = df_user.sort_values(by=["total_score"], ascending=False)
    df_user['rank'] = df_user['total_score'].rank(method='min',ascending=False)
    final_columns = ['uid','projects_products_impact','perceived_capacity','perceived_participation',
                     'social_and_environmental_impact','employee_individual_impact_score',
                     'organization_impact_score','organization_and_employee_interaction',
                     'total_score','rank']
    df_user = df_user[final_columns]

    # write the scores to file; fall back to the module-level path set by
    # the script entry point so running the file as a script still writes
    if output_file_path is None:
        output_file_path = globals().get('output_file_path')
    if output_file_path is not None:
        df_user.to_csv(output_file_path, index=False)

    return df_user


# Script entry point: runs the whole pipeline from the config file to the
# scored, ranked organizations.
if __name__ == "__main__":
    # config file with one "name:path" line per input CSV
    config_file_path = "input_files.conf"
    # NOTE: this is a module-level name; score_calculation uses it as the
    # destination of the CSV it writes
    output_file_path = "organ_impact_scores.csv"
    # read all input CSVs into a dictionary of dataframes keyed by name
    all_dfs = read_all_inputs(config_file_path)
    # external metrics, with default rows for organizations that have none
    df_external_data = external_data(all_dfs['df_organization'], all_dfs['df_external_data'])
    # internal metrics derived from users, their scores and organization records
    df_internal_data = internal_data(df_external_data, all_dfs['df_organization'], all_dfs['df_users'], all_dfs['df_users_rank'])
    # one row per organization with both external and internal metrics
    df_organ_all = merge_internal_external(df_internal_data, df_external_data)
    # dimension scores, total score and rank
    df_organ_impact_score = score_calculation(df_organ_all, all_dfs['scoring_cols'], all_dfs['weights'])

# Bare expression, presumably left from a notebook cell to display the
# result; the name only exists after the block above has run.
df_organ_impact_score

Unnamed: 0,uid,projects_products_impact,perceived_capacity,perceived_participation,social_and_environmental_impact,employee_individual_impact_score,organization_impact_score,organization_and_employee_interaction,total_score,rank
2,1287,151.758824,271.084412,161.764706,131.176471,0.0,26.828457,13.856554,208.30294,1.0
0,1286,143.11621,243.77202,125.103961,146.898824,0.0,26.828457,13.856554,191.728463,2.0
1,1285,112.274179,249.464272,110.649377,131.176471,0.0,26.828457,13.856554,177.439778,3.0
3,1067,17.774194,105.788759,40.588412,131.176471,0.0,28.181834,13.856554,88.99177,4.0
5,1091,17.774194,105.788759,40.588412,131.176471,0.0,28.181834,13.856554,88.99177,4.0
6,1193,17.774194,105.788759,40.588412,131.176471,0.0,28.181834,13.856554,88.99177,4.0
4,1079,17.774194,105.788759,40.588412,131.176471,0.0,20.150394,13.856554,85.760297,7.0
7,1288,17.774194,105.788759,40.588412,131.176471,0.0,20.150394,13.856554,85.760297,7.0
