In [130]:
import pymysql
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from dbeaver_connect import create_df_pymysql

In [131]:
# Define connection details in a format that is expected by dbeaver_connect functions:

connection_details = {"host": 'eufmd-database-1.cqodkl4vazie.eu-north-1.rds.amazonaws.com',
                      "user": 'root',
                      "password": 'YOUR_PASSWORD',
                      "db": 'db_tompilot',
                      "port": 3306
}

In [132]:
# Define the SQL query that is expected by the dbeaver_connect functions:

df_query_dict = {"competencies": "SELECT * FROM db_tompilot.competencies",
                 "countries": "SELECT * FROM db_tompilot.countries",
                 "course_categories": "SELECT * FROM db_tompilot.course_categories",
                 "courses": "SELECT * FROM db_tompilot.courses",
                 "position_competency_by_country": "SELECT * FROM db_tompilot.position_competency_by_country",
                 "positions": "SELECT * FROM db_tompilot.positions",
                 "regions_geolocation": "SELECT * FROM db_tompilot.regions_geolocation",
                 "roles": "SELECT * FROM db_tompilot.roles",
                 "self_assessment": "SELECT * FROM db_tompilot.self_assessment",
                 "tom_enrolments": "SELECT * FROM db_tompilot.tom_enrolments",
                 "trainings": "SELECT * FROM db_tompilot.trainings",
                 "user_course_enrolled": "SELECT * FROM db_tompilot.user_course_enrolled",
                 "user_course_enrolled_moodle": "SELECT * FROM db_tompilot.user_course_enrolled_moodle",
                 "users": "SELECT * FROM db_tompilot.users",  
}

In [133]:
# Convert SQL queries to dataframes:

for item in df_query_dict:
    query = df_query_dict[item]
    df_query_dict[item] = create_df_pymysql(connection_details, query)

  df = pd.read_sql(query, conn)


In [134]:
for item in df_query_dict:
    print(item, df_query_dict[item].shape)

competencies (13, 2)
countries (12, 8)
course_categories (34, 3)
courses (428, 8)
position_competency_by_country (179, 5)
positions (23, 4)
regions_geolocation (129, 7)
roles (3, 3)
self_assessment (109, 4)
tom_enrolments (1988, 18)
trainings (142, 12)
user_course_enrolled (700, 7)
user_course_enrolled_moodle (234, 10)
users (223, 15)


In [135]:
# Create tom_enrolments_df
tom_enrolments_df = df_query_dict["tom_enrolments"]

# Make course_category values lowercase:
tom_enrolments_df["course_category"] = tom_enrolments_df["course_category"].str.lower()

# Check output
tom_enrolments_df.head(2)

Unnamed: 0,id,full_name,email,course_fullname,course_shortname,course_category,city,country,institution,role,completed,time_enrolled,time_completed,progress,status,course_id,moodle_category,new_id
0,6379,Jon Simonyan,jsimmk19@mail.ru,Open Session Online 2014,Open Session Online 2014,nu,Erevan,Armenia,,,No,2020-07-10 00:00:00,,50.0,Open,45.0,Resources / Open Session,179
1,6392,Mariam Militosyan,militosyan_96@mail.ru,Обучающий Онлайн Курс по Изучению Ящура на рус...,FITC_RU4,fmd investigation training course,,Armenia,,,Yes,2020-09-17 00:00:00,2020-10-21 00:00:00,100.0,Completed,244.0,FMD Investigation Training Course,180


In [144]:

# First import the two data mapping files that have information about which skills (aka bricks) are covered in each course main_topic:

brick_mapping_df = pd.read_excel("data/GET Prepared Training menu mapping_May2024.xlsx", skiprows=1)
additional_mappings_df = pd.read_excel("data/Master_courses.xlsx", "main_topic vs bricks")

# Do some initial cleaning to get that data in a format that we want:

# Make "Main topic" values lowercase for both dataframes:
brick_mapping_df["Main topic"] = brick_mapping_df["Main topic"].str.lower()
additional_mappings_df["main_topic"] = additional_mappings_df["main_topic"].str.lower()

# Insead of having an 'X' indicate whether a course covers a particular skill we will change it to 1
brick_mapping_df.replace(to_replace='X', value=1, inplace=True)
brick_mapping_df.replace(to_replace='x', value=1, inplace=True)

# Insead of keeping the Nan value within the "disease covered" column we will change this to a blank string.
brick_mapping_df["Disease covered"].fillna(value='', inplace=True)
additional_mappings_df["Disease covered"].fillna(value='', inplace=True)

# For all other cells, we want Nan values to be changed to ones. This is because the Nan values represent the brick/ skill NOT being present in the course content
brick_mapping_df.fillna(value=0, inplace=True)
additional_mappings_df.fillna(value=0, inplace=True)

# Select only the columns of interest from the brick_mapping_df. Choose the columns that have a match in additional_mappings_df:

brick_mapping_df = brick_mapping_df[['Main topic', 'Disease covered',
       'Simulation exercises','Training', 'Laboratories', 
       'Contingency planning', 'Assessment',
       'Identification, Registration and traceability', 'Risk assessment',
       'Information data management', 'Models', 'Surveillance', 'Awareness',
       'Clinical Examination', 'Epidemiological Investigation', 'Sampling',
       'Farm biosecurity', 'Personal biosecurity', 'Communication', 'Disposal',
       'Humane killing of animals', 'Vaccination', 'Cleaning and disinfection',
       'Movement control', 'Restricted zones', 'Psychological support',
       'Resource and impact tools and calculators', 'Logistic',
       'National emergency anagement', 'Coordination and PPP', 'Wildlife',
       'Recovery of disease status', 'Vaccination exit strategy',
       'Re-stocking']]

# Rename the columns in the brick_mapping_df so that the column names match the column names in additional_mappings_df

original_columns = brick_mapping_df.columns.tolist()
add_columns = additional_mappings_df.columns.tolist()

rename_cols = {}
original_columns = brick_mapping_df.columns.tolist()
add_columns = additional_mappings_df.columns.tolist()

for i in range(brick_mapping_df.shape[1]):
    rename_cols[original_columns[i]] = add_columns[i]

brick_mapping_df.rename(columns=rename_cols, inplace=True)

# now we are ready to merge the two dataframes
frames = [brick_mapping_df, additional_mappings_df]
merged_df = pd.concat(frames)

# Sort by main_topic and reset index:
merged_df = merged_df.sort_values(by=['main_topic'])
merged_df.reset_index(drop=True, inplace=True)

# Correct some incorrect/ unclear data:
merged_df.at[10, "Disease covered"] = "FMD"
merged_df.at[13, "Disease covered"] = "FMD"
merged_df.at[20, "Disease covered"] = "LSD"
merged_df.at[39, "Disease covered"] = "RVF"
merged_df.at[49, "Disease covered"] = "SPGP"

# TBD how will multiple laboratoy and "simulation advanced" topics be addressed?

In [143]:
merged_df

Unnamed: 0,main_topic,Disease covered,Foundation: Simulation exercises,Foundation: Training,Foundation: Laboratories,Foundation: Contingency planning,Foundation: Assessment,"Foundation: Identification, Registration and traceability",Foundation: Risk assessment,Foundation: Information data management,Foundation: Models,Foundation: Surveillance,Alert: Awareness,Alert: Clinical Examination,Alert: Epidemiological Investigation,Alert: Sampling,Alert: Farm biosecurity,Alert: Personal biosecurity,Emergency: Communication,Emergency: Disposal,Emergency: Humane killing of animals,Emergency: Vaccination,Emergency: Cleaning and disinfection,Emergency: Movement control,Emergency: Restricted zones,Emergency: Psychological support,Emergency: Resource and impact tools and calculators,Emergency: Logistic,Emergency: National emergency anagement,Emergency: Coordination and PPP,Emergency: Wildlife,Reconstruction: Recovery of disease status,Reconstruction: Vaccination exit strategy,Reconstruction: Re-stocking
0,african animal trypanosomosis,FAST,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,african swine fever,FMD,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,animal depopulation,FAST,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,animal depopulation,FAST,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,biorisk,FMD,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,biorisk introduction,FMD,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,capripox laboratory,LSD and SPGP,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,epidemiology,Single Disease,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,eufmdis,Single Disease,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,fmd emergency preparation course,Single Disease,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [120]:
# create get_prepared df with brick mappings:
brick_mapping_df = pd.read_excel("data/GET Prepared Training menu mapping_May2024.xlsx", skiprows=1)

# Make "Main topic" values lowercase:
brick_mapping_df["Main topic"] = brick_mapping_df["Main topic"].str.lower()

# Within the brick_mapping dataframe convert NaN values to 0 and X to 1
brick_mapping_df.replace(to_replace='X', value=1, inplace=True)
brick_mapping_df.replace(to_replace='x', value=1, inplace=True)
brick_mapping_df["Disease covered"].fillna(value='', inplace=True)
brick_mapping_df.fillna(value=0, inplace=True)

# Check output
brick_mapping_df.head(2)

Unnamed: 0,Course,Main topic,Level,Acccess,Disease covered,TOM Competency,Learning programs,Simulation exercises,Training,Laboratories,Contingency planning,Assessment,"Identification, Registration and traceability",Risk assessment,Information data management,Models,Surveillance,Awareness,Clinical Examination,Epidemiological Investigation,Sampling,Farm biosecurity,Personal biosecurity,Communication,Disposal,Humane killing of animals,Vaccination,Cleaning and disinfection,Movement control,Restricted zones,Psychological support,Resource and impact tools and calculators,Logistic,National emergency anagement,Coordination and PPP,Wildlife,Recovery of disease status,Vaccination exit strategy,Re-stocking
0,Introduction to Foot-and-Mouth disease,fmd introduction,Basic,Open,FMD,Tad 1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,Introduction to Lumpy Skin disease,lumpy skin disease introduction,Basic,Open,LSD,Tad 1,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [121]:
# What is the shape of the brick mapping dataframe?
brick_mapping_df.shape

(23, 39)

In [128]:
# Create another df with the new mappings we want to include:

additional_mappings_df = pd.read_excel("data/Master_courses.xlsx", "main_topic vs bricks")

# make main_topic lowercase
additional_mappings_df["main_topic"] = additional_mappings_df["main_topic"].str.lower()

additional_mappings_df["Disease covered"].fillna(value='', inplace=True)

In [129]:
additional_mappings_df.head(20)

Unnamed: 0,main_topic,Disease covered,Foundation: Simulation exercises,Foundation: Training,Foundation: Laboratories,Foundation: Contingency planning,Foundation: Assessment,"Foundation: Identification, Registration and traceability",Foundation: Risk assessment,Foundation: Information data management,Foundation: Models,Foundation: Surveillance,Alert: Awareness,Alert: Clinical Examination,Alert: Epidemiological Investigation,Alert: Sampling,Alert: Farm biosecurity,Alert: Personal biosecurity,Emergency: Communication,Emergency: Disposal,Emergency: Humane killing of animals,Emergency: Vaccination,Emergency: Cleaning and disinfection,Emergency: Movement control,Emergency: Restricted zones,Emergency: Psychological support,Emergency: Resource and impact tools and calculators,Emergency: Logistic,Emergency: National emergency anagement,Emergency: Coordination and PPP,Emergency: Wildlife,Reconstruction: Recovery of disease status,Reconstruction: Vaccination exit strategy,Reconstruction: Re-stocking
0,african animal trypanosomosis,FAST,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
1,african swine fever,FMD,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
2,animal depopulation,FAST,,,,1.0,,,,,,,,,,,,1.0,1.0,1.0,1.0,,1.0,,,1.0,1.0,,,,,,,
3,biorisk,FMD,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
4,epidemiology,Single Disease,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
5,eufmdis,Single Disease,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
6,poultry investigation,Single Disease,1.0,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,
7,fmd emergency preparation course,Single Disease,,,1.0,1.0,,,,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,,,,,,1.0,,,,,,,,,
8,fmd investigation training course,Single Disease,,,,1.0,,,,,,1.0,1.0,1.0,1.0,1.0,1.0,1.0,,,,,,,1.0,,,,,,,,,
9,fmd practical management,FAST,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,,


In [124]:
additional_mappings_df.shape

(31, 34)

In [125]:
brick_mapping_df = brick_mapping_df[['Main topic', 'Disease covered',
       'Simulation exercises','Training', 'Laboratories', 
       'Contingency planning', 'Assessment',
       'Identification, Registration and traceability', 'Risk assessment',
       'Information data management', 'Models', 'Surveillance', 'Awareness',
       'Clinical Examination', 'Epidemiological Investigation', 'Sampling',
       'Farm biosecurity', 'Personal biosecurity', 'Communication', 'Disposal',
       'Humane killing of animals', 'Vaccination', 'Cleaning and disinfection',
       'Movement control', 'Restricted zones', 'Psychological support',
       'Resource and impact tools and calculators', 'Logistic',
       'National emergency anagement', 'Coordination and PPP', 'Wildlife',
       'Recovery of disease status', 'Vaccination exit strategy',
       'Re-stocking']]

In [126]:
brick_mapping_df.head()

Unnamed: 0,Main topic,Disease covered,Simulation exercises,Training,Laboratories,Contingency planning,Assessment,"Identification, Registration and traceability",Risk assessment,Information data management,Models,Surveillance,Awareness,Clinical Examination,Epidemiological Investigation,Sampling,Farm biosecurity,Personal biosecurity,Communication,Disposal,Humane killing of animals,Vaccination,Cleaning and disinfection,Movement control,Restricted zones,Psychological support,Resource and impact tools and calculators,Logistic,National emergency anagement,Coordination and PPP,Wildlife,Recovery of disease status,Vaccination exit strategy,Re-stocking
0,fmd introduction,FMD,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,lumpy skin disease introduction,LSD,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,biorisk introduction,FMD,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,passive surveillance introduction,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,simulation introduction,FAST,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [60]:
original_columns = brick_mapping_df.columns.tolist()
add_columns = additional_mappings_df.columns.tolist()

In [65]:
rename_cols = {}
original_columns = brick_mapping_df.columns.tolist()
add_columns = additional_mappings_df.columns.tolist()

for i in range(brick_mapping_df.shape[1]):
    rename_cols[original_columns[i]] = add_columns[i]

In [66]:
# Confirm the additional matching columns are consistent with the original get prepared columns. 
rename_cols

{'Main topic': 'main_topic',
 'Disease covered': 'Disease covered',
 'Simulation exercises': 'Foundation: Simulation exercises',
 'Training': 'Foundation: Training',
 'Laboratories': 'Foundation: Laboratories',
 'Contingency planning': 'Foundation: Contingency planning',
 'Assessment': 'Foundation: Assessment',
 'Identification, Registration and traceability': 'Foundation: Identification, Registration and traceability',
 'Risk assessment': 'Foundation: Risk assessment',
 'Information data management': 'Foundation: Information data management',
 'Models': 'Foundation: Models',
 'Surveillance': 'Foundation: Surveillance',
 'Awareness': 'Alert: Awareness',
 'Clinical Examination': 'Alert: Clinical Examination',
 'Epidemiological Investigation': 'Alert: Epidemiological Investigation',
 'Sampling': 'Alert: Sampling',
 'Farm biosecurity': 'Alert: Farm biosecurity',
 'Personal biosecurity': 'Alert: Personal biosecurity',
 'Communication': 'Emergency: Communication',
 'Disposal': 'Emergency: D

In [69]:
brick_mapping_df.rename(columns=rename_cols, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  brick_mapping_df.rename(columns=rename_cols, inplace=True)


In [71]:
brick_mapping_df.head()

Unnamed: 0,main_topic,Disease covered,Foundation: Simulation exercises,Foundation: Training,Foundation: Laboratories,Foundation: Contingency planning,Foundation: Assessment,"Foundation: Identification, Registration and traceability",Foundation: Risk assessment,Foundation: Information data management,...,Emergency: Restricted zones,Emergency: Psychological support,Emergency: Resource and impact tools and calculators,Emergency: Logistic,Emergency: National emergency anagement,Emergency: Coordination and PPP,Emergency: Wildlife,Reconstruction: Recovery of disease status,Reconstruction: Vaccination exit strategy,Reconstruction: Re-stocking
0,fmd introduction,FMD,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,lumpy skin disease introduction,LSD,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,biorisk introduction,FMD,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,passive surveillance introduction,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,simulation introduction,FAST,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [None]:
# now we are ready to merge the two dataframes and then check for duplicates!

In [103]:
frames = [brick_mapping_df, additional_mappings_df]
merged_df = pd.concat(frames)

In [105]:
merged_df = merged_df.sort_values(by=['main_topic'])

In [106]:
pd.set_option('display.max_columns', None)
merged_df[merged_df["main_topic"] == "laboratory"]

Unnamed: 0,main_topic,Disease covered,Foundation: Simulation exercises,Foundation: Training,Foundation: Laboratories,Foundation: Contingency planning,Foundation: Assessment,"Foundation: Identification, Registration and traceability",Foundation: Risk assessment,Foundation: Information data management,Foundation: Models,Foundation: Surveillance,Alert: Awareness,Alert: Clinical Examination,Alert: Epidemiological Investigation,Alert: Sampling,Alert: Farm biosecurity,Alert: Personal biosecurity,Emergency: Communication,Emergency: Disposal,Emergency: Humane killing of animals,Emergency: Vaccination,Emergency: Cleaning and disinfection,Emergency: Movement control,Emergency: Restricted zones,Emergency: Psychological support,Emergency: Resource and impact tools and calculators,Emergency: Logistic,Emergency: National emergency anagement,Emergency: Coordination and PPP,Emergency: Wildlife,Reconstruction: Recovery of disease status,Reconstruction: Vaccination exit strategy,Reconstruction: Re-stocking
22,laboratory,RVF,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
21,laboratory,LSD/Sheep pox and goat pox,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
20,laboratory,FMD,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


there are three rows that have a "main_topic" called "laboratories" it turns out that the brick mapping is the same for all three, the only brick skill covered in courses of this main topic are "Foundation: Laboratories"

The "disease covered" for each course needs to be cleaned.

In [111]:
merged_df.reset_index(drop=True, inplace=True)

merged_df.at[33, "Disease covered"] = "FMD"
merged_df.at[35, "Disease covered"] = "FMD"
merged_df.at[39, "Disease covered"] = "LSD"
merged_df.at[47, "Disease covered"] = "RVF"
merged_df.at[52, "Disease covered"] = "SPGP"

In [112]:
merged_df

Unnamed: 0,main_topic,Disease covered,Foundation: Simulation exercises,Foundation: Training,Foundation: Laboratories,Foundation: Contingency planning,Foundation: Assessment,"Foundation: Identification, Registration and traceability",Foundation: Risk assessment,Foundation: Information data management,Foundation: Models,Foundation: Surveillance,Alert: Awareness,Alert: Clinical Examination,Alert: Epidemiological Investigation,Alert: Sampling,Alert: Farm biosecurity,Alert: Personal biosecurity,Emergency: Communication,Emergency: Disposal,Emergency: Humane killing of animals,Emergency: Vaccination,Emergency: Cleaning and disinfection,Emergency: Movement control,Emergency: Restricted zones,Emergency: Psychological support,Emergency: Resource and impact tools and calculators,Emergency: Logistic,Emergency: National emergency anagement,Emergency: Coordination and PPP,Emergency: Wildlife,Reconstruction: Recovery of disease status,Reconstruction: Vaccination exit strategy,Reconstruction: Re-stocking
0,African Animal Trypanosomosis,FAST,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,African Swine Fever,FMD,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,Animal Depopulation,FAST,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,Biorisk,FMD,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,Capripox Laboratory,LSD and SPGP,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,Epidemiology,Single Disease,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,EuFMDiS,Single Disease,,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,FMD Emergency Preparation Course,Single Disease,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,FMD Investigation Training Course,Single Disease,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,FMD Laboratory,FMD,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [76]:
# Need to address "simulation advanced" and "laboratory"
merged_df["main_topic"].value_counts()

main_topic
laboratory                                 3
simulation advanced                        3
fmd introduction                           1
Psychological aspects and communication    1
FMD Practical Management                   1
FMD Laboratory                             1
Capripox Laboratory                        1
RVF Laboratory                             1
Lumpy Skin Disease                         1
Modelling                                  1
Passive surveillance                       1
Post Vaccination Monitoring                1
Progressive Control Pathway                1
Public-Private Partnerships                1
FMD Emergency Preparation Course           1
Rift Valley Fever                          1
Risk Analysis along the Value Chain        1
Risk Based Surveillance                    1
Risk Mapping                               1
Safe Trade                                 1
Simulation                                 1
Socio-Economic Impact Assessment           1

In [8]:
# Join the tom_enrolments df and the brick mapping df on "course_category" and "Main topic"

merged_df = tom_enrolments_df.merge(brick_mapping_df, left_on='course_category', right_on="Main topic", how='left')

In [9]:
# Check df shapes to see if merge appears to have worked correctly
print(tom_enrolments_df.shape)
print(brick_mapping_df.shape)
print(merged_df.shape)

(1988, 18)
(23, 39)
(1988, 57)


In [10]:
merged_df.head(3)

Unnamed: 0,id,full_name,email,course_fullname,course_shortname,course_category,city,country,institution,role,...,Restricted zones,Psychological support,Resource and impact tools and calculators,Logistic,National emergency anagement,Coordination and PPP,Wildlife,Recovery of disease status,Vaccination exit strategy,Re-stocking
0,6379,Jon Simonyan,jsimmk19@mail.ru,Open Session Online 2014,Open Session Online 2014,nu,Erevan,Armenia,,,...,,,,,,,,,,
1,6392,Mariam Militosyan,militosyan_96@mail.ru,Обучающий Онлайн Курс по Изучению Ящура на рус...,FITC_RU4,fmd investigation training course,,Armenia,,,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,6393,Mariam Militosyan,militosyan_96@mail.ru,Knowledge Bank,Knowledge Bank,nu,,Armenia,,,...,,,,,,,,,,


In [11]:
merged_df.columns

Index(['id', 'full_name', 'email', 'course_fullname', 'course_shortname',
       'course_category', 'city', 'country', 'institution', 'role',
       'completed', 'time_enrolled', 'time_completed', 'progress', 'status',
       'course_id', 'moodle_category', 'new_id', 'Course', 'Main topic',
       'Level', 'Acccess', 'Disease covered', 'TOM Competency',
       'Learning programs', 'Simulation exercises', 'Training', 'Laboratories',
       'Contingency planning', 'Assessment',
       'Identification, Registration and traceability', 'Risk assessment',
       'Information data management', 'Models', 'Surveillance', 'Awareness',
       'Clinical Examination', 'Epidemiological Investigation', 'Sampling',
       'Farm biosecurity', 'Personal biosecurity', 'Communication', 'Disposal',
       'Humane killing of animals', 'Vaccination', 'Cleaning and disinfection',
       'Movement control', 'Restricted zones', 'Psychological support',
       'Resource and impact tools and calculators', 'Logist

In [12]:
# 1232 tom_enrolment courses did not successfully merge with a Main topic from the brick mapping spreadsheet. 
# We will clean this up a bit in the future. 

# For now let us work with what we have and fill the brick "skill" values with zero if they are NaN
brick_columns=['Learning programs', 'Simulation exercises', 'Training', 'Laboratories',
       'Contingency planning', 'Assessment',
       'Identification, Registration and traceability', 'Risk assessment',
       'Information data management', 'Models', 'Surveillance', 'Awareness',
       'Clinical Examination', 'Epidemiological Investigation', 'Sampling',
       'Farm biosecurity', 'Personal biosecurity', 'Communication', 'Disposal',
       'Humane killing of animals', 'Vaccination', 'Cleaning and disinfection',
       'Movement control', 'Restricted zones', 'Psychological support',
       'Resource and impact tools and calculators', 'Logistic',
       'National emergency anagement', 'Coordination and PPP', 'Wildlife',
       'Recovery of disease status', 'Vaccination exit strategy',
       'Re-stocking']

merged_df[brick_columns] = merged_df[brick_columns].fillna(0)

In [13]:
test = merged_df.groupby(["country", "course_category"])["id"].count()
test

country  course_category                             
Armenia  fmd emergency preparation course                 16
         fmd introduction                                  8
         fmd investigation training course               147
         fmd laboratory investigation training course     16
         lumpy skin disease                               36
                                                        ... 
Turkey   rift valley fever introduction                   16
         risk analysis along the value chain               4
         safe trade                                        4
         simulation exercises                              4
         socio-economic impact assessment                  4
Name: id, Length: 68, dtype: int64

In [14]:
# Filter the data so that we are only looking at completed courses:
merged_df = merged_df[merged_df["progress"]==100]

# Group the completed courses by country and course_category:
merged_df_course_category_by_country = merged_df.groupby(["country", "course_category"])[brick_columns].sum()

merged_df_course_category_by_country.head(10)

Unnamed: 0_level_0,Unnamed: 1_level_0,Learning programs,Simulation exercises,Training,Laboratories,Contingency planning,Assessment,"Identification, Registration and traceability",Risk assessment,Information data management,Models,...,Restricted zones,Psychological support,Resource and impact tools and calculators,Logistic,National emergency anagement,Coordination and PPP,Wildlife,Recovery of disease status,Vaccination exit strategy,Re-stocking
country,course_category,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1
Armenia,fmd emergency preparation course,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Armenia,fmd investigation training course,0.0,0.0,0.0,0.0,107.0,0.0,0.0,0.0,0.0,0.0,...,107.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Armenia,fmd laboratory investigation training course,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Armenia,lumpy skin disease,0.0,0.0,0.0,0.0,28.0,0.0,0.0,0.0,0.0,0.0,...,28.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Armenia,post vaccination monitoring,0.0,0.0,0.0,0.0,8.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,8.0,0.0
Armenia,risk analysis along the value chain,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Armenia,safe trade,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Spain,african swine fever,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Spain,animal depopulation,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Spain,fmd emergency preparation course,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,0.0,...,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [15]:
# Group the completed courses by country and brick:
merged_df_bricks_by_country = merged_df.groupby(["country", "Contingency planning"])["id"].count()

merged_df_bricks_by_country.head(10)

country      Contingency planning
Armenia      0.0                      12
             1.0                     147
Spain        0.0                      11
             1.0                       7
Switzerland  0.0                      63
             1.0                     152
Tunisia      0.0                      38
             1.0                      94
Turkey       0.0                      32
             1.0                      16
Name: id, dtype: int64

In [16]:
# Group the completed courses by country and brick:
test = merged_df.groupby(["country"])[brick_columns].sum()

test.head(10)

Unnamed: 0_level_0,Learning programs,Simulation exercises,Training,Laboratories,Contingency planning,Assessment,"Identification, Registration and traceability",Risk assessment,Information data management,Models,...,Restricted zones,Psychological support,Resource and impact tools and calculators,Logistic,National emergency anagement,Coordination and PPP,Wildlife,Recovery of disease status,Vaccination exit strategy,Re-stocking
country,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
Armenia,0.0,0.0,0.0,0.0,147.0,0.0,0.0,0.0,0.0,0.0,...,139.0,0.0,0.0,0.0,0.0,0.0,0.0,8.0,8.0,0.0
Spain,0.0,0.0,0.0,0.0,7.0,0.0,0.0,0.0,0.0,0.0,...,6.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0
Switzerland,0.0,0.0,0.0,0.0,152.0,0.0,0.0,0.0,0.0,0.0,...,144.0,4.0,4.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0
Tunisia,0.0,0.0,0.0,0.0,94.0,0.0,0.0,0.0,0.0,0.0,...,81.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Turkey,0.0,0.0,0.0,0.0,16.0,0.0,0.0,0.0,0.0,0.0,...,12.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
