In [1]:
import os

import matplotlib.pyplot as plt
import pandas as pd
import pymysql
import seaborn as sns

from dbeaver_connect import create_df_pymysql

In [2]:
# Define connection details in a format that is expected by dbeaver_connect functions.
# Host, user, password and port can be overridden through environment variables so that
# real credentials never have to be written into (or committed with) the notebook; the
# fallbacks are the original placeholder values.

connection_details = {"host": os.environ.get("EUFMD_DB_HOST", 'eufmd-database-1.cqodkl4vazie.eu-north-1.rds.amazonaws.com'),
                      "user": os.environ.get("EUFMD_DB_USER", 'root'),
                      "password": os.environ.get("EUFMD_DB_PASSWORD", 'YOUR_PASSWORD'),
                      "db": 'db_tompilot',  # the queries in this notebook hard-code this schema name
                      "port": int(os.environ.get("EUFMD_DB_PORT", 3306))  # env values are strings
}

In [3]:
# Build one "SELECT *" query per table, keyed by table name, in the format
# expected by the dbeaver_connect functions.

TABLE_NAMES = (
    "competencies",
    "countries",
    "course_categories",
    "courses",
    "position_competency_by_country",
    "positions",
    "regions_geolocation",
    "roles",
    "self_assessment",
    "tom_enrolments",
    "trainings",
    "user_course_enrolled",
    "user_course_enrolled_moodle",
    "users",
)

df_query_dict = {
    table_name: f"SELECT * FROM db_tompilot.{table_name}"
    for table_name in TABLE_NAMES
}

In [4]:
# Convert SQL queries to dataframes.
# Each query string in df_query_dict is replaced in place by the dataframe it returns.
# Only entries that are still strings are fetched, so re-running this cell does not
# pass an already-loaded dataframe to create_df_pymysql as if it were a query.

for table_name, query in df_query_dict.items():
    if isinstance(query, str):
        df_query_dict[table_name] = create_df_pymysql(connection_details, query)

  df = pd.read_sql(query, conn)


In [5]:
# Report the (rows, columns) shape of every loaded table.
for table_name, table_df in df_query_dict.items():
    print(table_name, table_df.shape)

competencies (13, 2)
countries (12, 8)
course_categories (34, 3)
courses (428, 8)
position_competency_by_country (179, 5)
positions (23, 4)
regions_geolocation (129, 7)
roles (3, 3)
self_assessment (107, 4)
tom_enrolments (1988, 18)
trainings (142, 12)
user_course_enrolled (700, 7)
user_course_enrolled_moodle (234, 10)
users (223, 15)


In [6]:
# What data do we have in tom_enrolments?
tom_enrolments_df = df_query_dict["tom_enrolments"]
# Summarise the columns: names, non-null counts and dtypes.
tom_enrolments_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1988 entries, 0 to 1987
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   id                1988 non-null   int64  
 1   full_name         1988 non-null   object 
 2   email             1988 non-null   object 
 3   course_fullname   1988 non-null   object 
 4   course_shortname  1988 non-null   object 
 5   course_category   1988 non-null   object 
 6   city              1988 non-null   object 
 7   country           1988 non-null   object 
 8   institution       1988 non-null   object 
 9   role              1988 non-null   object 
 10  completed         1988 non-null   object 
 11  time_enrolled     1988 non-null   object 
 12  time_completed    1904 non-null   object 
 13  progress          1988 non-null   float64
 14  status            1988 non-null   object 
 15  course_id         1919 non-null   float64
 16  moodle_category   1988 non-null   object 


## Initial questions about tom_enrolments
#### What data is missing or incomplete?
    - 69 values of course_id are NaN
    - 1010 "roles" are missing. These show up in the database as blank strings ('').
    - City, institution, and moodle_category also have many instances of blank strings.
#### What data do we want from the tom_enrolment table?
    - id
    - full name
    - course_fullname
    - course_shortname
    - course_category
    - course_id
    - country
    - role
    - progress (if 100 then we know it is complete)
    - time_enrolled
    - time_completed
#### What is the distribution of complete vs. incomplete courses?
    - 1365 are 50% complete
    - 572 are fully complete
    - 35 have not been started (0% progress)
    - 9 are somewhere between not started and 50% complete
    - 7 are somewhere between 50% complete and 100% complete
#### What do enrolments over time look like? Are enrolments stable, increasing, decreasing, or showing other trends?
#### Where are the learners from? (country)
#### How long does it take for learners to complete courses?
#### What are the most popular courses?

## Initial questions about the competencies mapping (brick skills present in each course)
#### Why are there only 19 courses in the mapping sheet?

### What data is missing or incomplete?
At a glance, it looks like there are not many missing values in the dataframe. However, many missing values are stored as empty strings rather than NaN, so I want to count how many instances of "" can be found in the tom_enrolments dataframe.

In [7]:
# The isna() method shows that there are few NaN values in the dataframe: only the
# time_completed and course_id columns contain any.
# (A NaN value in the time_completed column is taken to mean the course has not been completed.)
# NOTE(review): only 84 rows have a NaN time_completed, yet only 572 rows have progress == 100 -
# confirm this interpretation.

tom_enrolments_df.isna().sum()

id                   0
full_name            0
email                0
course_fullname      0
course_shortname     0
course_category      0
city                 0
country              0
institution          0
role                 0
completed            0
time_enrolled        0
time_completed      84
progress             0
status               0
course_id           69
moodle_category      0
new_id               0
dtype: int64

In [8]:
# Preview the first rows of the enrolments table.
# NOTE(review): the rendered preview includes full names and e-mail addresses; clear or
# redact this output before sharing the notebook.
tom_enrolments_df.head(20)

Unnamed: 0,id,full_name,email,course_fullname,course_shortname,course_category,city,country,institution,role,completed,time_enrolled,time_completed,progress,status,course_id,moodle_category,new_id
0,6379,Jon Simonyan,jsimmk19@mail.ru,Open Session Online 2014,Open Session Online 2014,NU,Erevan,Armenia,,,No,2020-07-10 00:00:00,,50.0,Open,45.0,Resources / Open Session,179
1,6392,Mariam Militosyan,militosyan_96@mail.ru,Обучающий Онлайн Курс по Изучению Ящура на рус...,FITC_RU4,FMD Investigation Training Course,,Armenia,,,Yes,2020-09-17 00:00:00,2020-10-21 00:00:00,100.0,Completed,244.0,FMD Investigation Training Course,180
2,6393,Mariam Militosyan,militosyan_96@mail.ru,Knowledge Bank,Knowledge Bank,NU,,Armenia,,,No,2020-09-28 00:00:00,,50.0,Open,3.0,Resources,181
3,6394,Irma Artushyan,irma.artushyan95@mail.ru,Обучающий Онлайн Курс по Изучению Ящура на рус...,FITC_RU4,FMD Investigation Training Course,,Armenia,,,Yes,2020-09-17 00:00:00,2020-10-21 00:00:00,100.0,Completed,244.0,FMD Investigation Training Course,183
4,6395,Irma Artushyan,irma.artushyan95@mail.ru,Open Session Online 2014,Open Session Online 2014,NU,,Armenia,,,No,2020-05-10 00:00:00,,50.0,Open,45.0,Resources / Open Session,185
5,6441,Ilhan Genc,ilhan.genc@tarimorman.gov.tr,Altıncı Türkçe Çevrimiçi Şap Hastalığı Araştır...,FITC_TR6,FMD Investigation Training Course,ERZURUM,Armenia,,Central level government veterinarian,Yes,2022-11-14 00:00:00,2022-12-13 00:00:00,100.0,Completed,375.0,FMD Investigation Training Course,186
6,6442,Ilhan Genc,ilhan.genc@tarimorman.gov.tr,Knowledge Bank,Knowledge Bank,NU,ERZURUM,Armenia,,Central level government veterinarian,No,2022-11-25 00:00:00,,50.0,Open,3.0,Resources,189
7,6459,Pascal Truong,pascal.truong@bluewin.ch,FMD Emergency Preparation Course (EN13),FEPC_EN13,FMD Emergency Preparation Course,,Switzerland,,Regional level/field veterinarian,Yes,2022-11-22 00:00:00,2022-12-26 00:00:00,100.0,Completed,374.0,FMD Emergency Preparation Course / English,190
8,6460,Pascal Truong,pascal.truong@bluewin.ch,Knowledge Bank,Knowledge Bank,NU,,Switzerland,,Regional level/field veterinarian,No,2022-11-22 00:00:00,,50.0,Open,3.0,Resources,191
9,6462,Jürg Frigg,j.frigg@laclinique.ch,FMD Emergency Preparation Course (EN13),FEPC_EN13,FMD Emergency Preparation Course,Madiswil,Switzerland,,Central level government veterinarian,Yes,2022-11-22 00:00:00,2022-12-22 00:00:00,100.0,Completed,374.0,FMD Emergency Preparation Course / English,192


In [9]:
# How many blank strings ('') does each column of tom_enrolments contain?

tom_enrolments_df.eq('').sum()

id                     0
full_name              0
email                  0
course_fullname        0
course_shortname       0
course_category        0
city                1004
country                0
institution         1756
role                1010
completed              0
time_enrolled          0
time_completed         0
progress               0
status                 0
course_id              0
moodle_category      381
new_id                 0
dtype: int64

In [10]:
# Frequency of each role value; blank strings ('') show up as their own category.
tom_enrolments_df["role"].value_counts()

role
                                         1010
Regional level/field veterinarian         414
Central level government veterinarian     396
Laboratory staff                          104
Other (please specify)                     44
Students                                   17
Private veterinarian                        3
Name: count, dtype: int64

In [11]:
# Frequency of each progress value across all enrolments.
tom_enrolments_df["progress"].value_counts()

progress
50.0     1365
100.0     572
0.0        35
14.3        2
60.0        1
80.0        1
10.0        1
23.1        1
87.0        1
1.2         1
92.9        1
57.1        1
7.1         1
46.2        1
42.9        1
2.0         1
77.8        1
90.0        1
Name: count, dtype: int64

In [12]:
# Load the competencies ("brick" skills) mapping sheet; skiprows=1 skips the first row of
# the sheet (presumably a title row above the header - confirm against the file).
brick_mapping_df = pd.read_excel("data/get_prepared_competencies_mapping.xlsx", skiprows=1)

In [15]:
# Preview the first rows of the mapping: one row per course, one column per skill.
brick_mapping_df.head()

Unnamed: 0,Course,Level,Acccess,Disease covered,Simulation exercises,Training,Laboratories,Contingency planning,Surveillance,Awareness,...,Communication,Disposal,Humane killing of animals,Vaccination,Cleaning and disinfection,Control zones,Wildlife,Recovery of disease status,Vaccination exit strategy,Re-stocking
0,Introduction to Foot-and-Mouth disease,Basic,Open,FMD,,,,,,X,...,,,,,,,,,,
1,Introduction to Lumpy Skin disease,Basic,Open,LSD,,,,,,X,...,,,,,,,,,,
2,Introduction to Biorisk Minimum Standards,Basic,Open,FMD,,,X,,,,...,,,,,,,,,,
3,Introduction to Simulation Exercises,Basic,Open,FAST,X,,,,,,...,,,,,,,,,,
4,FMD/RVF/SPGP/LSD Emergency preparedness course,Intermediate,Nomination,Courses dedicated to one single disease among:...,,,,X,X,X,...,,,,,,X,,,,


TODO: Create a function that loops through the courses listed in the mapping file and creates a list of all the "brick" skills that are covered in the course as well as a list of diseases covered in the course. NOTE, if a course covers FAST diseases, this means that ALL diseases of interest are covered. FAST stands for "Foot and mouth and similar transboundary animal diseases"

In [16]:
# Within the brick_mapping dataframe convert X to 1 and NaN values to 0.
# "Disease covered" holds text, so its gaps become empty strings instead of 0.
# Results are assigned back instead of using inplace=True: calling
# fillna(inplace=True) on a selected column is chained assignment, which does not
# update the parent dataframe under pandas Copy-on-Write.

brick_mapping_df = brick_mapping_df.replace(to_replace='X', value=1)
brick_mapping_df["Disease covered"] = brick_mapping_df["Disease covered"].fillna(value='')
brick_mapping_df = brick_mapping_df.fillna(value=0)

In [18]:
# (rows, columns) of the mapping dataframe.
brick_mapping_df.shape

(19, 25)

In [19]:
# Convert the brick mapping dataframe into a list of dicts (one record per row,
# keyed by column name) for easier handling:
temp_dict = brick_mapping_df.to_dict(orient='records')

In [20]:
# Inspect the records.
# NOTE(review): this displays every record; consider temp_dict[:2] to keep the output short.
temp_dict

[{'Course': 'Introduction to Foot-and-Mouth disease',
  'Level': 'Basic',
  'Acccess': 'Open ',
  'Disease covered': 'FMD',
  'Simulation exercises': 0.0,
  'Training': 0.0,
  'Laboratories': 0.0,
  'Contingency planning': 0.0,
  'Surveillance': 0.0,
  'Awareness': 1.0,
  'Clinical Examination': 0.0,
  'Epidemiological Investigation': 0.0,
  'Sampling': 0.0,
  'Farm biosecurity': 0.0,
  'Personal biosecurity': 0.0,
  'Communication': 0.0,
  'Disposal': 0.0,
  'Humane killing of animals': 0.0,
  'Vaccination': 0.0,
  'Cleaning and disinfection': 0.0,
  'Control zones': 0.0,
  'Wildlife': 0.0,
  'Recovery of disease status': 0.0,
  'Vaccination exit strategy': 0.0,
  'Re-stocking': 0.0},
 {'Course': 'Introduction to Lumpy Skin disease',
  'Level': 'Basic',
  'Acccess': 'Open ',
  'Disease covered': 'LSD',
  'Simulation exercises': 0.0,
  'Training': 0.0,
  'Laboratories': 0.0,
  'Contingency planning': 0.0,
  'Surveillance': 0.0,
  'Awareness': 1.0,
  'Clinical Examination': 0.0,
  'Epid

In [21]:
# Index the course records by course name so a course can be looked up directly.
brick_mapping_dict = {record["Course"]: record for record in temp_dict}
    

In [22]:
# Spot-check a single course record.
brick_mapping_dict['Introduction to Biorisk Minimum Standards']

{'Course': 'Introduction to Biorisk Minimum Standards',
 'Level': 'Basic',
 'Acccess': 'Open ',
 'Disease covered': 'FMD',
 'Simulation exercises': 0.0,
 'Training': 0.0,
 'Laboratories': 1.0,
 'Contingency planning': 0.0,
 'Surveillance': 0.0,
 'Awareness': 0.0,
 'Clinical Examination': 0.0,
 'Epidemiological Investigation': 0.0,
 'Sampling': 0.0,
 'Farm biosecurity': 0.0,
 'Personal biosecurity': 0.0,
 'Communication': 0.0,
 'Disposal': 0.0,
 'Humane killing of animals': 0.0,
 'Vaccination': 0.0,
 'Cleaning and disinfection': 0.0,
 'Control zones': 0.0,
 'Wildlife': 0.0,
 'Recovery of disease status': 0.0,
 'Vaccination exit strategy': 0.0,
 'Re-stocking': 0.0}

In [23]:
# Prototype on one course: collect the names of all columns flagged with 1 and
# store that list on the course record itself.
i = brick_mapping_dict['Introduction to Biorisk Minimum Standards']

skills_covered_in_course = [column for column, flag in i.items() if flag == 1]
i["skills_covered_in_course"] = skills_covered_in_course

In [24]:
# Skills flagged with 1 for the selected course.
skills_covered_in_course

['Laboratories']

In [25]:
# The record now also carries the 'skills_covered_in_course' list.
brick_mapping_dict['Introduction to Biorisk Minimum Standards']

{'Course': 'Introduction to Biorisk Minimum Standards',
 'Level': 'Basic',
 'Acccess': 'Open ',
 'Disease covered': 'FMD',
 'Simulation exercises': 0.0,
 'Training': 0.0,
 'Laboratories': 1.0,
 'Contingency planning': 0.0,
 'Surveillance': 0.0,
 'Awareness': 0.0,
 'Clinical Examination': 0.0,
 'Epidemiological Investigation': 0.0,
 'Sampling': 0.0,
 'Farm biosecurity': 0.0,
 'Personal biosecurity': 0.0,
 'Communication': 0.0,
 'Disposal': 0.0,
 'Humane killing of animals': 0.0,
 'Vaccination': 0.0,
 'Cleaning and disinfection': 0.0,
 'Control zones': 0.0,
 'Wildlife': 0.0,
 'Recovery of disease status': 0.0,
 'Vaccination exit strategy': 0.0,
 'Re-stocking': 0.0,
 'skills_covered_in_course': ['Laboratories']}

In [27]:
def add_skills_covered_to_dict(i):
    """Attach the list of skills a course covers to its record.

    Every key of ``i`` whose value equals 1 is treated as a covered skill. The
    matching keys are stored, in the record's own key order, under the key
    ``"skills_covered_in_course"``.

    Args:
        i: A course record (dict). It is modified in place.

    Returns:
        None.
    """
    # The list is fully built before the new key is inserted, so the dict is
    # never resized while it is being iterated.
    i["skills_covered_in_course"] = [
        column for column, flag in i.items() if flag == 1
    ]

In [28]:
# Check the Awareness flag of the Lumpy Skin disease course.
brick_mapping_dict['Introduction to Lumpy Skin disease']['Awareness']

1.0

In [29]:
# Try the helper on a single course record (the record is modified in place).
add_skills_covered_to_dict(brick_mapping_dict['Introduction to Lumpy Skin disease'])

In [30]:
# Confirm that 'skills_covered_in_course' was added to the record.
brick_mapping_dict['Introduction to Lumpy Skin disease']

{'Course': 'Introduction to Lumpy Skin disease',
 'Level': 'Basic',
 'Acccess': 'Open ',
 'Disease covered': 'LSD',
 'Simulation exercises': 0.0,
 'Training': 0.0,
 'Laboratories': 0.0,
 'Contingency planning': 0.0,
 'Surveillance': 0.0,
 'Awareness': 1.0,
 'Clinical Examination': 0.0,
 'Epidemiological Investigation': 0.0,
 'Sampling': 0.0,
 'Farm biosecurity': 0.0,
 'Personal biosecurity': 0.0,
 'Communication': 0.0,
 'Disposal': 0.0,
 'Humane killing of animals': 0.0,
 'Vaccination': 0.0,
 'Cleaning and disinfection': 0.0,
 'Control zones': 0.0,
 'Wildlife': 0.0,
 'Recovery of disease status': 0.0,
 'Vaccination exit strategy': 0.0,
 'Re-stocking': 0.0,
 'skills_covered_in_course': ['Awareness']}

In [31]:
# Apply the helper to every course record so each one gains its 'skills_covered_in_course' list.
for i in brick_mapping_dict:
    add_skills_covered_to_dict(brick_mapping_dict[i])

In [32]:
# Inspect the enriched mapping.
# NOTE(review): this displays every course record; consider showing a single entry to keep the output short.
brick_mapping_dict

{'Introduction to Foot-and-Mouth disease': {'Course': 'Introduction to Foot-and-Mouth disease',
  'Level': 'Basic',
  'Acccess': 'Open ',
  'Disease covered': 'FMD',
  'Simulation exercises': 0.0,
  'Training': 0.0,
  'Laboratories': 0.0,
  'Contingency planning': 0.0,
  'Surveillance': 0.0,
  'Awareness': 1.0,
  'Clinical Examination': 0.0,
  'Epidemiological Investigation': 0.0,
  'Sampling': 0.0,
  'Farm biosecurity': 0.0,
  'Personal biosecurity': 0.0,
  'Communication': 0.0,
  'Disposal': 0.0,
  'Humane killing of animals': 0.0,
  'Vaccination': 0.0,
  'Cleaning and disinfection': 0.0,
  'Control zones': 0.0,
  'Wildlife': 0.0,
  'Recovery of disease status': 0.0,
  'Vaccination exit strategy': 0.0,
  'Re-stocking': 0.0,
  'skills_covered_in_course': ['Awareness']},
 'Introduction to Lumpy Skin disease': {'Course': 'Introduction to Lumpy Skin disease',
  'Level': 'Basic',
  'Acccess': 'Open ',
  'Disease covered': 'LSD',
  'Simulation exercises': 0.0,
  'Training': 0.0,
  'Laborat