# Data preprocessing

In [1]:
import numpy as np
import pandas as pd
import json
import matplotlib.pyplot as plt

In [2]:
def read_json(file, data_dir="Data"):
    """Load one dataset from ``<data_dir>/<file>.json`` and keep the analysis columns.

    Parameters
    ----------
    file : str
        Base name of the JSON file, without the ``.json`` extension.
    data_dir : str, default "Data"
        Directory that contains the JSON files. The default matches the
        location this notebook has always read from.

    Returns
    -------
    pandas.DataFrame
        The columns ``onetsoc_code``, ``element_id``, ``scale_id`` and
        ``data_value``, in that order; any other column in the file is dropped.

    Raises
    ------
    KeyError
        If the file lacks one of the four expected columns.
    """
    columns = ['onetsoc_code', 'element_id', 'scale_id', 'data_value']
    reader = pd.read_json(f"{data_dir}/{file}.json")
    return reader[columns]

In [3]:
# Read the three tables in order; each name is both the variable and the file stem.
knowledge, skills, work_activities = (
    read_json(table_name)
    for table_name in ("knowledge", "skills", "work_activities")
)

In [4]:
# Preview the first 10 rows of the knowledge table.
knowledge.head(10)

Unnamed: 0,onetsoc_code,element_id,scale_id,data_value
0,11-1011.00,2.C.1.a,IM,4.75
1,11-1011.00,2.C.1.a,LV,6.23
2,11-1011.00,2.C.1.b,IM,2.66
3,11-1011.00,2.C.1.b,LV,3.5
4,11-1011.00,2.C.1.c,IM,3.7
5,11-1011.00,2.C.1.c,LV,4.36
6,11-1011.00,2.C.1.d,IM,3.23
7,11-1011.00,2.C.1.d,LV,3.9
8,11-1011.00,2.C.1.e,IM,4.09
9,11-1011.00,2.C.1.e,LV,5.55


In [5]:
# Preview the last 10 rows of the skills table.
skills.tail(10)

Unnamed: 0,onetsoc_code,element_id,scale_id,data_value
67008,53-7121.00,2.B.4.h,IM,2.12
67009,53-7121.00,2.B.4.h,LV,2.12
67010,53-7121.00,2.B.5.a,IM,3.12
67011,53-7121.00,2.B.5.a,LV,2.88
67012,53-7121.00,2.B.5.b,IM,2.0
67013,53-7121.00,2.B.5.b,LV,1.12
67014,53-7121.00,2.B.5.c,IM,2.0
67015,53-7121.00,2.B.5.c,LV,1.88
67016,53-7121.00,2.B.5.d,IM,2.88
67017,53-7121.00,2.B.5.d,LV,2.75


In [6]:
# Preview the last 10 rows of the work_activities table.
work_activities.tail(10)

Unnamed: 0,onetsoc_code,element_id,scale_id,data_value
79320,53-7121.00,4.A.4.b.5,IM,3.07
79321,53-7121.00,4.A.4.b.5,LV,3.28
79322,53-7121.00,4.A.4.b.6,IM,2.72
79323,53-7121.00,4.A.4.b.6,LV,2.4
79324,53-7121.00,4.A.4.c.1,IM,2.54
79325,53-7121.00,4.A.4.c.1,LV,2.27
79326,53-7121.00,4.A.4.c.2,IM,1.93
79327,53-7121.00,4.A.4.c.2,LV,1.6
79328,53-7121.00,4.A.4.c.3,IM,2.56
79329,53-7121.00,4.A.4.c.3,LV,2.64


### Required knowledge

In [7]:
# Number of distinct job codes in the knowledge table (missing values are not counted).
knowledge["onetsoc_code"].nunique()

967

In [8]:
# Rows per job code in ascending order, so jobs with fewer rows than the rest appear first.
knowledge["onetsoc_code"].value_counts().sort_values()

43-4111.00    55
43-4131.00    61
17-2199.05    66
11-3051.03    66
15-1199.10    66
              ..
11-9013.03    66
13-2071.00    66
29-1171.00    66
15-1122.00    66
19-3011.01    66
Name: onetsoc_code, Length: 967, dtype: int64

In [9]:
# Number of distinct element ids in the knowledge table (missing values are not counted).
knowledge["element_id"].nunique()

33

### Required skills

In [23]:
# Number of distinct job codes in the skills table (missing values are not counted).
skills["onetsoc_code"].nunique()

951

In [24]:
# Rows per job code in ascending order, so jobs with fewer rows than the rest appear first.
skills["onetsoc_code"].value_counts().sort_values()

19-4051.01    70
11-3051.03    70
15-1199.10    70
41-3099.01    70
43-3021.02    70
              ..
11-3051.06    70
11-9013.03    70
13-2071.00    70
47-5012.00    70
47-2061.00    70
Name: onetsoc_code, Length: 951, dtype: int64

In [12]:
# Number of distinct element ids in the skills table (missing values are not counted).
skills["element_id"].nunique()

35

### Work activities

In [25]:
# Number of distinct job codes in the work_activities table (missing values are not counted).
work_activities["onetsoc_code"].nunique()

965

In [26]:
# Rows per job code in ascending order, so jobs with fewer rows than the rest appear first.
work_activities["onetsoc_code"].value_counts().sort_values()

19-4051.01    82
15-1199.09    82
11-3051.03    82
15-1199.10    82
41-3099.01    82
              ..
11-3051.06    82
11-9013.03    82
13-2071.00    82
51-6063.00    82
47-2061.00    82
Name: onetsoc_code, Length: 965, dtype: int64

In [27]:
# Number of distinct element ids in the work_activities table (missing values are not counted).
work_activities["element_id"].nunique()

41

## Reducing number of jobs

In [28]:
# Compare the job codes in skills against those in knowledge.
# The knowledge codes are collected once into a set so each membership test
# is a constant-time lookup instead of rebuilding and scanning a list per job.
knowledge_jobs = set(knowledge["onetsoc_code"])

c = 0  # skills job codes that also appear in knowledge
for job in skills["onetsoc_code"].unique():
    if job in knowledge_jobs:
        c += 1
    else:
        print("In knowledge dataset there is no any data about: " + str(job) + " job")
print("There is " + str(c) + " matches between jobs described in skills and knowledge dataset")

There is 951 matches between jobs described in skills and knowledge dataset


In [29]:
# Compare the job codes in skills against those in work_activities.
# The work_activities codes are collected once into a set so each membership
# test is a constant-time lookup instead of rebuilding and scanning a list per job.
work_activity_jobs = set(work_activities["onetsoc_code"])

c = 0  # skills job codes that also appear in work_activities
for job in skills["onetsoc_code"].unique():
    if job in work_activity_jobs:
        c += 1
    else:
        # The message names work_activities because that is the table being checked here.
        print("In work_activities dataset there is no any data about: " + str(job) + " job")
print("There is " + str(c) + " matches between jobs described in skills and work_activities dataset")

There is 951 matches between jobs described in skills and work_activities dataset


## From the analysis above we can conclude that:

- All three datasets should have 958 job titles (we need to drop "43-4121.00" because the knowledge dataset does not contain any data for it).
- The <b>skills dataset</b> does not contain the full 70 skill rows for <b>49-2093.00, 33-9021.00, 49-3021.00, 33-3051.01</b>, unlike the other jobs, so these jobs will be dropped.
- The same applies to job titles <b>43-4111.00 and 43-4131.00 in the knowledge dataset</b>.
- <b>47-1011.00 will be dropped from work_activities for the same reason.</b>

# Let's start with cleaning


### Step 1:

In [18]:
# Remove, in place, every knowledge row that belongs to job 43-4121.00.
knowledge.drop(
    index=knowledge[knowledge['onetsoc_code'].eq("43-4121.00")].index,
    inplace=True,
)

In [19]:
# Remove, in place, every skills row that belongs to job 43-4121.00.
# NOTE(review): in the cells visible here this code is not removed from
# work_activities - confirm whether that table should be filtered as well.
skills.drop(
    index=skills[skills['onetsoc_code'].eq("43-4121.00")].index,
    inplace=True,
)

### Step 2:

In [20]:
# Job codes removed from skills in this step, dropped with a single filter
# instead of one table scan per code.
# NOTE(review): in the cells visible here these codes are removed from skills
# only - confirm whether knowledge and work_activities should be filtered too.
incomplete_skills_jobs = ["49-2093.00", "33-9021.00", "49-3021.00", "33-3051.01"]
skills.drop(
    index=skills[skills['onetsoc_code'].isin(incomplete_skills_jobs)].index,
    inplace=True,
)

### Step 3:

In [21]:
# Remove these two job codes from all three tables so the tables keep
# describing the same set of jobs. One filter per table replaces the
# copy-pasted per-code drops.
incomplete_knowledge_jobs = ["43-4111.00", "43-4131.00"]
for frame in (knowledge, skills, work_activities):
    frame.drop(
        index=frame[frame['onetsoc_code'].isin(incomplete_knowledge_jobs)].index,
        inplace=True,
    )

In [22]:
# Remove job 47-1011.00, in place, from each of the three tables.
for frame in (work_activities, knowledge, skills):
    frame.drop(
        index=frame[frame['onetsoc_code'].eq("47-1011.00")].index,
        inplace=True,
    )
