### Imports 

In [None]:
# imports 
import pandas as pd
import re

## Approach

__Goals:__ <br>
Given a list of example technical skills, extract the technical skills of a given dataset

__Step 1: Extracting the data__<br>
Read in skills from "Example_Technical_Skills.csv" and 'Raw_Skills_Dataset.csv' and store them in their own dataframe. <br>
This is done with the "dataframe_create" function. <br>

Extract the information from "Technology Skills" and "RAW DATA" columns from "Example_Technical_Skills.csv" and 'Raw_Skills_Dataset.csv' respectively and store the extracted info into their own lists. <br>

__Step 2: Comparing the data__<br>
Compare the 2 lists and extract only the skills found in "Example_Technical_Skills.csv" from 'Raw_Skills_Dataset.csv' <br>

__Step 3: Separating soft skills from technical skills__<br>
Separate soft and technical skills. See below for more details

#### Helper Functions:

In [None]:
# Defining helper functions

def dataframe_create(str1):
  """
  Load a tab-separated, latin1-encoded text file into a pandas dataframe.

  Parameters:
  str1: Name (path) of the csv file to read

  Output: Pandas dataframe holding the file's contents
  """
  # The files are read as tab-delimited text, even though they carry a
  # ".csv" extension.
  return pd.read_csv(str1, sep='\t', encoding="latin1")


def find_skill(list1, list2):
  """
  Return the entries of list1 that are also present in list2.

  The order of list1 is preserved and repeated entries are kept, so the
  length of the result is the number of matching entries.

  Parameters:
  list1: List you want to check for skills
  list2: Reference list of skills you want to check for

  Output: List of skills found in list1 that are present in list2
  """
  try:
    # Hash the reference skills once so that each membership test is a
    # constant-time lookup instead of a scan of list2 per entry of list1.
    reference = set(list2)
    return [skill for skill in list1 if skill in reference]
  except TypeError:
    # Unhashable entries (e.g. lists) cannot be looked up in a set;
    # fall back to scanning list2 directly.
    return [skill for skill in list1 if skill in list2]


def uniq_skills(list1):
  """
  Return the distinct skills of list1, keeping first-seen order.

  Parameters:
  list1: List you want to check for unique skills

  Output: List of unique skills found in list1 (each skill appears once,
  at the position of its first occurrence)
  """
  try:
    # dict keys are unique and keep insertion order, so this removes
    # repeats in a single pass without rescanning the result list.
    return list(dict.fromkeys(list1))
  except TypeError:
    # Unhashable entries (e.g. lists) cannot be dict keys; fall back to
    # comparing each entry against the ones already kept.
    tech_skills = []
    for skill in list1:
      if skill not in tech_skills:
        tech_skills.append(skill)
    return tech_skills


#### Driver code: 

In [None]:
# STEP 1:

# Creating dataframes from given csv files
df_ex = dataframe_create('Example_Technical_Skills.csv')
df_raw = dataframe_create('Raw_Skills_Dataset.csv')

# Pull the column of interest out of each dataframe as a plain Python list
# (one entry per row, in row order).
ex_skills = df_ex["Technology Skills"].tolist()  # example tech skills
raw_data = df_raw["RAW DATA"].tolist()           # entries of the raw skills dataset

print('Number of Technical (Hard) skills from "Example_Technical_Skills" dataset:', len(ex_skills))
print('Number of entries from "Raw_Skills_Dataset" dataset:', len(raw_data))

Number of Technical (Hard) skills from "Example_Technical_Skills" dataset: 979
Number of entries from "Raw_Skills_Dataset" dataset: 34116


In [None]:
# STEP 2:
# Keep only the "raw_data" entries that also appear in the example
# technical skills list ("hard technical skills").
skills = find_skill(raw_data, ex_skills)
print('Number of technical skills found in "raw_data":', len(skills))

# Collapse repeated entries so that each extracted skill is listed once.
num_skills = uniq_skills(skills)
print('Number of unique technical skills found in "raw_data":', len(num_skills))
print('Unique techical skills extracted:', num_skills)

Number of technical skills found in "raw_data": 259
Number of unique technical skills found in "raw_data": 36
Unique techical skills extracted: ['MySQL', 'SCSS', 'EAC', 'DevOps', 'OpenShift', 'GitHub', 'NetSuite', 'Bitbucket', 'Ethereum', 'Kotlin', 'Bash', 'FTP', 'OneStream', 'Consul', 'Force.com', 'Amazon Lambda', 'Radius', 'Gulp', 'Guidewire PolicyCenter', 'Adobe Photoshop', 'Eclipse', 'Phantom', 'Magento', 'Microsoft Azure Networking', 'ActiveMQ', 'SAP Lumira', 'SAP Analytics Cloud', 'PostGIS', 'SAS JMP', 'NuGet', 'DRM', 'SAS Enterprise Guide', 'Katalon', 'Apache Oozie', 'Octopus Deploy', 'SAS Base']


### Step 3: __Separating soft skills from technical skills__

Upon examining the data in 'Raw_Skills_Dataset.csv', I noticed that the entries can be classified as __technical skills__, __soft skills__, or __random words__.<br>

I attempt to separate soft skills from technical skills by extracting the soft skills from the 'raw_data' list I created earlier.<br>

To do this, I create a regular expression that covers commonly desired soft skills such as: 
- Good Attitude
- Communication Skills 
- Work ethic
- Teamwork
- Leadership qualities
- Time management
- Decision making
- Conflict resolution
- Critical thinking
- Networking
- Empathy
- Problem-solving 

Of course, this list is not exhaustive, and the pattern may return values that are not actually soft skills. However, it will filter out most of the entries considered __"random words"__ and give you a decent idea of what soft skills are present in your dataset.<br>

With this, I now have functions that can extract the technical skills and the soft skills. I can then apply the "uniq_skills" function created earlier to filter out repeated skills. 




In [None]:
def find_soft(list1):
  """
  Finds soft skills in list1.

  Each text entry is searched with a regular expression covering common
  soft skills. At most one skill is taken per entry (the first match in
  that entry) and it is stored lower-cased. Entries that are not strings
  are skipped.

  Parameters:
  list1: List you want to check for skills

  Output: List of lower-cased soft skills found in list1, in input order
  (repeated skills are kept)
  """

  # Desired soft skills as one alternation. A raw string is used so the
  # regex escapes (\s, \w, \n) reach the regex engine untouched instead
  # of being parsed as Python string escapes.
  # NOTE(review): the "work" alternative ends in \n, so it only matches
  # when a newline follows the next word -- confirm this is intended.
  pattern = re.compile(
    r'[aA]ttitude[-\s]\w+|[pP]roblem[-\s][sS]\w+|[mM]entor\w+|[cC]ommunication\s[sS]kills|[pP]resent\w+|[wW]ork\s\w+\n|[tT]eamwork|[lL]eader\w+|[tT]ime[-\s][mM]anagement|[dD]ecision[-\s]\w+|[cC]onflict[-\s]\w+|[cC]ritical[-\s][tT]hink\w+|[nN]etworking|[eE]mpathy|[rR]esolution'
  )

  # Finding softskills in list1
  sft_sk = []
  for entry in list1:
    # Non-text entries (e.g. the float NaN pandas uses for missing
    # cells) cannot be searched by the regex; skip them instead of
    # raising TypeError.
    if not isinstance(entry, str):
      continue
    match = pattern.search(entry)  # only the first match is needed
    if match:
      sft_sk.append(match.group(0).lower())

  return sft_sk

In [None]:
# Extract one soft skill per matching entry of the raw dataset
soft_sk = find_soft(raw_data)
print("Number of soft skills found in dataset:", len(soft_sk))

# Drop repeats so that each soft skill is reported once
uniq_soft = uniq_skills(soft_sk)
print('Number of unique soft skills found in "raw_data":', len(uniq_soft))
print("Unique soft skills extracted:", uniq_soft)

Number of soft skills found in dataset: 682
Number of unique soft skills found in "raw_data": 26
Unique soft skills extracted: ['problem-solving', 'problem-solver', 'mentors', 'leadership', 'decision models', 'problem solving', 'networking', 'empathy', 'resolution', 'mentorship', 'communication skills', 'problem solver', 'presentation', 'mentoring', 'presentations', 'teamwork', 'decision-making', 'presenter', 'critical thinkers', 'problem solvers', 'presentative', 'critical thinker', 'time management', 'time-management', 'presentatives', 'critical thinking']
