## Import Data

In [1]:
import pandas as pd

df = pd.read_csv("Digital Media Job Data.csv")

In [2]:
df

Unnamed: 0,job title,company name,location,salary,skills
0,Digital Media and Graphic Design Specialist,"Pennsylvania House of Representatives, Democra...","Harrisburg, PA","Estimated: $47,000 - $65,000 a year",PMP
1,Digital Media and Graphic Design Specialist,"Pennsylvania House of Representatives, Democra...","Harrisburg, PA","Estimated: $47,000 - $65,000 a year",Legislation
2,Digital Media and Graphic Design Specialist,"Pennsylvania House of Representatives, Democra...","Harrisburg, PA","Estimated: $47,000 - $65,000 a year",Assembly
3,Digital Media and Graphic Design Specialist,"Pennsylvania House of Representatives, Democra...","Harrisburg, PA","Estimated: $47,000 - $65,000 a year",Adobe Illustrator
4,Digital Media and Graphic Design Specialist,"Pennsylvania House of Representatives, Democra...","Harrisburg, PA","Estimated: $47,000 - $65,000 a year",Digital design
...,...,...,...,...,...
2918,Digital Media Specialist,"Spring Hills, LLC -",Remote,"Estimated: $53,000 - $71,000 a year",Microsoft Powerpoint
2919,Digital Media Specialist,"Spring Hills, LLC -",Remote,"Estimated: $53,000 - $71,000 a year",Adobe Photoshop
2920,Digital Media Specialist,"Spring Hills, LLC -",Remote,"Estimated: $53,000 - $71,000 a year",WordPress
2921,Digital Media Specialist,"Spring Hills, LLC -",Remote,"Estimated: $53,000 - $71,000 a year",Social media management


## Data Cleaning

### drop null values

In [3]:
df = df.dropna()

### drop columns

In [4]:
df=df.drop(['company name'], axis=1)

In [5]:
df=df.drop(['location'], axis=1)

## Data Cleaning

### salary

In [6]:
df['salary'] = df['salary'].apply(lambda x: x.replace('Estimated: ', ''))

In [7]:
df['salary'] = df['salary'].apply(lambda x: x.replace('$', '').replace('K', ',000'))

In [8]:
df['salary'] = df['salary'].apply(lambda x: x.replace('.1', '').replace('.2', '').replace('.3', '').replace('.4', '').replace('.5', '').replace('.6', '').replace('.7', '').replace('.8', '').replace('.9', ''))

In [9]:
df = df[df["salary"].str.contains("an hour")== False]

In [10]:
df = df[df["salary"].str.contains("a week")== False]

In [11]:
df = df[df["salary"].str.contains("a month")== False]

In [12]:
df['salary'] = df['salary'].apply(lambda x: x.replace(' a year', ''))

In [13]:
df = df.drop(df[df['salary'].str.contains('-')==False].index)

### create min, max, and average salary columns

In [14]:
df['min salary'] = df['salary'].apply(lambda x: x.split(' - ')[0])

In [15]:
df['max salary'] = df['salary'].apply(lambda x: x.split(' - ')[1])

In [16]:
df['min salary'] = df['min salary'].apply(lambda x: x.replace(',', ''))

In [17]:
df['max salary'] = df['max salary'].apply(lambda x: x.replace(',', ''))

In [18]:
df[['min salary', 'max salary']] = df[['min salary', 'max salary']].apply(pd.to_numeric)

In [19]:
df['avg salary'] = (df['min salary']+df['max salary'])/2

### drop salary column

In [20]:
df=df.drop(['salary'], axis=1)

In [21]:
df

Unnamed: 0,job title,skills,min salary,max salary,avg salary
0,Digital Media and Graphic Design Specialist,PMP,47000,65000,56000.0
1,Digital Media and Graphic Design Specialist,Legislation,47000,65000,56000.0
2,Digital Media and Graphic Design Specialist,Assembly,47000,65000,56000.0
3,Digital Media and Graphic Design Specialist,Adobe Illustrator,47000,65000,56000.0
4,Digital Media and Graphic Design Specialist,Digital design,47000,65000,56000.0
...,...,...,...,...,...
2918,Digital Media Specialist,Microsoft Powerpoint,53000,71000,62000.0
2919,Digital Media Specialist,Adobe Photoshop,53000,71000,62000.0
2920,Digital Media Specialist,WordPress,53000,71000,62000.0
2921,Digital Media Specialist,Social media management,53000,71000,62000.0


### drop job benefits

In [22]:
df = df[df["skills"].str.contains("Health insurance|401|Paid time off|year|years|Wellness program|Dental insurance|Tuition reimbursement|Vision insurance|Life insurance|Disability insurance|Retirement plan|Health savings account|insurance|Employee assistance program|Opportunities for advancement|Work from home|Parental leave|Flexible schedule|Caregiver leave|Employee stock purchase plan|Family leave|Paid holidays|Flexible spending account|Relocation assistance|Secret Clearance|Referral program|Professional development assistance|Adoption assistance|Unlimited paid time off|Training & development|No experience needed|Paid sick time|Caregiver leave|Employee discount|Prescription drug insurance|Visa sponsorship|Loan forgiveness|403|Top Secret Clearance|Employee stock ownership plan|Commuter assistance") == False]

### separate degrees from skills

In [23]:
degrees=df["skills"].str.contains("Doctor of Philosophy|degree|Bachelor|Master|High school")

In [24]:
df['education']=df["skills"].where(degrees,'')

In [25]:
df["skills"]=df["skills"].mask(degrees,'')

### drop education column

In [26]:
df=df.drop(['education'], axis=1)

In [27]:
df

Unnamed: 0,job title,skills,min salary,max salary,avg salary
0,Digital Media and Graphic Design Specialist,PMP,47000,65000,56000.0
1,Digital Media and Graphic Design Specialist,Legislation,47000,65000,56000.0
2,Digital Media and Graphic Design Specialist,Assembly,47000,65000,56000.0
3,Digital Media and Graphic Design Specialist,Adobe Illustrator,47000,65000,56000.0
4,Digital Media and Graphic Design Specialist,Digital design,47000,65000,56000.0
...,...,...,...,...,...
2918,Digital Media Specialist,Microsoft Powerpoint,53000,71000,62000.0
2919,Digital Media Specialist,Adobe Photoshop,53000,71000,62000.0
2920,Digital Media Specialist,WordPress,53000,71000,62000.0
2921,Digital Media Specialist,Social media management,53000,71000,62000.0


### job titles

In [28]:
df4 = df.copy()

In [29]:
pd.set_option("display.max_rows", None)
print(df4['job title'].value_counts())

Marketing Specialist                                                                                                   225
Digital Marketing Specialist                                                                                           154
Social Media Specialist                                                                                                 98
Communications Specialist                                                                                               46
Digital Media Specialist                                                                                                39
Marketing Manager                                                                                                       36
Marketing Communications Specialist                                                                                     36
Digital Marketing Manager                                                                                               27
Senior Communica

In [30]:
df4.loc[df4['job title'].str.contains('digital marketing', case=False), 'job title'] = 'Digital Marketing Specialist'

In [31]:
df4.loc[df4['job title'].str.contains('social media', case=False), 'job title'] = 'Social Media Specialist'

In [32]:
df4.loc[df4['job title'].str.contains('communication', case=False), 'job title'] = 'Communications Specialist'

In [33]:
df4.loc[df4['job title'].str.contains('digital media', case=False), 'job title'] = 'Digital Media Specialist'

In [34]:
df4.loc[df4['job title'].str.contains('digital content', case=False), 'job title'] = 'Digital Content Specialist'

In [35]:
df4.loc[df4['job title'].str.contains('ppc', case=False), 'job title'] = 'PPC Specialist'

In [36]:
df4.loc[df4['job title'].str.contains('seo', case=False), 'job title'] = 'SEO Specialist'

In [37]:
df4.loc[df4['job title'].str.contains('public relations', case=False), 'job title'] = 'Public Relations Specialist'

In [38]:
df4.loc[df4['job title'].str.contains('advertising', case=False), 'job title'] = 'Advertising Specialist'

In [39]:
df4.loc[df4['job title'].str.contains('content', case=False), 'job title'] = 'Content Specialist'

In [40]:
df4.loc[df4['job title'].str.contains('media', case=False), 'job title'] = 'Media Specialist'

In [41]:
df4.loc[df4['job title'].str.contains('creative services', case=False), 'job title'] = 'Creative Services Specialist'

In [42]:
df4.loc[df4['job title'].str.contains('copywriter', case=False), 'job title'] = 'Copy Supervisor'

In [43]:
df4.loc[df4['job title'].str.contains('digital strategist', case=False), 'job title'] = 'Digital Marketing Strategist'

In [44]:
df4.loc[df4['job title'].str.contains('digital marketer', case=False), 'job title'] = 'Digital Marketing Specialist'

In [45]:
df4.loc[df4['job title'].str.contains('marketing manager', case=False), 'job title'] = 'Marketing Specialist'

In [46]:
df4.loc[df4['job title'].str.contains('marketing associate', case=False), 'job title'] = 'Marketing Specialist'

In [47]:
df4.loc[df4['job title'].str.contains('director of marketing', case=False), 'job title'] = 'Marketing Specialist'

In [48]:
df4.loc[df4['job title'].str.contains('marketing director', case=False), 'job title'] = 'Marketing Specialist'

In [49]:
df4.loc[df4['job title'].str.contains('marketing automation specialist', case=False), 'job title'] = 'Marketing Specialist'

In [50]:
df4.loc[df4['job title'].str.contains('marketing automation specialist', case=False), 'job title'] = 'Marketing Specialist'

In [51]:
df4.loc[df4['job title'].str.contains('marketing operations manager', case=False), 'job title'] = 'Marketing Specialist'

In [52]:
df4.loc[df4['job title'].str.contains('omnichannel marketing specialist', case=False), 'job title'] = 'Marketing Specialist'

In [53]:
df4.loc[df4['job title'].str.contains('digital brand strategist', case=False), 'job title'] = 'Digital Marketing Strategist'

In [54]:
df4.loc[df4['job title'].str.contains('events specialist', case=False), 'job title'] = 'Marketing Specialist'

In [55]:
df4.loc[df4['job title'].str.contains('marketing professional', case=False), 'job title'] = 'Marketing Specialist'

In [56]:
df4.loc[df4['job title'].str.contains('senior marketing specialist', case=False), 'job title'] = 'Marketing Specialist'

In [57]:
df4.loc[df4['job title'].str.contains('product marketing specialist', case=False), 'job title'] = 'Marketing Specialist'

In [58]:
df4.loc[df4['job title'].str.contains('IT Strategic Marketing Specialist', case=False), 'job title'] = 'Marketing Specialist'

In [59]:
df4.loc[df4['job title'].str.contains('Marketing Specialist - Hybrid', case=False), 'job title'] = 'Marketing Specialist'

In [60]:
df4.loc[df4['job title'].str.contains('MARKETING SPECIALIST NF4*', case=False), 'job title'] = 'Marketing Specialist'

In [61]:
df4.loc[df4['job title'].str.contains('B2B Marketing Specialist', case=False), 'job title'] = 'Marketing Specialist'

In [62]:
df4.loc[df4['job title'].str.contains('Marketing Specialist - Gardena', case=False), 'job title'] = 'Marketing Specialist'

In [63]:
df4.loc[df4['job title'].str.contains('Paid Marketing Specialist', case=False), 'job title'] = 'Marketing Specialist'

In [64]:
df4.loc[df4['job title'].str.contains('Sales and Marketing Specialist', case=False), 'job title'] = 'Marketing Specialist'

In [65]:
df4.loc[df4['job title'].str.contains('Senior Proposal Specialist / Marketing Coordinator', case=False), 'job title'] = 'Marketing Coordinator'

In [66]:
df4.loc[df4['job title'].str.contains('International Marketing Specialist', case=False), 'job title'] = 'Marketing Specialist'

In [67]:
df4.loc[df4['job title'].str.contains('Integrated Marketing Specialist', case=False), 'job title'] = 'Marketing Specialist'

In [68]:
df4.loc[df4['job title'].str.contains('Transportation Outreach and Marketing Specialist', case=False), 'job title'] = 'Marketing Specialist'

In [69]:
df4.loc[df4['job title'].str.contains('Marketing Specialist-Austin Industrial', case=False), 'job title'] = 'Marketing Specialist'

In [70]:
df4.loc[df4['job title'].str.contains('Senior Customer Marketing Specialist', case=False), 'job title'] = 'Marketing Specialist'

In [71]:
df4.loc[df4['job title'].str.contains('Regional Marketing Specialist', case=False), 'job title'] = 'Marketing Specialist'

In [72]:
df4.loc[df4['job title'].str.contains('Associate Marketing Specialist', case=False), 'job title'] = 'Marketing Specialist'

In [73]:
df4.loc[df4['job title'].str.contains('Marketing Specialist - Berrien Springs/Hybrid Work Eligible', case=False), 'job title'] = 'Marketing Specialist'

In [74]:
df4.loc[df4['job title'].str.contains('Membership Marketing Specialist', case=False), 'job title'] = 'Marketing Specialist'

In [75]:
df4.loc[df4['job title'].str.contains('Graphic Design Marketing Specialist', case=False), 'job title'] = 'Marketing Specialist'

In [76]:
df4.loc[df4['job title'].str.contains('strategist', case=False), 'job title'] = 'Digital Marketing Strategist'

In [77]:
df4.loc[df4['job title'].str.contains('manager', case=False), 'job title'] = 'Marketing Specialist'

In [78]:
df4.loc[df4['job title'].str.contains('event', case=False), 'job title'] = 'Marketing Specialist'

In [79]:
df4.loc[df4['job title'].str.contains('coordinator', case=False), 'job title'] = 'Marketing Specialist'

In [80]:
df4.loc[df4['job title'].str.contains('Sr.', case=False), 'job title'] = 'Marketing Specialist'

In [81]:
df4.loc[df4['job title'].str.contains('Senior', case=False), 'job title'] = 'Marketing Specialist'

In [82]:
df4.loc[df4['job title'].str.contains('analyst', case=False), 'job title'] = 'Marketing Specialist'

In [83]:
df4.loc[df4['job title'].str.contains('associate', case=False), 'job title'] = 'Marketing Specialist'

In [84]:
df4.loc[df4['job title'].str.contains('Marketing Project Specialist ', case=False), 'job title'] = 'Marketing Specialist'

In [85]:
df4.loc[df4['job title'].str.contains('Marketing Specialilst', case=False), 'job title'] = 'Marketing Specialist'

In [86]:
df4.loc[df4['job title'].str.contains('MBS Marketing Specialist', case=False), 'job title'] = 'Marketing Specialist'

In [87]:
df4.loc[df4['job title'].str.contains('Marketing Campaign Specialist', case=False), 'job title'] = 'Marketing Specialist'

In [88]:
df4.loc[df4['job title'].str.contains('Director Marketing', case=False), 'job title'] = 'Marketing Specialist'

In [89]:
df4.loc[df4['job title'].str.contains('Ad Agency Marketing Specialist', case=False), 'job title'] = 'Marketing Specialist'

In [90]:
df4.loc[df4['job title'].str.contains('Web/ Conversion Optimization Marketing Specialist', case=False), 'job title'] = 'Marketing Specialist'

In [91]:
df4 = df4.drop(df4[df4['job title'].str.contains('Marketing Specialist|Communications Specialist|Media Specialist|Digital Marketing Specialist|Content Specialist|PPC Specialist|Advertising Specialist|Digital Marketing Strategist|Copy Supervisor')==False].index)

In [92]:
df4 = df4[df4["job title"].str.contains("Employer Branding and Recruitment Marketing Specialist")== False]

In [93]:
df4 = df4[df4["job title"].str.contains("WGL - Marketing Specialist")== False]

In [94]:
df4 = df4[df4["job title"].str.contains("Recruitment Marketing Specialist")== False]

In [95]:
df4 = df4[df4["job title"].str.contains("REMOTE")== False]

In [96]:
df4 = df4[df4["job title"].str.contains("Executive Assistant / Tech and Marketing Specialist")== False]

In [97]:
pd.set_option("display.max_rows", None)
print(df4['job title'].value_counts())

Marketing Specialist            580
Communications Specialist       500
Media Specialist                396
Digital Marketing Specialist    337
Content Specialist              163
PPC Specialist                   42
Advertising Specialist           33
Digital Marketing Strategist     29
Copy Supervisor                  28
Name: job title, dtype: int64


In [98]:
df4=df4.reset_index(drop=True)

In [99]:
df4

Unnamed: 0,job title,skills,min salary,max salary,avg salary
0,Media Specialist,PMP,47000,65000,56000.0
1,Media Specialist,Legislation,47000,65000,56000.0
2,Media Specialist,Assembly,47000,65000,56000.0
3,Media Specialist,Adobe Illustrator,47000,65000,56000.0
4,Media Specialist,Digital design,47000,65000,56000.0
5,Media Specialist,,47000,65000,56000.0
6,Communications Specialist,Writing skills,72000,94000,83000.0
7,Communications Specialist,Project management,72000,94000,83000.0
8,Communications Specialist,Research,72000,94000,83000.0
9,Communications Specialist,Content creation,72000,94000,83000.0


In [135]:
df=df4.copy()

### skill separation

In [136]:
df["skills"].unique()

array(['PMP', 'Legislation', 'Assembly', 'Adobe Illustrator',
       'Digital design', '', 'Writing skills', 'Project management',
       'Research', 'Content creation', 'Content development',
       'Social media management', 'Google Ads', 'AdRoll',
       'Constant Contact', 'CSS', 'AP style', 'Adobe Photoshop',
       'Direct marketing', 'Editing', 'Communication skills',
       'Management', 'Time management', 'Organizational skills',
       'Google AdWords', 'Digital marketing', 'Marketing', 'Budgeting',
       'SEO', 'Analysis skills', 'Vendor management', 'Market research',
       'Microsoft Excel', 'Hootsuite', 'Google Docs',
       'Marketing automation', 'SaaS', 'Military', 'Presentation skills',
       'Final Cut Pro', 'Adobe After Effects', 'Graphic design',
       'Adobe Premiere', 'Product management', 'Microsoft Powerpoint',
       'Software troubleshooting', 'Adobe Creative Suite',
       'Google Analytics', 'WordPress', 'Teaching', 'Literacy education',
       'Teacher

### soft skills

In [137]:
soft=df["skills"].str.contains("Confluence|Interviewing|Hospitality|Constant contact|Research|Communication skills|Writing skills|Time management|Organizational skills|Presentation skills|Teaching|Literacy education|Teacher coaching|Computer skills|Recruiting|Human resources|Account management|Social listening|Leadership|Mentoring|Event planning|Supervising experience|Public speaking|Media relations|Translation|Public health|Customer service|Copywriting|Technical writing|Guest services|Continuous improvement|Addiction counseling|Office experience|Conflict management|Negotiation|Purchasing|Technical sales|Change management")

In [138]:
df['soft skills']=df["skills"].where(soft,'')

### business skills

In [139]:
business=df["skills"].str.contains("Financial services|Microsoft Outlook|Slack|Agile|Team management|B2B|Project implementation|Website management|Project planning|Product management|Management|AP style|Social media management|Banking|PMP|Legislation|Project management|Vendor management|Market research|Facebook Advertising|MailChimp|Integrated marketing|Public relations|Journalism|Six Sigma Certification|Pardot|System security plans|Google Ad Manager|Multichannel marketing|Affiliate marketing|Website maintenance|Growing experience|B2B marketing|Lead generation|Video production|Power BI|GIS|Series 6|Bing Ads|Statistical analysis|SAS|Search engines|A/B testing|Google Suite|B2B sales|Strategic planning|Google Analytics Certification|Google AdWords Certification|HTML5|Proofreading|Video editing|HR sourcing|Restoration|Customer relationship management|Software deployment|Contracts|Sage|Drupal|Customer retention|Google Tag Manager|Conversion optimization|Rally|Manufacturing|Web analytics|Financial acumen|Data collection|Customer segmentation|Data center experience|Fundraising|Securities law|Construction|Post-production|Upselling|Content management|WCAG|Algebra|Geometry")

In [140]:
df['business skills']=df["skills"].where(business,'')

### technical skills

In [141]:
technical=df["skills"].str.contains("Cataloging|Typing|Pivot tables|Filming|Canva|Databases|Email marketing|Organization design|Buffer|Smartsheet|Computer networking|Linkbuilding|Avid|Assembly|Branding|UX|Microsoft Word|Analytics|Windows|Photography|Sourcing|Business development|E-commerce|Word processing|Keyword research|Database management|Figma|Web development|Direct sales|UI|Spark|VersionOne|Google Search Console|Word processing|Sales|Content marketing|Event marketing|HubSpot|Apple Keynote|Performance marketing|Sharepoint|Prezi|Final Cut Pro|Adobe After Effects|Graphic design|Adobe Premiere|Marketing|Budgeting|Analysis skills|Marketing automation|Google AdWords|Digital marketing|Editing|Direct marketing|Adobe Photoshop|CSS|Google Ads|AdRoll|Content development|Content creation|Adobe Illustrator|Digital Design|Python|SQL|Microsoft Excel|Tableau|Google Analytics|WordPress|Microsoft Office|Adobe Creative Suite|SEO|SaaS|Hootsuite|Google Docs|Microsoft Powerpoint|Software troubleshooting|CRM software|Adobe InDesign|Jira|Adobe Experience Manager|Photo manipulation|Data analytics|Automotive diagnostics|Mobile applications|PPC Campaign Management|Content management systems|Adobe Analytics|Hybris|Omniture|Salesforce|Kanban|Microsoft Project|Information architecture|Back-end development|Data analysis skills|Attribution modeling|Operating systems|Responsive web design|LMS|UI design")

In [142]:
df['technical skills']=df["skills"].where(technical,'')

In [143]:
df=df.reset_index(drop=True)

In [144]:
df

Unnamed: 0,job title,skills,min salary,max salary,avg salary,soft skills,business skills,technical skills
0,Media Specialist,PMP,47000,65000,56000.0,,PMP,
1,Media Specialist,Legislation,47000,65000,56000.0,,Legislation,
2,Media Specialist,Assembly,47000,65000,56000.0,,,Assembly
3,Media Specialist,Adobe Illustrator,47000,65000,56000.0,,,Adobe Illustrator
4,Media Specialist,Digital design,47000,65000,56000.0,,,
5,Media Specialist,,47000,65000,56000.0,,,
6,Communications Specialist,Writing skills,72000,94000,83000.0,Writing skills,,
7,Communications Specialist,Project management,72000,94000,83000.0,,Project management,
8,Communications Specialist,Research,72000,94000,83000.0,Research,,
9,Communications Specialist,Content creation,72000,94000,83000.0,,,Content creation


In [145]:
df.to_csv('CleanedDigitalMedia.csv')