In [206]:
import pandas as pd
import numpy as np
import re
from datetime import timedelta, date

In [207]:
# df = pd.read_csv('Data SceintistGlassdoor_posts(2023-12-22).csv')
df = pd.read_csv('Machine Learning EngineerGlassdoor_posts(2023-12-23).csv')
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 734 entries, 0 to 733
Data columns (total 13 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   job_title         734 non-null    object 
 1   company_name      734 non-null    object 
 2   location          734 non-null    object 
 3   days_posted       734 non-null    object 
 4   salary_estimate   618 non-null    object 
 5   job_description   734 non-null    object 
 6   company_rating    734 non-null    float64
 7   company_size      734 non-null    object 
 8   company_founded   734 non-null    object 
 9   company_type      734 non-null    object 
 10  company_industry  734 non-null    object 
 11  company_sector    734 non-null    object 
 12  company_revenue   734 non-null    object 
dtypes: float64(1), object(12)
memory usage: 74.7+ KB


In [208]:
def convert_days_posted_to_date(days_posted: str) -> date:
    today = date.today()

    if days_posted == "24h":
        return today
    elif days_posted.endswith("d"):
        days = int(days_posted[:-1])  # Remove the "d" and convert to int
        return today - timedelta(days=days)
    elif days_posted == "30d+":
        return today - timedelta(days=30)
    else:
        return None  # Return None or raise an exception if the format is unexpected

In [209]:
def salary(df):
    # Extract start and end salaries
    df[['start', 'end']] = df['salary_estimate'].str.extract(r'\$(\d+K) - \$(\d+K)')

    # Remove the 'K' and convert to integer
    df['start'] = df['start'].str.replace('K', '').astype(float) * 1000   
    df['end'] = df['end'].str.replace('K', '').astype(float) * 1000

    # Calculate average
    df['average'] = df[['start', 'end']].mean(axis=1)

    return df

In [210]:
def process_salary(salary):
    if 'Per Hour' in salary:
        salary = salary.replace('Per Hour', '').replace('$', '').replace('K', '').replace('(Employer est.)', '').strip()
        salary = salary.split('-')
        salary = [float(s) * 2000 if '.' in s else float(s) * 1000 for s in salary]
    else:
        salary = salary.replace('$', '').replace('K', '').replace('(Employer est.)', '').strip()
        salary = salary.split('-')
        salary = [float(s) * 1000 for s in salary]
    
    if len(salary) == 1:
        salary = salary * 2

    average = np.mean(salary)
    return salary[0], salary[1], average

In [211]:
df = df.assign(
    company_size = lambda x: x['company_size'].str.replace('Size', ''),
    company_founded = lambda x: pd.to_numeric(x['company_founded'].str.replace('Founded', ''), errors='coerce').fillna(0).astype(int),
    company_type = lambda x: x['company_type'].str.replace('Type', ''),
    company_industry = lambda x: x['company_industry'].str.replace('Industry', ''),
    company_sector = lambda x: x['company_sector'].str.replace('Sector', ''),
    company_revenue = lambda x: x['company_revenue'].str.replace('Revenue', ''),
    post_date = lambda x: pd.to_datetime(x['days_posted'].apply(convert_days_posted_to_date)),
    age = lambda x:  date.today().year - x['company_founded'],
)
df = salary(df)
df.head()


Unnamed: 0,job_title,company_name,location,days_posted,salary_estimate,job_description,company_rating,company_size,company_founded,company_type,company_industry,company_sector,company_revenue,post_date,age,start,end,average
0,Machine Learning Engineer Graduate (TikTok Sea...,TikTok,"San Jose, CA",30d+,$112K - $147K (Employer est.),Responsibilities\n\nAbout TikTok:\nTikTok is t...,3.4,1001 to 5000 Employees,2016,Company - Private,Internet & Web Services,Information Technology,Unknown / Non-Applicable,2023-11-26,7,112000.0,147000.0,129500.0
1,Artificial Learning Engineer,"Agilysys, Inc.","Bellevue, WA",30d+,$72K - $106K (Glassdoor est.),Agilysys provides industry-leading modern clou...,3.6,1001 to 5000 Employees,1963,Company - Public,Hotels & Resorts,Hotels & Travel Accommodation,$100 to $500 million (USD),2023-11-26,60,72000.0,106000.0,89000.0
2,Machine Learning Engineer,Microsoft,United States,1d,$134K - $257K (Employer est.),The Industry Solutions Engineering (ISE) team ...,4.3,10000+ Employees,1975,Company - Public,Computer Hardware Development,Information Technology,$10+ billion (USD),2023-12-25,48,134000.0,257000.0,195500.0
3,"Machine Learning Engineer, Motion Planning, Au...",Tesla,"Palo Alto, CA",30d+,,What to Expect\nTesla is looking for strong Ma...,3.6,10000+ Employees,2003,Company - Public,Transportation Equipment Manufacturing,Manufacturing,$1 to $5 billion (USD),2023-11-26,20,,,
4,Machine Learning Engineer,ABBVIE,"Crystal Lake, IL",24h,$84K - $117K (Glassdoor est.),Are you ready to shape the future of machine l...,3.9,10000+ Employees,2013,Company - Public,Biotech & Pharmaceuticals,Pharmaceutical & Biotechnology,$10+ billion (USD),2023-12-26,10,84000.0,117000.0,100500.0


In [212]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 734 entries, 0 to 733
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   job_title         734 non-null    object        
 1   company_name      734 non-null    object        
 2   location          734 non-null    object        
 3   days_posted       734 non-null    object        
 4   salary_estimate   618 non-null    object        
 5   job_description   734 non-null    object        
 6   company_rating    734 non-null    float64       
 7   company_size      734 non-null    object        
 8   company_founded   734 non-null    int32         
 9   company_type      734 non-null    object        
 10  company_industry  734 non-null    object        
 11  company_sector    734 non-null    object        
 12  company_revenue   734 non-null    object        
 13  post_date         734 non-null    datetime64[ns]
 14  age               734 non-

In [213]:
new_df = df[(df["salary_estimate"].notnull()) & (df["start"].isnull())][["salary_estimate", "start", "end", "average"]]
new_df.head()

Unnamed: 0,salary_estimate,start,end,average
9,$25.00 - $35.00 Per Hour (Employer est.),,,
33,$45K (Employer est.),,,
54,$70.00 - $100.00 Per Hour (Employer est.),,,
64,$130K (Employer est.),,,
65,$90K (Employer est.),,,


In [214]:
def process_salary(salary):
    if 'Per Hour' in salary:
        salary = salary.replace('Per Hour', '').replace('$', '').replace('K', '').replace('(Employer est.)', '').strip()
        salary = salary.split('-')
        salary = [float(s) * 2000 if '.' in s else float(s) * 1000 for s in salary]
    else:
        salary = salary.replace('$', '').replace('K', '').replace('(Employer est.)', '').strip()
        salary = salary.split('-')
        salary = [float(s) * 1000 for s in salary]
    
    if len(salary) == 1:
        salary = salary * 2

    average = np.mean(salary)
    return salary[0], salary[1], average

new_df['start'], new_df['end'], new_df['average'] = zip(*new_df['salary_estimate'].map(process_salary))

In [215]:
df.loc[new_df.index, ["start","end", "average"]] = new_df[["start","end", "average"]]
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 734 entries, 0 to 733
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   job_title         734 non-null    object        
 1   company_name      734 non-null    object        
 2   location          734 non-null    object        
 3   days_posted       734 non-null    object        
 4   salary_estimate   618 non-null    object        
 5   job_description   734 non-null    object        
 6   company_rating    734 non-null    float64       
 7   company_size      734 non-null    object        
 8   company_founded   734 non-null    int32         
 9   company_type      734 non-null    object        
 10  company_industry  734 non-null    object        
 11  company_sector    734 non-null    object        
 12  company_revenue   734 non-null    object        
 13  post_date         734 non-null    datetime64[ns]
 14  age               734 non-

In [216]:
# List of technologies you want to check for
technologies = ['Python', 'R', 'PyTorch', 'TensorFlow', 'SQL', 'Java', 'Scala', 'C\\+\\+', 'Hadoop', 'Spark', 'Tableau', 'Power BI', "mongo","Scikit-Learn", "AWS", "Azure", "GCP", "BASH", "SAP", "Git", "Docker", "Keras", "excel", "snowflake"]

# Add columns for each technology in the DataFrame
for tech in technologies:
    df[tech] = df['job_description'].str.contains(tech, case=False, regex=True).astype(int)

In [217]:
df.head()

Unnamed: 0,job_title,company_name,location,days_posted,salary_estimate,job_description,company_rating,company_size,company_founded,company_type,...,AWS,Azure,GCP,BASH,SAP,Git,Docker,Keras,excel,snowflake
0,Machine Learning Engineer Graduate (TikTok Sea...,TikTok,"San Jose, CA",30d+,$112K - $147K (Employer est.),Responsibilities\n\nAbout TikTok:\nTikTok is t...,3.4,1001 to 5000 Employees,2016,Company - Private,...,1,0,0,0,0,0,0,0,0,0
1,Artificial Learning Engineer,"Agilysys, Inc.","Bellevue, WA",30d+,$72K - $106K (Glassdoor est.),Agilysys provides industry-leading modern clou...,3.6,1001 to 5000 Employees,1963,Company - Public,...,1,1,0,0,0,1,0,1,1,0
2,Machine Learning Engineer,Microsoft,United States,1d,$134K - $257K (Employer est.),The Industry Solutions Engineering (ISE) team ...,4.3,10000+ Employees,1975,Company - Public,...,1,1,0,0,0,0,0,0,1,0
3,"Machine Learning Engineer, Motion Planning, Au...",Tesla,"Palo Alto, CA",30d+,,What to Expect\nTesla is looking for strong Ma...,3.6,10000+ Employees,2003,Company - Public,...,0,0,0,0,0,0,0,0,0,0
4,Machine Learning Engineer,ABBVIE,"Crystal Lake, IL",24h,$84K - $117K (Glassdoor est.),Are you ready to shape the future of machine l...,3.9,10000+ Employees,2013,Company - Public,...,1,0,0,0,1,1,1,0,1,0


In [218]:
# Define keywords for each category
data_science_keywords = ['Data Scientist', 'Data Science', 'DS', 'Scientist']
data_analytics_keywords = ['Data Analyst', 'Data Analytics', 'DA', "Business Analyst"]
machine_learning_keywords = ['Machine Learning', 'ML', 'AI Engineer']

# Function to categorize job titles
def categorize_job_title(title):
    title = title.lower()
    if any(keyword.lower() in title for keyword in data_science_keywords):
        return 'Data Science'
    elif any(keyword.lower() in title for keyword in data_analytics_keywords):
        return 'Data Analytics'
    elif any(keyword.lower() in title for keyword in machine_learning_keywords):
        return 'Machine Learning Engineer'
    else:
        return 'Other'

# Create a new column 'job_category'
df['job_category'] = df['job_title'].apply(categorize_job_title)

df.job_category.value_counts()


job_category
Machine Learning Engineer    378
Other                        225
Data Analytics                77
Data Science                  54
Name: count, dtype: int64

In [195]:
df.iloc[:, :18].head()

Unnamed: 0,job_title,company_name,location,days_posted,salary_estimate,job_description,company_rating,company_size,company_founded,company_type,company_industry,company_sector,company_revenue,post_date,age,start,end,average
0,Machine Learning Engineer Graduate (TikTok Sea...,TikTok,"San Jose, CA",30d+,$112K - $147K (Employer est.),Responsibilities\n\nAbout TikTok:\nTikTok is t...,3.4,1001 to 5000 Employees,2016,Company - Private,Internet & Web Services,Information Technology,Unknown / Non-Applicable,2023-11-26,7,112000.0,147000.0,129500.0
1,Artificial Learning Engineer,"Agilysys, Inc.","Bellevue, WA",30d+,$72K - $106K (Glassdoor est.),Agilysys provides industry-leading modern clou...,3.6,1001 to 5000 Employees,1963,Company - Public,Hotels & Resorts,Hotels & Travel Accommodation,$100 to $500 million (USD),2023-11-26,60,72000.0,106000.0,89000.0
2,Machine Learning Engineer,Microsoft,United States,1d,$134K - $257K (Employer est.),The Industry Solutions Engineering (ISE) team ...,4.3,10000+ Employees,1975,Company - Public,Computer Hardware Development,Information Technology,$10+ billion (USD),2023-12-25,48,134000.0,257000.0,195500.0
3,"Machine Learning Engineer, Motion Planning, Au...",Tesla,"Palo Alto, CA",30d+,,What to Expect\nTesla is looking for strong Ma...,3.6,10000+ Employees,2003,Company - Public,Transportation Equipment Manufacturing,Manufacturing,$1 to $5 billion (USD),2023-11-26,20,,,
4,Machine Learning Engineer,ABBVIE,"Crystal Lake, IL",24h,$84K - $117K (Glassdoor est.),Are you ready to shape the future of machine l...,3.9,10000+ Employees,2013,Company - Public,Biotech & Pharmaceuticals,Pharmaceutical & Biotechnology,$10+ billion (USD),2023-12-26,10,84000.0,117000.0,100500.0


In [196]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 734 entries, 0 to 733
Data columns (total 43 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   job_title         734 non-null    object        
 1   company_name      734 non-null    object        
 2   location          734 non-null    object        
 3   days_posted       734 non-null    object        
 4   salary_estimate   618 non-null    object        
 5   job_description   734 non-null    object        
 6   company_rating    734 non-null    float64       
 7   company_size      734 non-null    object        
 8   company_founded   734 non-null    int32         
 9   company_type      734 non-null    object        
 10  company_industry  734 non-null    object        
 11  company_sector    734 non-null    object        
 12  company_revenue   734 non-null    object        
 13  post_date         734 non-null    datetime64[ns]
 14  age               734 non-

In [219]:
def pre_processing(df):
    df = df.assign(
    company_size = lambda x: x['company_size'].str.replace('Size', ''),
    company_founded = lambda x: pd.to_numeric(x['company_founded'].str.replace('Founded', ''), errors='coerce').fillna(0).astype(int),
    company_type = lambda x: x['company_type'].str.replace('Type', ''),
    company_industry = lambda x: x['company_industry'].str.replace('Industry', ''),
    company_sector = lambda x: x['company_sector'].str.replace('Sector', ''),
    company_revenue = lambda x: x['company_revenue'].str.replace('Revenue', ''),
    post_date = lambda x: pd.to_datetime(x['days_posted'].apply(convert_days_posted_to_date)),
    age = lambda x:  date.today().year - x['company_founded'],
    )
    df = salary(df)
    new_df = df[(df["salary_estimate"].notnull()) & (df["start"].isnull())][["salary_estimate", "start", "end", "average"]]
    new_df['start'], new_df['end'], new_df['average'] = zip(*new_df['salary_estimate'].map(process_salary))
    df.loc[new_df.index, ["start","end", "average"]] = new_df[["start","end", "average"]]
    # List of technologies you want to check for
    technologies = ['Python', '\\bR\\b', 'PyTorch', 'TensorFlow', 'SQL', 'Java', 'Scala', 'C\\+\\+', 'Hadoop', 'Spark', 'Tableau', 'Power BI', "mongo","Scikit-Learn", "AWS", "Azure", "GCP", "BASH", "SAP", "Git", "Docker", "Keras", "excel", "snowflake"]

    # Corresponding column names
    column_names = ['Python', 'R', 'PyTorch', 'TensorFlow', 'SQL', 'Java', 'Scala', 'C++', 'Hadoop', 'Spark', 'Tableau', 'Power BI', "mongo","Scikit-Learn", "AWS", "Azure", "GCP", "BASH", "SAP", "Git", "Docker", "Keras", "excel", "snowflake"]

    # Add columns for each technology in the DataFrame
    for tech, col_name in zip(technologies, column_names):
        df[col_name] = df['job_description'].apply(lambda x: int(bool(re.search(tech, x, re.IGNORECASE))))
     # Define keywords for each category
    data_science_keywords = ['Data Scientist', 'Data Science', 'DS', 'Scientist']
    data_analytics_keywords = ['Data Analyst', 'Data Analytics', 'DA']
    machine_learning_keywords = ['Machine Learning', 'ML', 'AI Engineer']

    # Function to categorize job titles
    def categorize_job_title(title):
        title = title.lower()
        if any(keyword.lower() in title for keyword in data_science_keywords):
            return 'Data Science'
        elif any(keyword.lower() in title for keyword in data_analytics_keywords):
            return 'Data Analytics'
        elif any(keyword.lower() in title for keyword in machine_learning_keywords):
            return 'Machine Learning Engineer'
        else:
            return 'Other'

    # Create a new column 'job_category'
    df['job_category'] = df['job_title'].apply(categorize_job_title)
    return df

In [220]:
df = pd.read_csv('Data SceintistGlassdoor_posts(2023-12-22).csv')
df2 = pd.read_csv('Data AnalystGlassdoor_posts(2023-12-23).csv')
df3 = pd.read_csv('Machine Learning EngineerGlassdoor_posts(2023-12-23).csv')
df = pre_processing(df)
df2 = pre_processing(df2)
df3 = pre_processing(df3)

In [199]:
df3[df3.job_category=="Other"].job_title.value_counts()

job_title
Software Engineer                                                                                                  11
Senior Software Engineer                                                                                            3
AI Frameworks Engineer                                                                                              2
Customer Support Engineer                                                                                           2
Amazon Robotics - Business Intelligence Engineer (BIE) Co-op (Learning Experience and Technology) - Spring 2024     2
                                                                                                                   ..
Robotics Research Engineer                                                                                          1
Engineer I-Marketing Applications                                                                                   1
Bioinformatics Engineer                       

In [221]:
df3.drop(df3[df3.job_category=="Other"].index, inplace=True)

In [222]:
df.iloc[:, :18].head()

Unnamed: 0,job_title,company_name,location,days_posted,salary_estimate,job_description,company_rating,company_size,company_founded,company_type,company_industry,company_sector,company_revenue,post_date,age,start,end,average
0,Data Scientist - Finance,"Gametime United, Inc.",Remote,6d,$173K - $205K (Employer est.),About Us:\nLive experiences help people cross ...,4.5,51 to 200 Employees,2013,Company - Private,Ticket Sales,"Arts, Entertainment & Recreation",$25 to $100 million (USD),2023-12-20,10,173000.0,205000.0,189000.0
1,Data Scientist 1,MidAmerican Energy,"Des Moines, IA",10d,$74K - $87K (Employer est.),The Data Scientist I works on data and analyti...,4.1,1001 to 5000 Employees,2000,Subsidiary or Business Segment,Energy & Utilities,"Energy, Mining & Utilities",$1 to $5 billion (USD),2023-12-16,23,74000.0,87000.0,80500.0
2,Junior Data Scientist,"BearingPoint Consulting, Inc. USA","Chicago, IL",24h,$80K - $117K (Glassdoor est.),As a Junior Data Scientist at BearingPoint you...,4.1,51 to 200 Employees,0,Company - Private,--,--,Unknown / Non-Applicable,2023-12-26,2023,80000.0,117000.0,98500.0
3,Data Scientist,ClosedLoop,United States,24h,,About ClosedLoop:\nClosedLoop.ai is healthcare...,4.8,51 to 200 Employees,2017,Company - Private,Enterprise Software & Network Solutions,Information Technology,Unknown / Non-Applicable,2023-12-26,6,,,
4,Data Scientist,BlackLine,Remote,7d,$114K - $176K (Employer est.),Make Your Mark::\nBlackline is seeking a dynam...,3.3,1001 to 5000 Employees,2001,Company - Public,Software Development,Information Technology,$500 million to $1 billion (USD),2023-12-19,22,114000.0,176000.0,145000.0


In [223]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 811 entries, 0 to 810
Data columns (total 43 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   job_title         811 non-null    object        
 1   company_name      811 non-null    object        
 2   location          811 non-null    object        
 3   days_posted       811 non-null    object        
 4   salary_estimate   661 non-null    object        
 5   job_description   811 non-null    object        
 6   company_rating    811 non-null    float64       
 7   company_size      811 non-null    object        
 8   company_founded   811 non-null    int32         
 9   company_type      811 non-null    object        
 10  company_industry  811 non-null    object        
 11  company_sector    811 non-null    object        
 12  company_revenue   811 non-null    object        
 13  post_date         811 non-null    datetime64[ns]
 14  age               811 non-

In [156]:
df[df.salary_estimate.isnull()]

Unnamed: 0,job_title,company_name,location,days_posted,salary_estimate,job_description,company_rating,company_size,company_founded,company_type,...,Azure,GCP,BASH,SAP,Git,Docker,Keras,excel,snowflake,job_category
3,Data Scientist,ClosedLoop,United States,24h,,About ClosedLoop:\nClosedLoop.ai is healthcare...,4.8,51 to 200 Employees,2017,Company - Private,...,0,0,0,0,0,0,0,1,0,Data Science
5,Data Scientist,SourceFuse,Remote,9d,,Job Information:\nWork Experience: 6+ years\nI...,4.1,201 to 500 Employees,2006,Company - Private,...,0,0,0,0,0,1,0,0,0,Data Science
9,Data Scientist - Batteries,Apple,"Cupertino, CA",6d,,"Summary\n\nPosted: Dec 14, 2023\n\nRole Number...",4.2,10000+ Employees,1976,Company - Public,...,0,0,0,0,0,0,0,0,0,Data Science
13,Geospatial Data Scientist,Saint Louis University,United States,30d+,,Who is Saint Louis University? Founded in 1818...,3.9,5001 to 10000 Employees,1818,College / University,...,1,0,0,0,1,0,0,0,0,Data Science
16,Junior Data Annotation Specialist,AllStars-IT,Georgia,30d+,,Junior Data Annotation Specialist\nLevel\nJuni...,4.9,501 to 1000 Employees,2004,Unknown,...,0,0,0,0,0,0,0,0,0,Data Analytics
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
763,Machine Learning Scientist (New Health Care Co...,3M,United States,30d+,,Job Description:\nMachine Learning Scientist (...,3.7,10000+ Employees,1902,Company - Public,...,1,0,0,0,0,0,0,1,0,Data Science
764,Data Scientist,NYC Health + Hospitals,"New York, NY",30d+,,About NYC Health + Hospitals\n\nMetroPlusHealt...,3.5,5001 to 10000 Employees,0,Hospital,...,1,0,0,0,0,0,0,1,0,Data Science
774,Data Scientist,Frankenmuth Insurance Company,Michigan,30d+,,Summary: Under direct supervision and followin...,4.4,501 to 1000 Employees,1868,Company - Private,...,0,0,0,0,0,0,0,0,0,Data Science
779,Vice President of Enterprise Data Analytics,WellSense Health Plan,Remote,2d,,It’s an exciting time to join the WellSense He...,3.2,501 to 1000 Employees,1997,Nonprofit Organization,...,0,0,0,0,0,0,0,1,0,Data Analytics


In [173]:
df2.iloc[:, :18].head()

Unnamed: 0,job_title,company_name,location,days_posted,salary_estimate,job_description,company_rating,company_size,company_founded,company_type,company_industry,company_sector,company_revenue,post_date,age,start,end,average
0,Junior Data Analyst Apprentice,Evergreen Trading,"New York, NY",22d,$45K (Employer est.),About Evergreen\nEvergreen Trading is a media ...,3.4,Unknown,0,Company - Private,Investment & Asset Management,Financial Services,Unknown / Non-Applicable,2023-12-04,2023,45000.0,45000.0,45000.0
1,Data Analyst,Viant,"Irvine, CA",5d,$57K - $70K (Employer est.),WHAT YOU’LL DO\n\nCome help us build Viant’s i...,4.0,501 to 1000 Employees,1999,Company - Public,Advertising & Public Relations,Media & Communication,Unknown / Non-Applicable,2023-12-21,24,57000.0,70000.0,63500.0
2,Sportsbook Data Analyst,DraftKings,"Boston, MA",24h,$77K - $115K (Employer est.),We’re defining what it means to build and deli...,4.0,1001 to 5000 Employees,2012,Company - Public,Internet & Web Services,Information Technology,Unknown / Non-Applicable,2023-12-26,11,77000.0,115000.0,96000.0
3,Junior Data Analyst,391 Financial,"Columbia, MO",30d+,$46K - $75K (Glassdoor est.),Junior Data Analyst Responsibilities:\nInterpr...,3.0,1 to 50 Employees,0,Contract,--,--,Unknown / Non-Applicable,2023-11-26,2023,46000.0,75000.0,60500.0
4,Junior Data Analyst,"Paul, Weiss, Rifkind, Wharton & Garrison LLP",United States,24h,$70K - $85K (Employer est.),Job Description\nJunior Data Analyst will be r...,4.0,1001 to 5000 Employees,1946,Private Practice / Firm,Legal,Legal,$100 to $500 million (USD),2023-12-26,77,70000.0,85000.0,77500.0


In [174]:
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 820 entries, 0 to 819
Data columns (total 43 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   job_title         820 non-null    object        
 1   company_name      820 non-null    object        
 2   location          820 non-null    object        
 3   days_posted       820 non-null    object        
 4   salary_estimate   719 non-null    object        
 5   job_description   820 non-null    object        
 6   company_rating    820 non-null    float64       
 7   company_size      820 non-null    object        
 8   company_founded   820 non-null    int32         
 9   company_type      820 non-null    object        
 10  company_industry  820 non-null    object        
 11  company_sector    820 non-null    object        
 12  company_revenue   820 non-null    object        
 13  post_date         820 non-null    datetime64[ns]
 14  age               820 non-

In [175]:
df3.iloc[:, :18].head()

Unnamed: 0,job_title,company_name,location,days_posted,salary_estimate,job_description,company_rating,company_size,company_founded,company_type,company_industry,company_sector,company_revenue,post_date,age,start,end,average
0,Machine Learning Engineer Graduate (TikTok Sea...,TikTok,"San Jose, CA",30d+,$112K - $147K (Employer est.),Responsibilities\n\nAbout TikTok:\nTikTok is t...,3.4,1001 to 5000 Employees,2016,Company - Private,Internet & Web Services,Information Technology,Unknown / Non-Applicable,2023-11-26,7,112000.0,147000.0,129500.0
2,Machine Learning Engineer,Microsoft,United States,1d,$134K - $257K (Employer est.),The Industry Solutions Engineering (ISE) team ...,4.3,10000+ Employees,1975,Company - Public,Computer Hardware Development,Information Technology,$10+ billion (USD),2023-12-25,48,134000.0,257000.0,195500.0
3,"Machine Learning Engineer, Motion Planning, Au...",Tesla,"Palo Alto, CA",30d+,,What to Expect\nTesla is looking for strong Ma...,3.6,10000+ Employees,2003,Company - Public,Transportation Equipment Manufacturing,Manufacturing,$1 to $5 billion (USD),2023-11-26,20,,,
4,Machine Learning Engineer,ABBVIE,"Crystal Lake, IL",24h,$84K - $117K (Glassdoor est.),Are you ready to shape the future of machine l...,3.9,10000+ Employees,2013,Company - Public,Biotech & Pharmaceuticals,Pharmaceutical & Biotechnology,$10+ billion (USD),2023-12-26,10,84000.0,117000.0,100500.0
5,Computer Vision and Machine Learning Engineer,Apple,"Cupertino, CA",15d,,"Summary\n\nPosted: Nov 6, 2023\n\nWeekly Hours...",4.2,10000+ Employees,1976,Company - Public,Computer Hardware Development,Information Technology,$10+ billion (USD),2023-12-11,47,,,


In [176]:
df3.info()

<class 'pandas.core.frame.DataFrame'>
Index: 509 entries, 0 to 732
Data columns (total 43 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   job_title         509 non-null    object        
 1   company_name      509 non-null    object        
 2   location          509 non-null    object        
 3   days_posted       509 non-null    object        
 4   salary_estimate   420 non-null    object        
 5   job_description   509 non-null    object        
 6   company_rating    509 non-null    float64       
 7   company_size      509 non-null    object        
 8   company_founded   509 non-null    int32         
 9   company_type      509 non-null    object        
 10  company_industry  509 non-null    object        
 11  company_sector    509 non-null    object        
 12  company_revenue   509 non-null    object        
 13  post_date         509 non-null    datetime64[ns]
 14  age               509 non-null 

In [224]:
all_df = pd.concat([df, df2, df3], ignore_index=True)
all_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2140 entries, 0 to 2139
Data columns (total 43 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   job_title         2140 non-null   object        
 1   company_name      2140 non-null   object        
 2   location          2140 non-null   object        
 3   days_posted       2140 non-null   object        
 4   salary_estimate   1800 non-null   object        
 5   job_description   2140 non-null   object        
 6   company_rating    2140 non-null   float64       
 7   company_size      2140 non-null   object        
 8   company_founded   2140 non-null   int32         
 9   company_type      2140 non-null   object        
 10  company_industry  2140 non-null   object        
 11  company_sector    2140 non-null   object        
 12  company_revenue   2140 non-null   object        
 13  post_date         2140 non-null   datetime64[ns]
 14  age               2140 n

In [225]:
all_df.duplicated().sum()

23

In [226]:
duplicated_rows = all_df[all_df.duplicated()]
duplicated_rows.head(2)

Unnamed: 0,job_title,company_name,location,days_posted,salary_estimate,job_description,company_rating,company_size,company_founded,company_type,...,Azure,GCP,BASH,SAP,Git,Docker,Keras,excel,snowflake,job_category
430,Data Scientist,General Atomics and Affiliated Companies,"Poway, CA",24h,$98K - $171K (Employer est.),"General Atomics Aeronautical Systems, Inc. (GA...",3.6,10000+ Employees,1955,Company - Private,...,0,0,0,0,0,0,0,0,0,Data Science
539,Data Scientist,General Atomics and Affiliated Companies,"Poway, CA",24h,$98K - $171K (Employer est.),"General Atomics Aeronautical Systems, Inc. (GA...",3.6,10000+ Employees,1955,Company - Private,...,0,0,0,0,0,0,0,0,0,Data Science


In [229]:
all_df = all_df.drop_duplicates(subset=['job_title', 'company_name', 'location', 'salary_estimate', 'job_description', "company_type"], keep='first')
all_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2092 entries, 0 to 2139
Data columns (total 43 columns):
 #   Column            Non-Null Count  Dtype         
---  ------            --------------  -----         
 0   job_title         2092 non-null   object        
 1   company_name      2092 non-null   object        
 2   location          2092 non-null   object        
 3   days_posted       2092 non-null   object        
 4   salary_estimate   1760 non-null   object        
 5   job_description   2092 non-null   object        
 6   company_rating    2092 non-null   float64       
 7   company_size      2092 non-null   object        
 8   company_founded   2092 non-null   int32         
 9   company_type      2092 non-null   object        
 10  company_industry  2092 non-null   object        
 11  company_sector    2092 non-null   object        
 12  company_revenue   2092 non-null   object        
 13  post_date         2092 non-null   datetime64[ns]
 14  age               2092 non-nu

In [230]:
all_df.to_csv('datasets/all_posts.csv', index=False)