In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Set visualization style (optional)
sns.set(style="whitegrid")

In [3]:
# Load the dataset
df = pd.read_csv("/content/Uncleaned_DS_jobs.csv")

In [4]:
df.head()

Unnamed: 0,index,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors
0,0,Sr Data Scientist,$137K-$171K (Glassdoor est.),Description\n\nThe Senior Data Scientist is re...,3.1,Healthfirst\n3.1,"New York, NY","New York, NY",1001 to 5000 employees,1993,Nonprofit Organization,Insurance Carriers,Insurance,Unknown / Non-Applicable,"EmblemHealth, UnitedHealth Group, Aetna"
1,1,Data Scientist,$137K-$171K (Glassdoor est.),"Secure our Nation, Ignite your Future\n\nJoin ...",4.2,ManTech\n4.2,"Chantilly, VA","Herndon, VA",5001 to 10000 employees,1968,Company - Public,Research & Development,Business Services,$1 to $2 billion (USD),-1
2,2,Data Scientist,$137K-$171K (Glassdoor est.),Overview\n\n\nAnalysis Group is one of the lar...,3.8,Analysis Group\n3.8,"Boston, MA","Boston, MA",1001 to 5000 employees,1981,Private Practice / Firm,Consulting,Business Services,$100 to $500 million (USD),-1
3,3,Data Scientist,$137K-$171K (Glassdoor est.),JOB DESCRIPTION:\n\nDo you have a passion for ...,3.5,INFICON\n3.5,"Newton, MA","Bad Ragaz, Switzerland",501 to 1000 employees,2000,Company - Public,Electrical & Electronic Manufacturing,Manufacturing,$100 to $500 million (USD),"MKS Instruments, Pfeiffer Vacuum, Agilent Tech..."
4,4,Data Scientist,$137K-$171K (Glassdoor est.),Data Scientist\nAffinity Solutions / Marketing...,2.9,Affinity Solutions\n2.9,"New York, NY","New York, NY",51 to 200 employees,1998,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,"Commerce Signals, Cardlytics, Yodlee"


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 672 entries, 0 to 671
Data columns (total 15 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   index              672 non-null    int64  
 1   Job Title          672 non-null    object 
 2   Salary Estimate    672 non-null    object 
 3   Job Description    672 non-null    object 
 4   Rating             672 non-null    float64
 5   Company Name       672 non-null    object 
 6   Location           672 non-null    object 
 7   Headquarters       672 non-null    object 
 8   Size               672 non-null    object 
 9   Founded            672 non-null    int64  
 10  Type of ownership  672 non-null    object 
 11  Industry           672 non-null    object 
 12  Sector             672 non-null    object 
 13  Revenue            672 non-null    object 
 14  Competitors        672 non-null    object 
dtypes: float64(1), int64(2), object(12)
memory usage: 78.9+ KB


In [6]:
# Drop the index column if it exists
if 'index' in df.columns:
    df.drop(columns=['index'], inplace=True)

# Verify
print("Columns after dropping index:")
print(df.columns.tolist())

Columns after dropping index:
['Job Title', 'Salary Estimate', 'Job Description', 'Rating', 'Company Name', 'Location', 'Headquarters', 'Size', 'Founded', 'Type of ownership', 'Industry', 'Sector', 'Revenue', 'Competitors']


In [7]:
# Function to split company name and rating
def split_company_rating(company_str):
    if pd.isna(company_str):
        return pd.Series([np.nan, np.nan])
    try:
        name = company_str.split('(')[0].strip()
        rating = company_str.split('(')[1].replace(')', '').strip() if '(' in company_str else np.nan
        return pd.Series([name, rating])
    except:
        return pd.Series([company_str, np.nan])

# Apply the function and create new columns
df[['Company Name', 'Rating']] = df['Company Name'].apply(split_company_rating)

# Convert Rating to numeric
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')

# Verify
print("First 5 rows after splitting Company Name and Rating:")
print(df[['Company Name', 'Rating']].head())

First 5 rows after splitting Company Name and Rating:
              Company Name  Rating
0         Healthfirst\n3.1     NaN
1             ManTech\n4.2     NaN
2      Analysis Group\n3.8     NaN
3             INFICON\n3.5     NaN
4  Affinity Solutions\n2.9     NaN


In [9]:
# Display unique values or first 10 rows of Company Name
print("Sample of Company Name column:")
print(df['Company Name'].head(10))
print("\nUnique values in Company Name (first 10):")
print(df['Company Name'].unique()[:10])

Sample of Company Name column:
0           Healthfirst\n3.1
1               ManTech\n4.2
2        Analysis Group\n3.8
3               INFICON\n3.5
4    Affinity Solutions\n2.9
5           HG Insights\n4.2
6              Novartis\n3.9
7                iRobot\n3.5
8         Intuit - Data\n4.4
9    XSELL Technologies\n3.6
Name: Company Name, dtype: object

Unique values in Company Name (first 10):
['Healthfirst\n3.1' 'ManTech\n4.2' 'Analysis Group\n3.8' 'INFICON\n3.5'
 'Affinity Solutions\n2.9' 'HG Insights\n4.2' 'Novartis\n3.9'
 'iRobot\n3.5' 'Intuit - Data\n4.4' 'XSELL Technologies\n3.6']


In [10]:
# Function to split company name and rating based on newline
def split_company_rating_new(company_str):
    if pd.isna(company_str):
        return pd.Series([np.nan, np.nan])
    try:
        # Split by newline and strip whitespace
        parts = str(company_str).split('\n')
        name = parts[0].strip() if len(parts) > 0 else company_str
        rating = parts[1].strip() if len(parts) > 1 and parts[1].replace('.', '').isdigit() else np.nan
        return pd.Series([name, rating])
    except:
        return pd.Series([company_str, np.nan])

# Apply the function and create new columns
df[['Company Name', 'Rating']] = df['Company Name'].apply(split_company_rating_new)

# Convert Rating to numeric
df['Rating'] = pd.to_numeric(df['Rating'], errors='coerce')

# Verify
print("First 5 rows after splitting Company Name and Rating:")
print(df[['Company Name', 'Rating']].head())

First 5 rows after splitting Company Name and Rating:
         Company Name  Rating
0         Healthfirst     3.1
1             ManTech     4.2
2      Analysis Group     3.8
3             INFICON     3.5
4  Affinity Solutions     2.9


In [16]:
# Function to clean salary estimate
def clean_salary(salary_str):
    if pd.isna(salary_str):
        return pd.Series([np.nan, np.nan])
    # Remove 'Glassdoor est.' and other text, keep only numbers and ranges
    salary_str = str(salary_str).replace('Glassdoor est.', '').replace('$', '').replace('K', '000').strip()
    try:
        if '-' in salary_str:
            min_sal, max_sal = salary_str.split('-')
            min_sal = float(min_sal.replace(',', ''))  # Remove any commas in the number
            max_sal = float(max_sal.replace(',', ''))  # Remove any commas in the number
            return pd.Series([min_sal, max_sal])
        else:
            sal = float(salary_str.replace(',', ''))
            return pd.Series([sal, sal])
    except:
        return pd.Series([np.nan, np.nan])

# Apply the function and create new columns
df[['Min Salary', 'Max Salary']] = df['Salary Estimate'].apply(clean_salary)

# Verify
print("First 5 rows after cleaning Salary Estimate:")
print(df[['Salary Estimate', 'Min Salary', 'Max Salary']].head())

First 5 rows after cleaning Salary Estimate:
                Salary Estimate  Min Salary  Max Salary
0  $137K-$171K (Glassdoor est.)         NaN         NaN
1  $137K-$171K (Glassdoor est.)         NaN         NaN
2  $137K-$171K (Glassdoor est.)         NaN         NaN
3  $137K-$171K (Glassdoor est.)         NaN         NaN
4  $137K-$171K (Glassdoor est.)         NaN         NaN


In [17]:
# Inspect unique values in Salary Estimate
print("Unique values in Salary Estimate (first 10):")
print(df['Salary Estimate'].unique()[:10])

Unique values in Salary Estimate (first 10):
['$137K-$171K (Glassdoor est.)' '$75K-$131K (Glassdoor est.)'
 '$79K-$131K (Glassdoor est.)' '$99K-$132K (Glassdoor est.)'
 '$90K-$109K (Glassdoor est.)' '$101K-$165K (Glassdoor est.)'
 '$56K-$97K (Glassdoor est.)' '$79K-$106K (Glassdoor est.)'
 '$71K-$123K (Glassdoor est.)' '$90K-$124K (Glassdoor est.)']


In [18]:
# Function to clean salary estimate
def clean_salary(salary_str):
    if pd.isna(salary_str):
        return pd.Series([np.nan, np.nan])
    # Remove '(Glassdoor est.)' and other text, keep only the range
    salary_str = str(salary_str).replace('(Glassdoor est.)', '').replace('$', '').strip()
    try:
        if '-' in salary_str:
            min_sal, max_sal = salary_str.split('-')
            # Remove 'K' and convert to thousands, handle commas
            min_sal = float(min_sal.replace('K', '').replace(',', '')) * 1000
            max_sal = float(max_sal.replace('K', '').replace(',', '')) * 1000
            return pd.Series([min_sal, max_sal])
        else:
            sal = float(salary_str.replace('K', '').replace(',', '')) * 1000
            return pd.Series([sal, sal])
    except ValueError:
        return pd.Series([np.nan, np.nan])

# Apply the function and create new columns
df[['Min Salary', 'Max Salary']] = df['Salary Estimate'].apply(clean_salary)

# Verify
print("First 5 rows after cleaning Salary Estimate:")
print(df[['Salary Estimate', 'Min Salary', 'Max Salary']].head())

First 5 rows after cleaning Salary Estimate:
                Salary Estimate  Min Salary  Max Salary
0  $137K-$171K (Glassdoor est.)    137000.0    171000.0
1  $137K-$171K (Glassdoor est.)    137000.0    171000.0
2  $137K-$171K (Glassdoor est.)    137000.0    171000.0
3  $137K-$171K (Glassdoor est.)    137000.0    171000.0
4  $137K-$171K (Glassdoor est.)    137000.0    171000.0


In [19]:
# Check for missing values
print("Missing Values:")
print(df.isnull().sum())

Missing Values:
Job Title             0
Salary Estimate       0
Job Description       0
Rating               54
Company Name          0
Location              0
Headquarters          0
Size                  0
Founded               0
Type of ownership     0
Industry              0
Sector                0
Revenue               0
Competitors           0
Min Salary           20
Max Salary           20
dtype: int64


In [20]:
# Replace -1 with NaN in Competitors
df['Competitors'] = df['Competitors'].replace('-1', np.nan)

# Impute numerical columns (e.g., Rating) with median
for col in ['Rating', 'Min Salary', 'Max Salary']:
    if df[col].isnull().sum() > 0:
        df[col].fillna(df[col].median(), inplace=True)

# For categorical columns (e.g., Competitors), use 'Unknown' or drop
for col in ['Competitors']:
    if df[col].isnull().sum() > 0:
        df[col].fillna('Unknown', inplace=True)

# Verify
print("Missing Values After Imputation:")
print(df.isnull().sum())

Missing Values After Imputation:
Job Title            0
Salary Estimate      0
Job Description      0
Rating               0
Company Name         0
Location             0
Headquarters         0
Size                 0
Founded              0
Type of ownership    0
Industry             0
Sector               0
Revenue              0
Competitors          0
Min Salary           0
Max Salary           0
dtype: int64


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna('Unknown', inplace=True)


In [21]:
# Function to clean size (e.g., "1000-5000 employees" to "1000-5000")
def clean_size(size_str):
    if pd.isna(size_str):
        return np.nan
    return size_str.replace('employees', '').strip()

# Function to clean revenue (e.g., "10M-50M (USD)" to "10M-50M")
def clean_revenue(revenue_str):
    if pd.isna(revenue_str):
        return np.nan
    return revenue_str.replace('(USD)', '').strip()

# Apply cleaning
df['Size'] = df['Size'].apply(clean_size)
df['Revenue'] = df['Revenue'].apply(clean_revenue)

# Verify
print("Sample rows after cleaning Size and Revenue:")
print(df[['Size', 'Revenue']].head())

Sample rows after cleaning Size and Revenue:
            Size                   Revenue
0   1001 to 5000  Unknown / Non-Applicable
1  5001 to 10000          $1 to $2 billion
2   1001 to 5000      $100 to $500 million
3    501 to 1000      $100 to $500 million
4      51 to 200  Unknown / Non-Applicable


In [22]:
# Check for duplicate rows
print("Number of Duplicate Rows:", df.duplicated().sum())

# Drop duplicates if any
df.drop_duplicates(inplace=True)

# Verify
print("Number of Rows After Dropping Duplicates:", len(df))

Number of Duplicate Rows: 13
Number of Rows After Dropping Duplicates: 659


In [23]:
df.head()

Unnamed: 0,Job Title,Salary Estimate,Job Description,Rating,Company Name,Location,Headquarters,Size,Founded,Type of ownership,Industry,Sector,Revenue,Competitors,Min Salary,Max Salary
0,Sr Data Scientist,$137K-$171K (Glassdoor est.),Description\n\nThe Senior Data Scientist is re...,3.1,Healthfirst,"New York, NY","New York, NY",1001 to 5000,1993,Nonprofit Organization,Insurance Carriers,Insurance,Unknown / Non-Applicable,"EmblemHealth, UnitedHealth Group, Aetna",137000.0,171000.0
1,Data Scientist,$137K-$171K (Glassdoor est.),"Secure our Nation, Ignite your Future\n\nJoin ...",4.2,ManTech,"Chantilly, VA","Herndon, VA",5001 to 10000,1968,Company - Public,Research & Development,Business Services,$1 to $2 billion,Unknown,137000.0,171000.0
2,Data Scientist,$137K-$171K (Glassdoor est.),Overview\n\n\nAnalysis Group is one of the lar...,3.8,Analysis Group,"Boston, MA","Boston, MA",1001 to 5000,1981,Private Practice / Firm,Consulting,Business Services,$100 to $500 million,Unknown,137000.0,171000.0
3,Data Scientist,$137K-$171K (Glassdoor est.),JOB DESCRIPTION:\n\nDo you have a passion for ...,3.5,INFICON,"Newton, MA","Bad Ragaz, Switzerland",501 to 1000,2000,Company - Public,Electrical & Electronic Manufacturing,Manufacturing,$100 to $500 million,"MKS Instruments, Pfeiffer Vacuum, Agilent Tech...",137000.0,171000.0
4,Data Scientist,$137K-$171K (Glassdoor est.),Data Scientist\nAffinity Solutions / Marketing...,2.9,Affinity Solutions,"New York, NY","New York, NY",51 to 200,1998,Company - Private,Advertising & Marketing,Business Services,Unknown / Non-Applicable,"Commerce Signals, Cardlytics, Yodlee",137000.0,171000.0


In [24]:
df.to_csv("/content/Uncleaned_DS_jobs.csv", index=False)
print("Cleaned dataset saved as 'job_postings_cleaned.csv'")

Cleaned dataset saved as 'job_postings_cleaned.csv'
