Preprocessing Steps

In [2]:
import pandas as pd

In [3]:
# Load the job listings from jobs_data.csv and preview the first rows.
df = pd.read_csv("jobs_data.csv")
df.head()

Unnamed: 0,jobTitle,company,location,experience,salary,jobLink,skills
0,Data Analyst,Version 1,Bengaluru,0-3 Yrs,Not disclosed,https://www.naukri.com/job-listings-data-analy...,"Computer science, Data analysis, Analytical, A..."
1,Data Analyst,Wolters Kluwer - Medknow Publications,"Kalyani, Pune",3-5 Yrs,Not disclosed,https://www.naukri.com/job-listings-data-analy...,"Computer science, Backend, Data analysis, Data..."
2,Data Analyst,Indegene,Bengaluru,0-4 Yrs,Not disclosed,https://www.naukri.com/job-listings-data-analy...,"Automation, Data analysis, Business analytics,..."
3,Data Analyst,Klene Paks,Bengaluru(Bannerghatta Road),3-8 Yrs,3.75-6 Lacs PA,https://www.naukri.com/job-listings-data-analy...,"HLOOKUP, Pivot Table, VLOOKUP, Excel Macros, D..."
4,Team Member Data Analyst,Bajaj Allianz General Insurance,Pune,0-4 Yrs,Not disclosed,https://www.naukri.com/job-listings-team-membe...,"Automation, Data validation, Process efficienc..."


In [5]:
# Inspect the dtype pandas inferred for each column.
df.dtypes

jobTitle      object
company       object
location      object
experience    object
salary        object
jobLink       object
skills        object
dtype: object

In [6]:
# Converting experience to numbers
def exp_min_max(exp):
    """Split an experience string such as '0-3 Yrs' into integer bounds.

    Parameters
    ----------
    exp : str or missing value
        Raw experience text. A range ('3-5 Yrs') or a single value
        ('5 Yrs') is recognised; the text must contain the literal 'Yrs'.

    Returns
    -------
    tuple
        ``(min_years, max_years)`` as ints; a single value is returned as
        both bounds. ``(None, None)`` is returned when the input is not a
        string, contains 'N/A', lacks 'Yrs', or cannot be read as one or
        two integers.
    """
    # NaN/None and any other non-string cell cannot be searched with `in`,
    # so treat them as missing instead of raising TypeError.
    if not isinstance(exp, str) or 'N/A' in exp:
        return None, None
    if 'Yrs' not in exp:
        return None, None

    parts = exp.replace('Yrs', '').split('-')
    try:
        # int() tolerates the whitespace left around each number.
        years = [int(part) for part in parts]
    except ValueError:
        # e.g. '5+ Yrs': one malformed row must not abort the whole apply().
        return None, None

    if len(years) == 1:
        return years[0], years[0]
    if len(years) == 2:
        return years[0], years[1]
    # More than two hyphen-separated numbers is not a valid range.
    return None, None

# Expand each (min, max) tuple into its own pair of columns
df[['experience_min', 'experience_max']] = df['experience'].apply(
    lambda raw_exp: pd.Series(exp_min_max(raw_exp))
)
print("Done")

Done


In [7]:
# The raw 'experience' text column is dropped from the frame
df.drop(columns='experience', inplace=True)

In [8]:
# Per-column count of missing entries
missing_values = df.isna().sum()
print(missing_values)

jobTitle           0
company            0
location           5
salary             0
jobLink            0
skills             9
experience_min    14
experience_max    14
dtype: int64


In [9]:
# Drop every row that has a NaN in any column
df.dropna(axis=0, how='any', inplace=True)

In [10]:
# Re-run the per-column missing-value count to confirm none remain
missing_values = df.isna().sum()
print(missing_values)

jobTitle          0
company           0
location          0
salary            0
jobLink           0
skills            0
experience_min    0
experience_max    0
dtype: int64


In [11]:
# Cast the minimum and maximum experience columns to int
for bound_column in ('experience_min', 'experience_max'):
    df[bound_column] = df[bound_column].astype(int)

In [15]:
import re

# Function to process salary data
def salary(sal):
    """Parse a salary string such as '3.75-6 Lacs PA' into numeric bounds.

    Parameters
    ----------
    sal : str or missing value
        Raw salary text. A hyphen anywhere in the text marks it as a range.

    Returns
    -------
    tuple
        ``(min_salary, max_salary)`` as floats; a single value is returned
        as both bounds. ``(None, None)`` is returned when the input is not
        a string, contains 'Not disclosed', or does not hold exactly the
        expected count of numbers (two for a range, one otherwise).

    Notes
    -----
    No unit conversion is performed: the numbers are returned exactly as
    written in the text (e.g. 3.75 for '3.75-6 Lacs PA').
    """
    # NaN/None cannot be searched with `in`; treat any non-string as missing.
    if not isinstance(sal, str) or 'Not disclosed' in sal:
        return None, None

    # Drop thousands separators, then read numbers as whole tokens so stray
    # punctuation (e.g. the dots in 'P.A.') cannot corrupt the values.
    amounts = [
        float(token)
        for token in re.findall(r'\d+(?:\.\d+)?|\.\d+', sal.replace(',', ''))
    ]

    if '-' in sal:  # Range, e.g. '3.75-6 Lacs PA'
        if len(amounts) == 2:
            return amounts[0], amounts[1]
        print(f"Error processing range: {sal}")
        return None, None

    # Single value, e.g. '5 Lacs PA'
    if len(amounts) == 1:
        return amounts[0], amounts[0]
    print(f"Error processing single salary: {sal}")
    return None, None

df[['min_salary', 'max_salary']] = df['salary'].apply(
    lambda raw_salary: pd.Series(salary(raw_salary))
)

# Show the raw salary text next to the parsed bounds for the first 20 rows
print(df[['salary', 'min_salary', 'max_salary']].head(20))


              salary  min_salary  max_salary
0      Not disclosed         NaN         NaN
1      Not disclosed         NaN         NaN
2      Not disclosed         NaN         NaN
3     3.75-6 Lacs PA        3.75         6.0
4      Not disclosed         NaN         NaN
5      Not disclosed         NaN         NaN
6      Not disclosed         NaN         NaN
7      Not disclosed         NaN         NaN
8      Not disclosed         NaN         NaN
9       8-16 Lacs PA        8.00        16.0
10     Not disclosed         NaN         NaN
11     Not disclosed         NaN         NaN
12     Not disclosed         NaN         NaN
13     Not disclosed         NaN         NaN
14     Not disclosed         NaN         NaN
15     Not disclosed         NaN         NaN
16     Not disclosed         NaN         NaN
17     Not disclosed         NaN         NaN
18  1.44-1.8 Lacs PA        1.44         1.8
19     Not disclosed         NaN         NaN


In [24]:
# The raw 'salary' text column is dropped from the frame
df.drop(columns='salary', inplace=True)

In [26]:
# Preview the first 10 rows of the frame.
df.head(10)

Unnamed: 0,jobTitle,company,location,jobLink,skills,experience_min,experience_max,min_salary,max_salary
0,Data Analyst,Version 1,Bengaluru,https://www.naukri.com/job-listings-data-analy...,"Computer science, Data analysis, Analytical, A...",0,3,,
1,Data Analyst,Wolters Kluwer - Medknow Publications,"Kalyani, Pune",https://www.naukri.com/job-listings-data-analy...,"Computer science, Backend, Data analysis, Data...",3,5,,
2,Data Analyst,Indegene,Bengaluru,https://www.naukri.com/job-listings-data-analy...,"Automation, Data analysis, Business analytics,...",0,4,,
3,Data Analyst,Klene Paks,Bengaluru(Bannerghatta Road),https://www.naukri.com/job-listings-data-analy...,"HLOOKUP, Pivot Table, VLOOKUP, Excel Macros, D...",3,8,3.75,6.0
4,Team Member Data Analyst,Bajaj Allianz General Insurance,Pune,https://www.naukri.com/job-listings-team-membe...,"Automation, Data validation, Process efficienc...",0,4,,
5,Cloud Data Analyst,IBM,Bengaluru,https://www.naukri.com/job-listings-cloud-data...,"python, data analytics, consumables, team lead...",5,10,,
6,ESG Data Analyst - ESG Newsroom,Institutional Shareholder Services,Mumbai,https://www.naukri.com/job-listings-esg-data-a...,"Environmental science, Corporate governance, B...",0,3,,
7,ESG Data Analyst : ESG Newsroom,ISS Corporate Solutions,Mumbai,https://www.naukri.com/job-listings-esg-data-a...,"Environmental science, Corporate governance, B...",0,3,,
8,ESG Data Analyst : ESG Newsroom,Social Corporate Solutions,Mumbai,https://www.naukri.com/job-listings-esg-data-a...,"Environmental science, Corporate governance, B...",0,3,,
9,Openings For Data Analyst with Relevantz,Relevantz Technology Services,Chennai,https://www.naukri.com/job-listings-openings-f...,"Data analyst, Banking Sector, Data Warehousing...",5,10,8.0,16.0


In [29]:
# Write the frame to processed_DataSet.csv; index=False leaves the row index out of the file.
df.to_csv('processed_DataSet.csv' , index = False)

In [30]:
# Preview the first rows of the frame that was written to disk.
df.head()

Unnamed: 0,jobTitle,company,location,jobLink,skills,experience_min,experience_max,min_salary,max_salary
0,Data Analyst,Version 1,Bengaluru,https://www.naukri.com/job-listings-data-analy...,"Computer science, Data analysis, Analytical, A...",0,3,,
1,Data Analyst,Wolters Kluwer - Medknow Publications,"Kalyani, Pune",https://www.naukri.com/job-listings-data-analy...,"Computer science, Backend, Data analysis, Data...",3,5,,
2,Data Analyst,Indegene,Bengaluru,https://www.naukri.com/job-listings-data-analy...,"Automation, Data analysis, Business analytics,...",0,4,,
3,Data Analyst,Klene Paks,Bengaluru(Bannerghatta Road),https://www.naukri.com/job-listings-data-analy...,"HLOOKUP, Pivot Table, VLOOKUP, Excel Macros, D...",3,8,3.75,6.0
4,Team Member Data Analyst,Bajaj Allianz General Insurance,Pune,https://www.naukri.com/job-listings-team-membe...,"Automation, Data validation, Process efficienc...",0,4,,
