In [1]:
!git clone https://github.com/AshishJangra27/datasets

Cloning into 'datasets'...
remote: Enumerating objects: 328, done.[K
remote: Counting objects: 100% (93/93), done.[K
remote: Compressing objects: 100% (83/83), done.[K
remote: Total 328 (delta 19), reused 54 (delta 9), pack-reused 235 (from 1)[K
Receiving objects: 100% (328/328), 278.62 MiB | 15.15 MiB/s, done.
Resolving deltas: 100% (145/145), done.
Updating files: 100% (225/225), done.


In [2]:
import pandas as pd

### 1. Data Exploration

#### 1.1) Loading the Dataset

In [3]:
df = pd.read_csv('/content/datasets/Job Postings/jobs.csv.zip')
df.head(2)

Unnamed: 0,job_id,job_role,company,experience,salary,location,rating,reviews,resposibilities,posted_on,job_link,company_link
0,70123010000.0,Branch Banking - Calling For Women Candidates,Hdfc Bank,1-6 Yrs,Not disclosed,"Kolkata, Hyderabad/Secunderabad, Pune, Ahmedab...",4.0,39110 Reviews,"Customer Service,Sales,Relationship Management",1 Day Ago,https://www.naukri.com/job-listings-branch-ban...,https://www.naukri.com/hdfc-bank-jobs-careers-213
1,60123910000.0,Product Owner Senior Manager,Accenture,11-15 Yrs,Not disclosed,"Kolkata, Mumbai, Hyderabad/Secunderabad, Pune,...",4.1,32129 Reviews,"Product management,Market analysis,Change mana...",1 Day Ago,https://www.naukri.com/job-listings-product-ow...,https://www.naukri.com/accenture-jobs-careers-...


#### 1.2) Removing "posted_on" Columns

In [4]:
del df['posted_on']

#### 1.3) Check Null Values

In [5]:
df.isnull().sum()

Unnamed: 0,0
job_id,480
job_role,480
company,481
experience,1749
salary,480
location,1706
rating,36199
reviews,36199
resposibilities,500
job_link,480


#### 1.4) Removign rows with null values in job_id,company and responsibility column

In [6]:
df.dropna(subset = ['job_id','company','resposibilities'], inplace = True)

#### 1.5) Filling Null values in location and experience column with most frequent

In [7]:
df['location'].fillna(df['location'].mode()[0], inplace = True)
df['experience'].fillna(df['experience'].mode()[0], inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['location'].fillna(df['location'].mode()[0], inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['experience'].fillna(df['experience'].mode()[0], inplace = True)


#### 1.6) Filling Null values in rating and reviews column with 0

In [8]:
df['rating'].fillna(0.0,inplace = True)
df['reviews'].fillna('0 Reviews',inplace = True)

The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['rating'].fillna(0.0,inplace = True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df['reviews'].fillna('0 Reviews',inplace = True)


In [9]:
df.isnull().sum()

Unnamed: 0,0
job_id,0
job_role,0
company,0
experience,0
salary,0
location,0
rating,0
reviews,0
resposibilities,0
job_link,0


#### 1.7) Remove Duplicates

In [10]:
df.drop_duplicates(subset=['job_link'],inplace = True)

### 2. Data Cleaning

#### 2.1) Cleaning Job_id Column

In [11]:
df['job_id'] = df['job_id'].astype('int').astype('str')

#### 2.2) Creating Company ID Column

In [12]:
df['company_id'] = df['company_link'].str.split('-').str[-1]

#### 2.3) Removing Companies with company_id = 0

In [13]:
df = df[df['company_id'] != '0']

#### 2.4) Cleaning Experience Column

In [43]:
df['start_exp'] = df['experience'].str.replace('Yrs','').str.strip().str.split('-').str[0].astype('int')
df['end_exp'] = df['experience'].str.replace('Yrs','').str.strip().str.split('-').str[1].astype('int')

#### 2.5) Cleaning Salary Column

###### 2.5.1 Removing Categories like gradudate etc

In [91]:
salary_clean = []

for i in df['salary']:
  if bool(re.search(r'\d', i)) == False or 'Cr' in i:
    salary_clean.append('Not disclosed')
  else:
    salary_clean.append(i)

df['salary_clean'] = salary_clean

###### 2.5.2 Splitting salaries

In [133]:
min_salaries = []
max_salaries = []

for i in df['salary_clean']:

  if i == 'Not disclosed':
    min_salary = 'Not disclosed'
    max_salary = 'Not disclosed'

  elif len(i.split('-')) == 2:
    min_salary = i.split('PA')[0].split('-')[0].strip().replace(',','')
    max_salary = i.split('PA')[0].split('-')[1].strip().replace(',','')


  elif len(i.split('-')) == 1:
    min_salary = ''.join(re.findall(r'\d', i.split('PA')[0]))
    max_salary = ''.join(re.findall(r'\d', i.split('PA')[0]))


  min_salaries.append(min_salary)
  max_salaries.append(max_salary)

df['min_salary'] = min_salaries
df['max_salary'] = max_salaries

###### 2.5.3 Clean further minimum and maximum salaries

In [134]:
df.loc[df['min_salary'] == 'Less than 5000', 'min_salary'] = '5000'
df.loc[df['min_salary'] == 'Less than 50000', 'min_salary'] = '50000'

#### 2.6) Cleaning Reviews Column

In [149]:
df['reviews'] = df['reviews'].str.split(' ').str[0].astype('int')

#### 2.7) Cleaning Location

In [159]:
df['location'].str.split(',').explode().str.lower().explode().nunique()

4777

In [166]:
df['location'].str.split(',').explode().str.lower().explode().str.split('/').explode().str.split('-').explode().unique()

array(['kolkata', ' hyderabad', 'secunderabad', ..., ' nerul',
       ' ambavadi', ' maninagar'], dtype=object)

In [170]:
df['resposibilities'].str.split(',').explode().value_counts().head(20)

Unnamed: 0_level_0,count
resposibilities,Unnamed: 1_level_1
Sales,4616
tied,3310
agency,3307
communication,2553
training,2217
Consulting,2099
Customer Service,2008
Java,1970
Business Development,1965
team handling,1951


In [141]:
df.head()

Unnamed: 0,job_id,job_role,company,experience,salary,location,rating,reviews,resposibilities,job_link,company_link,company_id,start_exp,end_exp,salary_clean,min_salary,max_salary
0,70123006070,Branch Banking - Calling For Women Candidates,Hdfc Bank,1-6 Yrs,Not disclosed,"Kolkata, Hyderabad/Secunderabad, Pune, Ahmedab...",4.0,39110 Reviews,"Customer Service,Sales,Relationship Management",https://www.naukri.com/job-listings-branch-ban...,https://www.naukri.com/hdfc-bank-jobs-careers-213,213,1,6,Not disclosed,Not disclosed,Not disclosed
1,60123905908,Product Owner Senior Manager,Accenture,11-15 Yrs,Not disclosed,"Kolkata, Mumbai, Hyderabad/Secunderabad, Pune,...",4.1,32129 Reviews,"Product management,Market analysis,Change mana...",https://www.naukri.com/job-listings-product-ow...,https://www.naukri.com/accenture-jobs-careers-...,7682,11,15,Not disclosed,Not disclosed,Not disclosed
2,60123905898,Employee Relations and Policies Associate Manager,Accenture,3-7 Yrs,Not disclosed,"Kolkata, Mumbai, Hyderabad/Secunderabad, Pune,...",4.1,32129 Reviews,"Business process,Change management,Team manage...",https://www.naukri.com/job-listings-employee-r...,https://www.naukri.com/accenture-jobs-careers-...,7682,3,7,Not disclosed,Not disclosed,Not disclosed
3,60123905897,Employee Relations and Policies Specialist,Accenture,3-7 Yrs,Not disclosed,"Kolkata, Mumbai, Hyderabad/Secunderabad, Pune,...",4.1,32129 Reviews,"Business process,Change management,Team manage...",https://www.naukri.com/job-listings-employee-r...,https://www.naukri.com/accenture-jobs-careers-...,7682,3,7,Not disclosed,Not disclosed,Not disclosed
4,60123008332,SAP BO Consultant,Mindtree,5-7 Yrs,Not disclosed,"Hybrid - Kolkata, Hyderabad/Secunderabad, Pune...",4.1,3759 Reviews,"SAP BO,PL / SQL,Oracle SQL,SAP Business Object...",https://www.naukri.com/job-listings-sap-bo-con...,https://www.naukri.com/mindtree-jobs-careers-3...,30528,5,7,Not disclosed,Not disclosed,Not disclosed


In [18]:
- Number of Active Jobs in Any Company | Top Companies
- Company which provides maximum/minimum average Salary
- Salary vs Experience of any Company
- Company hiring for most numbers of locations

SyntaxError: invalid syntax (<ipython-input-18-9565e96a72fc>, line 1)

In [None]:
1. Analyze Company Ratings
  - Calculate the average rating for each company
  - Identify companies with the highest and lowest average ratings.
  - Compare the distribution of ratings across different companies.
  - Calculate the total number of reviews for each company. | Sort to get most and LiteralString

- List the top 10 companies based on average rating and number of reviews.
- Analyze the characteristics and practices of these top-rated companies. | Most Popular Responsibilities of top 100 companies
