In [55]:
#Import all relevant libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import zscore
from ydata_profiling import ProfileReport
import sweetviz as sv
import plotly.express as px
 
## This statement allows the visuals to render within your Jupyter Notebook.
%matplotlib inline

## Loading the data
We can now load the dataset into pandas using the read_csv() function. This converts the CSV file into a Pandas dataframe.

In [85]:
# Read the Jadarat job-postings CSV (path is relative to the notebook's working directory).
df = pd.read_csv(filepath_or_buffer='case5/Jadarat_data.csv')
# Preview the first five rows.
df.head(n=5)

Unnamed: 0,job_title,job_date,job_desc,job_tasks,comp_name,comp_no,comp_type,comp_size,eco_activity,qualif,region,city,benefits,contract,positions,job_post_id,exper,gender
0,محاسب,27/05/1444,['إعداد وتنظيم مستندات الصرف ومتابعة تحصيل الإ...,[' تدقيق المطالبات المالية والتأكد من اكتمال...,شركة مقر العالم للسفريات,1-317262,خاص,متوسطة فئة أ,أنشطة وكالات السياحة والسفر,"['Language data', 'اللغة الانجليزية', 'محترف']",الرياض,AR RIYADH...,"['Salary', '5000.0']",دوام كامل,0 / 1,20202026350419,0 Years,both
1,بائع,27/05/1444,['بيع مجموعة من السلع والخدمات للعملاء، وتوفير...,"[' بيع مجموعة من السلع والخدمات للعملاء.', '...",شركة عالم الكهرباء للمقاولات,4-1324428,خاص,متوسطة فئة ب,تركيب انظمة التبريد وتكييف الهواء وصيانتها واص...,,المنطقة الشرقية,AD DAMMAM...,"['Salary', '5000.0']",دوام كامل,0 / 3,20202026350389,0 Years,both
2,أخصائي عمليات موارد بشرية,27/05/1444,['تنفيذ الإجراءات والأنظمة والنماذج الخاصة بمر...,[' تنفيذ الإجراءات والأنظمة والنماذج الخاصة ...,شركه دار السلام,1-155294,خاص,متوسطة فئة أ,ترميمات المباني السكنية والغير سكنية,"['Language data', 'الانجليزيه', 'محترف']",الرياض,ATH THUMA...,"['Salary', '4000.0']",دوام كامل,0 / 2,20202026350347,2 Years,both
3,ميكانيكي سيارات,27/05/1444,['تشخيص أعطال السيارات وإصلاحها وتنفيذ برامج ا...,[' فحص أداء المعدّات الكهربائية والميكانيكية...,مؤسسة لمكو لغيار الزيوت,8-1925495,خاص,صغيرة فئة ب,,"['Skill data', 'صيانة السيارات وتقييم الاعطال'...",المنطقة الشرقية,AL HUFUF...,"['Salary', '5000.0']",دوام كامل,0 / 10,20202026350219,0 Years,M
4,محاسب,27/05/1444,['إعداد وتنظيم مستندات الصرف ومتابعة تحصيل الإ...,[' تدقيق المطالبات المالية والتأكد من اكتمال...,مؤسسة فكرة اليمامة للمقاولات,1-2356639,خاص,كبيرة,الإنشاءات العامة للمباني السكنية,"['Skill data', 'تحمل ضغط العمل', 'محترف', 'Lan...",الرياض,AR RIYADH...,"['Salary', '5000.0']",دوام كامل,0 / 1,20202026350043,0 Years,both


## 1. Data Profiling:
Data profiling is a comprehensive process of examining the data available in an existing dataset and collecting statistics and information about that data. 

In [57]:
# Build a ydata-profiling report for the loaded frame and render it inline in the notebook.
profile = ProfileReport(df, title="Pandas Profiling Report")
profile.to_notebook_iframe()

### Data Quality Checks
Data quality checks involve the process of ensuring that the data is accurate, complete, consistent, relevant, and reliable. 


**Here are typical steps involved in checking data quality:**

#### 1. Reliability:
Evaluate the data's source and collection process to determine its trustworthiness.

In [58]:
#The data from Jadarat

#### 2. Timeliness: 
Ensure the data is up-to-date and reflective of the current situation or the period of interest for the analysis.

In [59]:
# The data is not up to date, but it is still usable for this analysis.

#### 3. Consistency: 

Confirm that the data is consistent within the dataset and across multiple data sources. For example, the same data point should not have different values in different places.


In [60]:
#The data is consistent

In [86]:
def _strip_text(cell):
    """Return `cell` without leading/trailing whitespace if it is a str; otherwise unchanged."""
    if isinstance(cell, str):
        return cell.strip()
    return cell


# Apply the whitespace cleanup to every cell of the frame.
df = df.map(_strip_text)

In [62]:
# Zero rows: display only the column headers.
df.head(n=0)

Unnamed: 0,job_title,job_date,job_desc,job_tasks,comp_name,comp_no,comp_type,comp_size,eco_activity,qualif,region,city,benefits,contract,positions,job_post_id,exper,gender


In [87]:
# Map the terse source column names to readable display names.
readable_names = {
    'job_title': 'Job Title',
    'job_date': 'Job Date',
    'job_desc': 'Job description',
    'job_tasks': 'Job Tasks',
    'comp_name': 'Company Name',
    'comp_no': 'Company Number',
    'comp_type': 'Company Type',
    'comp_size': 'Company Size',
    'eco_activity': 'Eco Activity',
    'qualif': 'Qualifications',
    'region': 'Region',
    'city': 'City',
    'benefits': 'Benefits',
    'contract': 'Contract',
    'positions': 'Positions',
    'job_post_id': 'Job post ID',
    'exper': 'Experience',
    'gender': 'Gender',
}
df = df.rename(columns=readable_names)

In [64]:
# Zero rows: display only the (renamed) column headers.
df.head(n=0)

Unnamed: 0,Job Title,Job Date,Job description,Job Tasks,Company Name,Company Number,Company Type,Company Size,Eco Activity,Qualifications,Region,City,Benefits,Contract,Positions,Job post ID,Experience,Gender


#### 4. Relevance: 
Assess whether the data is appropriate and applicable for the intended analysis. Data that is not relevant can skew results and lead to incorrect conclusions.

**Key considerations for relevance include:**

> 1. Sample Appropriateness: Confirm that your data sample aligns with your analysis objectives. For instance, utilizing data from the Northern region will not yield accurate insights for the Western region of the Kingdom.
>
> 2. Variable Selection: Any column that is not relevant to our analysis can be removed with the drop() method. We set the “axis” argument to 1 since we’re dealing with columns, and set the “inplace” argument to True to make the change permanent.


In [65]:
#The available data is consistent with the objectives of the analysis.
#There are irrelevant columns to our objectives.

In [90]:
# Columns that are not used by the analysis questions further down.
irrelevant_columns = [
    'Job description',
    'Job Tasks',
    'Company Number',
    'Company Size',
    'Eco Activity',
    'City',
    'Positions',
    'Job Date',
    'Qualifications',
    'Job post ID',
]

# Drop everything in a single call: either all columns are removed or, if a
# label is missing, a KeyError is raised and the frame is left untouched
# (ten separate in-place drops could leave it half-modified).
# NOTE(review): the recorded execution counts (In [88]/[89] vs. In [90]) show
# the duplicate check was run BEFORE this cell. Under Restart & Run All the
# duplicate check will only see the remaining columns and may flag many more
# rows as duplicates -- confirm which order is intended.
df = df.drop(columns=irrelevant_columns)

In [67]:
# Show a single row to confirm which columns remain.
df.head(n=1)

Unnamed: 0,Job Title,Company Name,Company Type,Region,Benefits,Contract,Experience,Gender
0,محاسب,شركة مقر العالم للسفريات,خاص,الرياض,"['Salary', '5000.0']",دوام كامل,0 Years,both


#### 5. Uniqueness: 
Check for and remove duplicate records to prevent skewed analysis results.


In [68]:
# There is one duplicate row

In [88]:
# Count rows that exactly repeat an earlier row (all columns compared, first occurrence kept).
df.duplicated(subset=None, keep='first').sum()

1

In [89]:
# Keep the first occurrence of each repeated row; the original index labels are preserved.
df = df.drop_duplicates()

#### 6. Completeness: 
Ensure that no critical data is missing. This might mean checking for null values or required fields that are empty.

We will start by checking the dataset for missing or null values. For this, we can use the isna() method which returns a dataframe of boolean values indicating if a field is null or not. To group all missing values by column, we can include the sum() method.

In [71]:
#There are no missing values

In [72]:
# Number of missing values per column.
df.isna().sum()

Job Title       0
Company Name    0
Company Type    0
Region          0
Benefits        0
Contract        0
Experience      0
Gender          0
dtype: int64

#### 7. Check Accuracy:

Verify that the data is correct and precise. This could involve comparing data samples with known sources or using validation rules.

**The process includes:**
1. Validating the appropriateness of data types for the dataset.
2. Identifying outliers using established validation rules.

In [73]:
# Inspect the dtype pandas inferred for each remaining column.
df.dtypes

Job Title       object
Company Name    object
Company Type    object
Region          object
Benefits        object
Contract        object
Experience      object
Gender          object
dtype: object

In [91]:
# Run a Sweetviz analysis of the current frame and save the result as Report.html.
report = sv.analyze(df)
report.show_html('Report.html')

                                             |          | [  0%]   00:00 -> (? left)


Glyph 1588 (\N{ARABIC LETTER SHEEN}) missing from font(s) Roboto.


Matplotlib currently does not support Arabic natively.


Glyph 1576 (\N{ARABIC LETTER BEH}) missing from font(s) Roboto.


Glyph 1607 (\N{ARABIC LETTER HEH}) missing from font(s) Roboto.


Glyph 1581 (\N{ARABIC LETTER HAH}) missing from font(s) Roboto.


Glyph 1603 (\N{ARABIC LETTER KAF}) missing from font(s) Roboto.


Glyph 1608 (\N{ARABIC LETTER WAW}) missing from font(s) Roboto.


Glyph 1605 (\N{ARABIC LETTER MEEM}) missing from font(s) Roboto.


Glyph 1610 (\N{ARABIC LETTER YEH}) missing from font(s) Roboto.


Glyph 1577 (\N{ARABIC LETTER TEH MARBUTA}) missing from font(s) Roboto.


Glyph 1582 (\N{ARABIC LETTER KHAH}) missing from font(s) Roboto.


Glyph 1575 (\N{ARABIC LETTER ALEF}) missing from font(s) Roboto.


Glyph 1589 (\N{ARABIC LETTER SAD}) missing from font(s) Roboto.


Glyph 1588 (\N{ARABIC LETTER SHEEN}) missing from font(s) Roboto.


Matplotlib currently does not support Arabic natively.


Glyph 1576 (

Report Report.html was generated! NOTEBOOK/COLAB USERS: the web browser MAY not pop up, regardless, the report IS saved in your notebook/colab files.


## 2. Data Cleaning: 

Preliminary findings from data profiling can lead to cleaning the data by:
- Handling missing values
- Correcting errors.
- Dealing with outliers.

-------------------



### Correcting errors

-------------------

In [75]:
# Preview the first five rows before cleaning.
df.head(n=5)

Unnamed: 0,Job Title,Company Name,Company Type,Region,Benefits,Contract,Experience,Gender
0,محاسب,شركة مقر العالم للسفريات,خاص,الرياض,"['Salary', '5000.0']",دوام كامل,0 Years,both
1,بائع,شركة عالم الكهرباء للمقاولات,خاص,المنطقة الشرقية,"['Salary', '5000.0']",دوام كامل,0 Years,both
2,أخصائي عمليات موارد بشرية,شركه دار السلام,خاص,الرياض,"['Salary', '4000.0']",دوام كامل,2 Years,both
3,ميكانيكي سيارات,مؤسسة لمكو لغيار الزيوت,خاص,المنطقة الشرقية,"['Salary', '5000.0']",دوام كامل,0 Years,M
4,محاسب,مؤسسة فكرة اليمامة للمقاولات,خاص,الرياض,"['Salary', '5000.0']",دوام كامل,0 Years,both


In [76]:
# Correct errors in the Benefits column to extract the salary information.
# Step 1: remove the single quotes from the list-like text.
benefits_text = df['Benefits']
df['Benefits'] = benefits_text.str.replace(pat="'", repl='')

In [77]:
# Step 2: remove the closing bracket so the salary value ends cleanly.
benefits_text = df['Benefits']
df['Benefits'] = benefits_text.str.replace(pat="]", repl='')

In [78]:
# Step 3: the text now looks like "[Salary, 5000.0"; the salary is the second
# comma-separated field.
# Vectorised parsing keeps the column numeric: entries without a comma or with
# an unparseable value become NaN (visible via isna()) instead of remaining as
# raw strings in a mixed-type column, and non-string cells no longer raise.
salary_field = df['Benefits'].str.split(',').str[1]
df['Benefits'] = pd.to_numeric(salary_field.str.strip(), errors='coerce')

In [79]:
# The column now holds only the salary amount, so name it accordingly.
df = df.rename({'Benefits': 'Salary'}, axis='columns')

In [80]:
# Check the extracted Salary values on the first five rows.
df.head(n=5)

Unnamed: 0,Job Title,Company Name,Company Type,Region,Salary,Contract,Experience,Gender
0,محاسب,شركة مقر العالم للسفريات,خاص,الرياض,5000.0,دوام كامل,0 Years,both
1,بائع,شركة عالم الكهرباء للمقاولات,خاص,المنطقة الشرقية,5000.0,دوام كامل,0 Years,both
2,أخصائي عمليات موارد بشرية,شركه دار السلام,خاص,الرياض,4000.0,دوام كامل,2 Years,both
3,ميكانيكي سيارات,مؤسسة لمكو لغيار الزيوت,خاص,المنطقة الشرقية,5000.0,دوام كامل,0 Years,M
4,محاسب,مؤسسة فكرة اليمامة للمقاولات,خاص,الرياض,5000.0,دوام كامل,0 Years,both


In [81]:
# Remove the word "Years" from the Experience column and keep only the number.
# Casting to str first makes the cell safe to re-run after the column is
# already numeric; values that cannot be parsed become NaN instead of being
# left behind as strings.
years_text = df['Experience'].astype(str).str.split(' ').str[0]
df['Experience'] = pd.to_numeric(years_text, errors='coerce')

In [82]:
# Check the numeric Experience values on the first five rows.
df.head(n=5)

Unnamed: 0,Job Title,Company Name,Company Type,Region,Salary,Contract,Experience,Gender
0,محاسب,شركة مقر العالم للسفريات,خاص,الرياض,5000.0,دوام كامل,0,both
1,بائع,شركة عالم الكهرباء للمقاولات,خاص,المنطقة الشرقية,5000.0,دوام كامل,0,both
2,أخصائي عمليات موارد بشرية,شركه دار السلام,خاص,الرياض,4000.0,دوام كامل,2,both
3,ميكانيكي سيارات,مؤسسة لمكو لغيار الزيوت,خاص,المنطقة الشرقية,5000.0,دوام كامل,0,M
4,محاسب,مؤسسة فكرة اليمامة للمقاولات,خاص,الرياض,5000.0,دوام كامل,0,both


In [83]:
# Change the data types.
# astype returns a new Series rather than converting in place, so the result
# has to be assigned back for the conversion to take effect.
df['Salary'] = df['Salary'].astype(float)
df['Experience'] = df['Experience'].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1470 entries, 0 to 1469
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Job Title     1470 non-null   object 
 1   Company Name  1470 non-null   object 
 2   Company Type  1470 non-null   object 
 3   Region        1470 non-null   object 
 4   Salary        1470 non-null   float64
 5   Contract      1470 non-null   object 
 6   Experience    1470 non-null   int64  
 7   Gender        1470 non-null   object 
dtypes: float64(1), int64(1), object(6)
memory usage: 92.0+ KB


## Check outliers

In [84]:
# Plot the distribution of posted salaries to spot extreme values.
fig = px.histogram(
    data_frame=df,
    x="Salary",
    title="Histogram of Salary",
)
fig.show()

In [130]:
# List postings from highest to lowest salary to inspect the top of the range.
df.sort_values('Salary', ascending=False)

Unnamed: 0,Job Title,Company Name,Company Type,Region,Salary,Contract,Experience,Gender
888,طيار جناح ثابت,شركة طيران اديل,خاص,مكة المكرمة,35000.0,دوام كامل,10,both
530,مدير عمليات موارد بشرية,شركة انكر اليابان,خاص,الرياض,32000.0,دوام كامل,7,both
470,مدير عمليات موارد بشرية,شركة ال جي شاكر المحدوده,خاص,الرياض,27000.0,دوام كامل,4,both
1169,مدير فرع,شركة إختبار التربة للقياس,خاص,الحدود الشمالية,18000.0,دوام كامل,10,M
171,مدير برمجيات,شركه العرض المتقن للخدمات التجارية شركة مساهمة...,خاص,الرياض,16000.0,دوام كامل,4,both
...,...,...,...,...,...,...,...,...
1290,سائق حافلة,مدارس سفراء المستقبل الاهلية,خاص,مكة المكرمة,3000.0,دوام كامل,0,M
345,محاسب,مؤسسة شريان حائل للمقاولات العامة,خاص,حائل,3000.0,دوام كامل,4,both
110,سكرتير,المكتب التعاوني للدعوة والارشاد وتوعية الجاليا...,خاص,الرياض,3000.0,دوام كامل,0,both
15,مدخل بيانات,مكتب مدارات للاستشارات الهندسية,خاص,حائل,3000.0,دوام كامل,2,M


In [16]:
# Salaries at or above this threshold are treated as outliers so that the
# histogram stays readable. The comparison is strict, so postings paying
# exactly 27000.0 are removed as well.
SALARY_OUTLIER_THRESHOLD = 27000.0

df = df[df['Salary'] < SALARY_OUTLIER_THRESHOLD]
df.sort_values(by='Salary', ascending=False)

Unnamed: 0,Job Title,Company Name,Company Type,Region,Salary,Contract,Experience,Gender
1169,مدير فرع,شركة إختبار التربة للقياس,خاص,الحدود الشمالية,18000.0,دوام كامل,10,M
171,مدير برمجيات,شركه العرض المتقن للخدمات التجارية شركة مساهمة...,خاص,الرياض,16000.0,دوام كامل,4,both
947,أخصائي استشارات أعمال,برنامج مستشفى الملك فهدللحرس الوطني,شبه حكومية,الرياض,15532.0,دوام كامل,10,both
1239,مهندس معماري,شركة المساحون العرب نصر أحمد اسحاق الحسيني للا...,خاص,الرياض,15000.0,دوام كامل,7,both
841,مهندس مدني,مكتب مدارات للاستشارات الهندسية,خاص,حائل,15000.0,دوام كامل,10,M
...,...,...,...,...,...,...,...,...
110,سكرتير,المكتب التعاوني للدعوة والارشاد وتوعية الجاليا...,خاص,الرياض,3000.0,دوام كامل,0,both
916,بائع,أسواق المسارات الرائعة للتجارة,خاص,عسير,3000.0,دوام كامل,0,both
15,مدخل بيانات,مكتب مدارات للاستشارات الهندسية,خاص,حائل,3000.0,دوام كامل,2,M
345,محاسب,مؤسسة شريان حائل للمقاولات العامة,خاص,حائل,3000.0,دوام كامل,4,both


In [17]:
# Re-plot the salary distribution now that the outliers have been filtered out.
fig = px.histogram(
    data_frame=df,
    x="Salary",
    title="Histogram of Salary",
)
fig.show()

## Analysis: 

**What proportion of job postings is attributed to each region within the kingdom?**

In [141]:
# Count postings per region as a two-column frame: Region, Count.
jobs_by_region = (
    df['Region']
    .value_counts()
    .rename_axis('Region')
    .reset_index(name='Count')
)

# Pie chart: each slice is a region's share of all postings.
fig = px.pie(
    data_frame=jobs_by_region,
    names='Region',
    values='Count',
    title='Proportion of Job Postings by Region',
)
fig.show()

In [None]:
#Most job posts are in Riyadh
#67% of job posts are in Riyadh and Mecca

**Is there a gender preference indicated in the job postings?**

In [22]:
# Number of postings for each value of the Gender column.
gender_counts = df['Gender'].value_counts()

# Bar chart of the counts; the labels mapping renames the generic x/y axes.
fig = px.bar(
    x=gender_counts.index,
    y=gender_counts.to_numpy(),
    labels={'x':'Gender', 'y':'Count'},
    title='Gender Distribution',
)
fig.show()

In [None]:
#There is not a huge difference between genders

**What is the expected salary range for fresh graduates?**

In [28]:
# Treat postings that require at most one year of experience as fresh-graduate roles.
is_entry_level = df['Experience'].le(1)
fresh = df.loc[is_entry_level]

# Salary is on the y-axis, so the histogram bars run horizontally.
fig = px.histogram(
    data_frame=fresh,
    y='Salary',
    title='Salary Range for Fresh Graduates',
)
fig.show()

In [None]:
#The salary range is 4K to 7K.

**Are job opportunities predominantly targeted at individuals with experience, or is there room for fresh graduates as well?**

In [29]:
# Distribution of the years of experience required across all postings.
fig = px.histogram(
    data_frame=df,
    x="Experience",
    nbins=10,
    title="Job Opportunities by Required Experience",
    labels={"Experience": "Required Experience (Years)"},
)
# Explicit axis titles override the defaults derived from the column name.
fig.update_layout(
    xaxis_title="Years of Experience",
    yaxis_title="Number of Job Opportunities",
)
fig.show()

In [None]:
#Fresh graduates have more opportunities.
#People with less experience have more opportunities.

**Which company posts the most jobs?**

In [46]:
# Postings per company, most frequent first, as a two-column frame.
company_job_counts = (
    df['Company Name']
    .value_counts()
    .rename_axis('Company Name')
    .reset_index(name='Number of Job Postings')
)
# value_counts sorts descending, so the first ten rows are the top ten companies.
top_10_companies = company_job_counts.iloc[:10]

# Bar chart with the posting count printed on each bar.
fig = px.bar(
    data_frame=top_10_companies,
    x='Company Name',
    y='Number of Job Postings',
    title='Top 10 Companies with Most Job Postings',
    text='Number of Job Postings',
)
fig.show()

In [None]:
#Most of the companies are from Riyadh