## **Section 1: Importing Libraries** ##

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns


## **Section 2: Load and Explore Data** ##

In [8]:
## Read the job-market CSV and preview it ##
# The path is relative, so it resolves against the kernel's working directory.
# `df` is the frame that the following cells inspect.
df = pd.read_csv(filepath_or_buffer='ai_job_market.csv')

# head() returns the first five rows by default; n is spelled out for clarity.
print(df.head(n=5))

   job_id              company_name    industry                 job_title  \
0       1           Foster and Sons  Healthcare              Data Analyst   
1       2   Boyd, Myers and Ramirez        Tech  Computer Vision Engineer   
2       3                  King Inc        Tech          Quant Researcher   
3       4  Cooper, Archer and Lynch        Tech        AI Product Manager   
4       5                  Hall LLC     Finance            Data Scientist   

                                     skills_required experience_level  \
0  NumPy, Reinforcement Learning, PyTorch, Scikit...              Mid   
1                    Scikit-learn, CUDA, SQL, Pandas           Senior   
2          MLflow, FastAPI, Azure, PyTorch, SQL, GCP            Entry   
3       Scikit-learn, C++, Pandas, LangChain, AWS, R              Mid   
4                    Excel, Keras, SQL, Hugging Face           Senior   

  employment_type               location salary_range_usd posted_date  \
0       Full-time        

In [9]:
## Show values and data types ##
# DataFrame.info() writes its summary (index range, per-column non-null
# counts, dtypes, memory usage) directly to stdout and returns None.
# It is therefore called on its own: wrapping it in print() would emit a
# stray "None" line after the summary.
df.info()

# Per-column count of missing values, to confirm whether any imputation or
# row dropping is needed before analysis.
print(df.isnull().sum())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 12 columns):
 #   Column            Non-Null Count  Dtype 
---  ------            --------------  ----- 
 0   job_id            2000 non-null   int64 
 1   company_name      2000 non-null   object
 2   industry          2000 non-null   object
 3   job_title         2000 non-null   object
 4   skills_required   2000 non-null   object
 5   experience_level  2000 non-null   object
 6   employment_type   2000 non-null   object
 7   location          2000 non-null   object
 8   salary_range_usd  2000 non-null   object
 9   posted_date       2000 non-null   object
 10  company_size      2000 non-null   object
 11  tools_preferred   2000 non-null   object
dtypes: int64(1), object(11)
memory usage: 187.6+ KB
None
job_id              0
company_name        0
industry            0
job_title           0
skills_required     0
experience_level    0
employment_type     0
location            0
salary_range_us

In [15]:
## explore categorical data ##
# Columns inspected in this pass. For each one the loop prints a banner,
# the frequency table of its values, and the number of distinct values.
categorical_columns = [
    "experience_level",
    "location",
    "employment_type",
    "industry",
]

for col in categorical_columns:
    # Banner: a blank line, then the title framed by two 50-character rules.
    print(
        f"\n{'='*50}",
        f"Value counts for: {col}",
        f"{'='*50}",
        sep="\n",
    )
    # Frequency table for the current column.
    print(df[col].value_counts())
    # Distinct-value count, separated from the table by a blank line.
    print(f"\nUnique values: {df[col].nunique()}")



Value counts for: experience_level
experience_level
Entry     702
Mid       668
Senior    630
Name: count, dtype: int64

Unique values: 3

Value counts for: location
location
Tracybury, AR             1
Lake Scott, CU            1
East Paige, CM            1
Perezview, FI             1
North Desireeland, NE     1
                         ..
Washingtonmouth, SD       1
Joshuafort, ZA            1
West Brittanyburgh, CG    1
Anthonyshire, OM          1
Benjaminview, NE          1
Name: count, Length: 2000, dtype: int64

Unique values: 2000

Value counts for: employment_type
employment_type
Internship    574
Full-time     509
Contract      465
Remote        452
Name: count, dtype: int64

Unique values: 4

Value counts for: industry
industry
Automotive    300
Education     294
Retail        293
E-commerce    291
Finance       279
Tech          274
Healthcare    269
Name: count, dtype: int64

Unique values: 7


## **Section 3: Data Cleaning** ##