In [1]:
import pandas as pd
import numpy as np

# for visualization
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("final_data.csv")

In [3]:
df.shape

(374, 19)

In [4]:
df.isnull().sum()

company_name           0
location               0
job_title              0
job_description        0
rating                53
employer_estimate      0
min_salary             0
max_salary             0
avg_salary             0
Size                  51
Founded               51
Type                  51
Industry              51
Sector                51
Revenue               51
skills                 1
age                  106
seniority              0
job_simp               0
dtype: int64

In [5]:
df.duplicated().sum()

np.int64(0)

In [6]:
df.columns

Index(['company_name', 'location', 'job_title', 'job_description', 'rating',
       'employer_estimate', 'min_salary', 'max_salary', 'avg_salary', 'Size',
       'Founded', 'Type', 'Industry', 'Sector', 'Revenue', 'skills', 'age',
       'seniority', 'job_simp'],
      dtype='object')

In [7]:
df.drop(columns = ['job_title', 'min_salary', 'max_salary'], inplace = True)

In [8]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 374 entries, 0 to 373
Data columns (total 16 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   company_name       374 non-null    object 
 1   location           374 non-null    object 
 2   job_description    374 non-null    object 
 3   rating             321 non-null    float64
 4   employer_estimate  374 non-null    int64  
 5   avg_salary         374 non-null    float64
 6   Size               323 non-null    object 
 7   Founded            323 non-null    object 
 8   Type               323 non-null    object 
 9   Industry           323 non-null    object 
 10  Sector             323 non-null    object 
 11  Revenue            323 non-null    object 
 12  skills             373 non-null    object 
 13  age                268 non-null    float64
 14  seniority          374 non-null    float64
 15  job_simp           374 non-null    object 
dtypes: float64(4), int64(1), o

## Train Test split

In [9]:
# Filter once for training data
train_data = df[df['avg_salary'] != -1]

# Now split into features (xtrain) and target (ytrain) this was required for the modelling phase purpose but since we are still in eda phase,
# we can scale both features and target column together
xtrain = train_data.drop('avg_salary', axis=1)
ytrain = train_data['avg_salary']  # Selecting a single column doesn't need .loc

# For test data
xtest = df[df['avg_salary'] == -1].drop('avg_salary', axis=1)

In [10]:
xtrain.shape, ytrain.shape, xtest.shape

((173, 15), (173,), (201, 15))

## Scaling

In [11]:
df.columns

Index(['company_name', 'location', 'job_description', 'rating',
       'employer_estimate', 'avg_salary', 'Size', 'Founded', 'Type',
       'Industry', 'Sector', 'Revenue', 'skills', 'age', 'seniority',
       'job_simp'],
      dtype='object')

In [12]:
cols_scale_train = xtrain.select_dtypes('number').drop(columns = ['employer_estimate', 'seniority'])
cols_scale_test = xtest.select_dtypes('number').drop(columns = ['employer_estimate', 'seniority'])

In [13]:
cols_scale_test

Unnamed: 0,rating,age
39,4.0,225.0
40,3.5,
41,4.1,164.0
42,3.7,43.0
43,4.6,17.0
...,...,...
369,3.9,15.0
370,4.7,
371,,
372,4.3,9.0


Introducing a bit of bias , filling rating and age null values with means

In [14]:
cols_scale_train.isna().sum()

rating    24
age       50
dtype: int64

In [15]:
means_train = cols_scale_train.mean()
means_test = cols_scale_test.mean()

In [16]:
cols_scale_train = cols_scale_train.fillna(means_train)
cols_scale_test = cols_scale_test.fillna(means_test)

In [17]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
scaler1 = MinMaxScaler()
xtrain_trf = scaler1.fit_transform(cols_scale_train)
xtest_trf = scaler1.transform(cols_scale_test)

In [18]:
type(xtest_trf)

numpy.ndarray

In [19]:
# this is done so that when we merge encoded and scaled columns into one they are merged properly
# but maybe original xtrain wont be required anymore after merging we'll get a new df
xtrain.reset_index(drop = 'first', inplace = True)
xtest.reset_index(drop = 'first', inplace = True)

## Encoding

In [20]:
for col in xtrain.select_dtypes('object').columns:
    print(xtrain[col].value_counts())
    print()

company_name
Boston Consulting Group                       38
ANZ Banking Group                              5
Adobe                                          5
S&P Global                                     4
BOEING                                         4
                                              ..
DTCC                                           1
Mediphore Systems and Technologies Pvt Ltd     1
TE Connectivity                                1
Aristocrat                                     1
Funfull Inc.                                   1
Name: count, Length: 103, dtype: int64

location
Bengaluru             45
Gurgaon               34
Delhi                 19
Pune                  13
Noida                 12
India                 11
Remote                11
Mumbai                 5
Chennai                5
Chandigarh             2
Cochin                 2
Hyderābād              2
Salem                  2
Vadodara               2
Ghaziabad              1
Haryāna                1

### Problems encountered : 
1) how to deal with description col with many lines
2) how to deal with cols with hundreds of categories - maybe we can make a new category as other
3) how to deal with skills col - like we want the model should capture the importance of each skill type on the salary - so what should be done how it should be encoded ? as separate skill tag each or according to the value counts output with around 100 categoies ..

In [21]:
# lets try one hot encoding
pd.get_dummies(df, columns = xtrain.select_dtypes('object').columns, drop_first = True).columns[200:600]

Index(['job_description_Bachelor’s degree in computer science, Statistics, Mathematics, or related field; master’s degree preferred. Work on LLM/Gen AI based applications.…\r\nSkills: Machine learning, Natural language processing, Data science, AI, Communication skills',
       'job_description_Bachelor’s degree or higher in computer science or other closely related fields. Exhibits confidence and an extensive knowledge of emerging industry practices……\r\nSkills: MATLAB, Big data, Mobile applications, Business analysis, System design',
       'job_description_Bachelor’s degree or higher in computer science, aerospace or other closely related field and 3-4 or more years' related work experience.…\r\nSkills: CI/CD, Azure, Rust, Go, Node.js',
       'job_description_Based on their interests and our business needs, interns will be placed in roles that best suit them. Thorough grasp of computer science principles.…\r\nSkills: Computer science, C#, .NET, Analysis skills, Math',
       'job_d

## Modeling

In [22]:
-m python --version

SyntaxError: invalid syntax (3224696157.py, line 1)

In [40]:
pip install --upgrade pip


Collecting pip
  Downloading pip-24.3.1-py3-none-any.whl.metadata (3.7 kB)
Downloading pip-24.3.1-py3-none-any.whl (1.8 MB)
   ---------------------------------------- 0.0/1.8 MB ? eta -:--:--
   ----------------- ---------------------- 0.8/1.8 MB 5.1 MB/s eta 0:00:01
   ---------------------------------------- 1.8/1.8 MB 5.2 MB/s eta 0:00:00
Installing collected packages: pip
  Attempting uninstall: pip
    Found existing installation: pip 24.2
    Uninstalling pip-24.2:
      Successfully uninstalled pip-24.2
Successfully installed pip-24.3.1
Note: you may need to restart the kernel to use updated packages.


In [23]:
!pip install tensorflow

ERROR: Could not find a version that satisfies the requirement tensorflow (from versions: none)
ERROR: No matching distribution found for tensorflow


In [36]:
from tensorflow import keras

ModuleNotFoundError: No module named 'tensorflow'

## Cross validation and hyperparameter tuning

## Evaluation