Imports

In [18]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from tensorflow.keras.layers import LSTM, Dense, Dropout, Input
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

from sklearn.metrics import r2_score

In [None]:
# Salary regression with a stacked LSTM.
#
# Steps: load the CSV, engineer/encode columns on `df`, build the numeric
# feature matrix, split, scale (statistics from the training split only),
# train the network and report RMSE and R-squared on the held-out split.

# Tunable settings for this cell.
DATA_PATH = "jobs_in_data.csv"
TEST_SIZE = 0.2
RANDOM_STATE = 42  # seeds the train/test split only
LEARNING_RATE = 0.01
EPOCHS = 100
BATCH_SIZE = 64
VALIDATION_SPLIT = 0.1
# NOTE(review): the network's weight initialisation is not seeded anywhere in
# this cell, so the printed metrics can differ from run to run.

df = pd.read_csv(DATA_PATH)

# Divide 'salary_in_usd' by 1000 so the target (and the RMSE printed below)
# is expressed in thousands of USD.
df['salary_in_usd'] /= 1000

# Ratio of "salary_in_usd" (already divided by 1000) to "salary".
df['salary_ratio'] = df['salary_in_usd'] / df['salary']

# Encode the categorical columns as integers. Series.map() produces NaN for
# any label that is missing from its mapping; this is checked before training.

# Map experience levels to ordinal numbers
experience_mapping = {'Entry-level': 1, 'Mid-level': 2, 'Senior': 3, 'Executive': 4}
df['experience_level_encoded'] = df['experience_level'].map(experience_mapping)

# Mapping for 'work_setting'
work_setting_mapping = {'Hybrid': 1, 'In-person': 2, 'Remote': 3}
df['work_setting_encoded'] = df['work_setting'].map(work_setting_mapping)

# Mapping for 'employment_type'
employment_type_mapping = {'Full-time': 1, 'Part-time': 2, 'Contract': 3, 'Freelance': 4}
df['employment_type_encoded'] = df['employment_type'].map(employment_type_mapping)

# Mapping for 'company_size'
company_size_mapping = {'L': 1, 'M': 2, 'S': 3}
df['company_size_encoded'] = df['company_size'].map(company_size_mapping)

# Percentile rank of each salary within its job category.
df['Percentile'] = df.groupby('job_category')['salary'].rank(pct=True)

# Min-max rescale the percentile ranks so they span exactly 0 to 1.
min_percentile = df['Percentile'].min()
max_percentile = df['Percentile'].max()
df['Normalized_Salary_within_Job_Category'] = (
    (df['Percentile'] - min_percentile) / (max_percentile - min_percentile)
)

# The intermediate 'Percentile' column is no longer needed.
df.drop(columns=['Percentile'], inplace=True)

# Define features and target variable.
# 'salary_ratio' and 'Normalized_Salary_within_Job_Category' are computed from
# the salary itself, so using them as inputs leaks the target into the model.
# They stay on `df` for inspection but are excluded from the feature matrix.
LEAKY_COLUMNS = ["salary_ratio", "Normalized_Salary_within_Job_Category"]
X_numerical = df.select_dtypes(include=np.number).drop(
    columns=["salary_in_usd", "salary", *LEAKY_COLUMNS])  # numeric columns only
y = df["salary_in_usd"]

# Fail fast on missing feature values (e.g. a category label absent from the
# mappings above) instead of discovering NaN predictions after training.
missing_per_feature = X_numerical.isna().sum()
if missing_per_feature.any():
    raise ValueError(
        "NaN values in model features (unmapped category labels?): "
        f"{missing_per_feature[missing_per_feature > 0].to_dict()}"
    )

# Split first, then scale: the scaler must learn its mean/std from the
# training rows only, otherwise test-set statistics leak into training.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_numerical, y, test_size=TEST_SIZE, random_state=RANDOM_STATE)

scaler = StandardScaler()
X_train_num = scaler.fit_transform(X_train_raw)
X_test_num = scaler.transform(X_test_raw)
# Full matrix scaled with the training statistics (name kept from the
# original cell).
X_numerical_scaled = scaler.transform(X_numerical)

# Add a trailing axis: the arrays become (samples, n_features, 1), i.e. the
# LSTM walks over the feature columns as a sequence with one value per step.
X_train_reshaped = X_train_num.reshape(X_train_num.shape[0], X_train_num.shape[1], 1)
X_test_reshaped = X_test_num.reshape(X_test_num.shape[0], X_test_num.shape[1], 1)

# Define LSTM model: two stacked LSTM layers followed by a linear output unit.
input_layer = Input(shape=(X_train_reshaped.shape[1], X_train_reshaped.shape[2]))
lstm_layer = LSTM(128, return_sequences=True)(input_layer)
dropout_layer = Dropout(0.0)(lstm_layer)  # rate 0.0: currently drops nothing
lstm_layer2 = LSTM(64)(dropout_layer)
output_layer = Dense(1, activation='linear')(lstm_layer2)
model = Model(inputs=input_layer, outputs=output_layer)

# Compile the model
optimizer = Adam(learning_rate=LEARNING_RATE)
model.compile(optimizer=optimizer, loss='mean_squared_error', metrics=['mse'])

# Train the model; the last VALIDATION_SPLIT fraction of the training data is
# held out by Keras for per-epoch validation.
history = model.fit(
    X_train_reshaped,
    y_train,
    epochs=EPOCHS,
    batch_size=BATCH_SIZE,
    validation_split=VALIDATION_SPLIT,
)

# Evaluate the model on the held-out test split (RMSE is in thousands of USD).
y_pred = model.predict(X_test_reshaped)
mse = mean_squared_error(y_test, y_pred)
print("RMSE:", np.sqrt(mse))

# R-squared is a goodness-of-fit score for regression, not a classification
# accuracy, so it is reported under its real name. The `accuracy` variable
# name is kept from the original cell.
r2 = r2_score(y_test, y_pred)
accuracy = np.round(r2 * 100, 2)
print("R-squared (%):", accuracy)


In [29]:
# Exploratory summary of `df`: schema, summary statistics, missing-value
# counts and pairwise correlations of the numeric columns.

# 1. Feature Descriptions
print("Feature Descriptions:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# is called directly; wrapping it in print() appends a stray "None" line.
df.info()

# 2. Data Distribution
print("\nData Distribution:")
print(df.describe())  # Summary statistics for numerical features

# 3. Missing Values
print("\nMissing Values:")
print(df.isnull().sum())  # Count of missing values in each column

# 4. Correlation Analysis
print("\nCorrelation Analysis:")
numeric_df = df.select_dtypes(include=['number'])  # Numerical columns only
correlation_matrix = numeric_df.corr()  # Pairwise correlation matrix
print(correlation_matrix)

Feature Descriptions:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9355 entries, 0 to 9354
Data columns (total 18 columns):
 #   Column                                 Non-Null Count  Dtype  
---  ------                                 --------------  -----  
 0   work_year                              9355 non-null   int64  
 1   job_title                              9355 non-null   object 
 2   job_category                           9355 non-null   object 
 3   salary_currency                        9355 non-null   object 
 4   salary                                 9355 non-null   int64  
 5   salary_in_usd                          9355 non-null   float64
 6   employee_residence                     9355 non-null   object 
 7   experience_level                       9355 non-null   object 
 8   employment_type                        9355 non-null   object 
 9   work_setting                           9355 non-null   object 
 10  company_location                       9355 non-nu