In [1]:
import numpy as np
import pandas as pd
from sentence_transformers import SentenceTransformer
from autogluon.tabular import TabularPredictor

# Load the train / test / validation splits from CSV.
train_df = pd.read_csv('raw-data/train.csv')
test_df = pd.read_csv('raw-data/test.csv')
valid_df = pd.read_csv('raw-data/valid.csv')

# Replace missing 'requirements_and_role' text with an empty string in EVERY
# split, not just train: each split's text column is later passed to the
# sentence encoder, which expects strings rather than float NaN.
# The column is reassigned instead of calling fillna(..., inplace=True) on a
# column selection, because that chained in-place form is deprecated and does
# not modify the parent frame under pandas Copy-on-Write.
for _split_df in (train_df, test_df, valid_df):
    _split_df['requirements_and_role'] = (
        _split_df['requirements_and_role'].fillna('')
    )

# Separate labeled and unlabeled data by position.
# NOTE(review): assumes the first 8000 rows of train.csv are the labeled ones
# and the remainder are unlabeled -- confirm against the dataset description.
labeled_train_df = train_df.iloc[:8000]
unlabeled_train_df = train_df.iloc[8000:]

# Load the pretrained sentence-embedding model used for all three splits.
sbert_model = SentenceTransformer('paraphrase-MiniLM-L6-v2')


def _embed_text(frame):
    """Return SBERT embeddings for the 'requirements_and_role' column of *frame*."""
    texts = frame['requirements_and_role'].tolist()
    return sbert_model.encode(texts)


# Vectorize the text of each split (labeled train, test, validation).
X_labeled_train = _embed_text(labeled_train_df)
X_test = _embed_text(test_df)
X_valid = _embed_text(valid_df)

# Create DataFrames for AutoGluon: wrap each embedding matrix in a frame with
# a fresh default index and integer column names.
train_features = pd.DataFrame(X_labeled_train)
valid_features = pd.DataFrame(X_valid)

train_labels = labeled_train_df['salary_bin']
valid_labels = valid_df['salary_bin']

# Attach the labels by POSITION (.to_numpy()) rather than by index.
# Assigning a Series aligns on index labels, which is only correct while the
# source frames carry a default 0..n-1 index; with a filtered or re-indexed
# source it would silently produce NaN or shifted labels. Row i of the
# features was encoded from row i of the source frame, so positional
# assignment is the intended pairing, and a length mismatch now raises
# instead of being padded with NaN.
train_data = train_features.copy()
train_data['salary_bin'] = train_labels.to_numpy()

valid_data = valid_features.copy()
valid_data['salary_bin'] = valid_labels.to_numpy()

# Train an AutoGluon model on the embedding features.
# The problem type is stated explicitly: 'salary_bin' is read as float, so
# AutoGluon would otherwise have to infer classification vs. regression from
# the number of unique label values. 'multiclass' is what it inferred for this
# data (10 distinct bins), so pinning it keeps the behavior and removes the
# dependence on that heuristic.
predictor = TabularPredictor(
    label='salary_bin',
    problem_type='multiclass',
).fit(train_data)

# Score the fitted predictor on the held-out validation split and show the
# resulting metrics.
performance = predictor.evaluate(valid_data)
print(performance)

# Predict a salary bin for every test row from its embedding.
test_features = pd.DataFrame(X_test)
predictions = predictor.predict(test_features)

# Pair each job_id with its predicted bin for the submission file.
# NOTE(review): the labels were read as floats (e.g. 9.0), so the predicted
# bins are presumably written as floats too -- confirm the expected
# submission format accepts that.
_submission_columns = {
    'job_id': test_df['job_id'],
    'salary_bin': predictions,
}
submission = pd.DataFrame(_submission_columns)

# Write the submission to CSV without the row index.
submission.to_csv('test_predictions_auto_gluon_sbert.csv', index=False)

Downloading (…)001fa/.gitattributes:   0%|          | 0.00/690 [00:00<?, ?B/s]

Downloading (…)_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Downloading (…)3bbb8001fa/README.md:   0%|          | 0.00/3.69k [00:00<?, ?B/s]

Downloading (…)bb8001fa/config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading (…)ce_transformers.json:   0%|          | 0.00/122 [00:00<?, ?B/s]

Downloading pytorch_model.bin:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

Downloading (…)nce_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

Downloading (…)cial_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

Downloading (…)001fa/tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

Downloading (…)okenizer_config.json:   0%|          | 0.00/314 [00:00<?, ?B/s]

Downloading (…)3bbb8001fa/vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

Downloading (…)b8001fa/modules.json:   0%|          | 0.00/229 [00:00<?, ?B/s]

No path specified. Models will be saved in: "AutogluonModels\ag-20230504_112031\"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20230504_112031\"
AutoGluon Version:  0.7.0
Python Version:     3.10.11
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22000
Train Data Rows:    8000
Train Data Columns: 384
Label Column: salary_bin
Preprocessing data ...
AutoGluon infers your prediction problem is: 'multiclass' (because dtype of label-column == float, but few unique label-values observed and label-values can be converted to int).
	10 unique label values:  [9.0, 4.0, 6.0, 3.0, 1.0, 0.0, 2.0, 8.0, 7.0, 5.0]
	If 'multiclass' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (You may specify problem_type as one of: ['binary', 'multiclass', 'regression'])
Train Data Class Count: 10
Using Feature Generators to preprocess the data ...
Fitting AutoMLPipelineFeatureGenerator...


{'accuracy': 0.2504317789291883, 'balanced_accuracy': 0.24684796098838585, 'mcc': 0.16722763632222662}
