In [1]:
import pandas as pd
import numpy as np

# Load the pre-computed feature matrices and the raw CSV splits.
# Each group is unpacked in (train, test, valid) order.
train_embeddings, test_embeddings, valid_embeddings = map(np.load, (
    'embeddings-data/train-embeddings.npy',
    'embeddings-data/test-embeddings.npy',
    'embeddings-data/valid-embeddings.npy',
))

train_tfidf, test_tfidf, valid_tfidf = map(np.load, (
    'tfidf-data/train-tfidf.npy',
    'tfidf-data/test-tfidf.npy',
    'tfidf-data/valid-tfidf.npy',
))

train_df, test_df, valid_df = map(pd.read_csv, (
    'raw-data/train.csv',
    'raw-data/test.csv',
    'raw-data/valid.csv',
))

In [2]:
# Preprocessing
# The first N_LABELED rows of the training split are used as labeled data;
# everything after them is treated as unlabeled.
N_LABELED = 8000

# Fill NaN values in the 'requirements_and_role' column with an empty string.
# The result is assigned back instead of calling fillna(inplace=True) on the
# column selection: the chained in-place form emits a FutureWarning on recent
# pandas and silently leaves train_df unchanged once Copy-on-Write is enabled.
train_df['requirements_and_role'] = train_df['requirements_and_role'].fillna('')

# Separate labeled and unlabeled data
labeled_train_df = train_df.iloc[:N_LABELED]
unlabeled_train_df = train_df.iloc[N_LABELED:]

# Separate the features for the labeled / unlabeled portions
X_labeled_train_embeddings = train_embeddings[:N_LABELED]
X_labeled_train_tfidf = train_tfidf[:N_LABELED]
X_unlabeled_train_embeddings = train_embeddings[N_LABELED:]
X_unlabeled_train_tfidf = train_tfidf[N_LABELED:]

# "combined" features are the embedding columns followed by the TF-IDF columns
X_labeled_train_combined = np.concatenate((X_labeled_train_embeddings, X_labeled_train_tfidf), axis=1)
X_unlabeled_train_combined = np.concatenate((X_unlabeled_train_embeddings, X_unlabeled_train_tfidf), axis=1)

valid_combined = np.concatenate((valid_embeddings, valid_tfidf), axis=1)

# Target vectors for the labeled training rows and the validation split
y_train_salary_bin = labeled_train_df['salary_bin'].values
y_valid_salary_bin = valid_df['salary_bin'].values

In [3]:
# Name of the target column, reused when building AutoGluon training frames.
label = 'salary_bin'

# Per feature view: (labeled train features, validation features, unlabeled train features).
_feature_views = {
    'combined': (X_labeled_train_combined, valid_combined, X_unlabeled_train_combined),
    'embeddings': (X_labeled_train_embeddings, valid_embeddings, X_unlabeled_train_embeddings),
    'tfidf': (X_labeled_train_tfidf, valid_tfidf, X_unlabeled_train_tfidf),
}

# Expand each view into the 5-tuple consumed downstream:
# (X_train, y_train, X_valid, y_valid, X_unlabeled). The label arrays are shared by all views.
datasets = {
    view_name: (X_lab, y_train_salary_bin, X_val, y_valid_salary_bin, X_unlab)
    for view_name, (X_lab, X_val, X_unlab) in _feature_views.items()
}

In [4]:
from autogluon.tabular import TabularPredictor

# Pseudo-labelling: for every feature view, fit AutoGluon on the labeled rows and
# predict labels for the unlabeled rows. The kept samples and their predicted
# labels are stored per dataset name for later use by train_model().
confident_samples_dict = {}
confident_predictions_dict = {}

# Minimum predicted-class probability a pseudo-label needs in order to be kept.
# 0.0 keeps every unlabeled row (the behaviour this notebook's recorded results
# were produced with); raise it (e.g. 0.5) to keep only confident predictions.
CONFIDENCE_THRESHOLD = 0.0

for data_name, (X_train, y_train, _, _, X_unlabeled) in datasets.items():
    # Use AutoGluon to predict labels for unlabeled data
    train_data = pd.DataFrame(X_train)
    train_data['label'] = y_train

    predictor = TabularPredictor(label='label').fit(train_data=train_data)

    unlabeled_data = pd.DataFrame(X_unlabeled)
    unlabeled_predictions = predictor.predict(unlabeled_data)

    pseudo_X = X_unlabeled
    pseudo_y = unlabeled_predictions.values

    if CONFIDENCE_THRESHOLD > 0:
        # Only pay for the extra predict_proba pass when filtering is requested.
        # predict_proba returns one probability column per class; the row-wise
        # maximum is the confidence of the predicted class.
        max_proba = predictor.predict_proba(unlabeled_data).max(axis=1).to_numpy()
        keep = max_proba >= CONFIDENCE_THRESHOLD
        pseudo_X, pseudo_y = pseudo_X[keep], pseudo_y[keep]

    confident_samples_dict[data_name] = pseudo_X
    confident_predictions_dict[data_name] = pseudo_y

No path specified. Models will be saved in: "AutogluonModels\ag-20230505_014847\"
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20230505_014847\"
AutoGluon Version:  0.7.0
Python Version:     3.10.11
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22000
Train Data Rows:    8000
Train Data Columns: 884
Label Column: label
Preprocessing data ...
AutoGluon infers your prediction problem is: 'multiclass' (because dtype of label-column == float, but few unique label-values observed and label-values can be converted to int).
	10 unique label values:  [9.0, 4.0, 6.0, 3.0, 1.0, 0.0, 2.0, 8.0, 7.0, 5.0]
	If 'multiclass' is not the correct problem_type, please manually specify the problem_type parameter during predictor init (

[1000]	valid_set's multi_error: 0.74375


	0.265	 = Validation score   (accuracy)
	165.97s	 = Training   runtime
	0.18s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	0.2975	 = Validation score   (accuracy)
	0.45s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 540.44s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels\ag-20230505_021223\")
No path specified. Models will be saved in: "AutogluonModels\ag-20230505_022125\"
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20230505_022125\"
AutoGluon Version:  0.7.0
Python Version:     3.10.11
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22000
Train Data Rows:    8000
Train Data Columns: 500
Label Column: label
Preprocessing data ...
AutoGluon infers your prediction problem is: 'multiclass' (because dtype of label-column == float, but few unique label-values observed and label-v

# Model training

In [5]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier


def train_model(model, X_train, y_train, X_unlabeled=None, use_unlabeled_data=False, dataset_name=None,
                y_unlabeled=None):
    """Fit ``model`` on the labeled data, optionally augmented with pseudo-labeled rows.

    Parameters
    ----------
    model : estimator exposing ``fit(X, y)``
        Fitted in place and returned.
    X_train, y_train : array-like
        Labeled training features and targets.
    X_unlabeled : array-like, optional
        Unlabeled features. Must be given for any augmentation to happen.
    use_unlabeled_data : bool, default False
        When False the model is trained on the labeled data only.
    dataset_name : str, optional
        Key into the notebook-level ``confident_samples_dict`` /
        ``confident_predictions_dict``. Used only when ``y_unlabeled`` is not given.
    y_unlabeled : array-like, optional
        Pseudo-labels for ``X_unlabeled``. When provided, ``X_unlabeled`` and
        ``y_unlabeled`` are appended directly and no global state is consulted.

    Returns
    -------
    The same ``model`` object after fitting.

    Raises
    ------
    ValueError
        If the extra samples and extra labels have different lengths.
    KeyError
        If ``dataset_name`` is used and is missing from the pseudo-label dictionaries.
    """
    extra_X = extra_y = None

    if use_unlabeled_data and X_unlabeled is not None:
        if y_unlabeled is not None:
            # Explicit pseudo-labels: no dependency on notebook globals.
            extra_X, extra_y = X_unlabeled, y_unlabeled
        elif dataset_name is not None:
            # Fallback: look up the pseudo-labeled data stored under this dataset name.
            extra_X = confident_samples_dict[dataset_name]
            extra_y = confident_predictions_dict[dataset_name]

    if extra_X is not None:
        if len(extra_X) != len(extra_y):
            raise ValueError(
                f'pseudo-labeled features ({len(extra_X)}) and labels ({len(extra_y)}) differ in length'
            )
        # Combine the labeled and predicted unlabeled data
        X_train = np.concatenate((X_train, extra_X), axis=0)
        y_train = np.concatenate((y_train, extra_y), axis=0)

    # Train the model
    model.fit(X_train, y_train)

    return model

# Fixed seed so the stochastic estimators give the same scores on every run.
SEED = 42


def evaluate_model(model, X_valid, y_valid):
    """Score a fitted classifier on a validation split.

    Returns a dict keyed 'Accuracy', 'Precision', 'Recall', 'F1-score';
    precision, recall and F1 use the 'weighted' average across classes.
    """
    predictions = model.predict(X_valid)
    return {
        'Accuracy': accuracy_score(y_valid, predictions),
        'Precision': precision_score(y_valid, predictions, average='weighted'),
        'Recall': recall_score(y_valid, predictions, average='weighted'),
        'F1-score': f1_score(y_valid, predictions, average='weighted'),
    }


results = []

models = {
    'KNN': KNeighborsClassifier(n_neighbors=5),
    # multi_class is left at its default: 'auto' was the default and the parameter is deprecated.
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'SVM': SVC(probability=True),
    'Random Forest': RandomForestClassifier(criterion='gini', random_state=SEED)
}

for data_name, (X_train, y_train, X_valid, y_valid, X_unlabeled) in datasets.items():
    for model_name, model in models.items():
        # First pass: labeled data only. Second pass: labeled + pseudo-labeled data.
        # fit() re-initialises the estimator, so reusing the same instance is safe.
        for use_unlabeled in (False, True):
            fitted_model = train_model(
                model, X_train, y_train, X_unlabeled,
                use_unlabeled_data=use_unlabeled, dataset_name=data_name
            )

            # Calculate the performance metrics
            metrics = evaluate_model(fitted_model, X_valid, y_valid)

            suffix = ' with self-training' if use_unlabeled else ''
            print(f'{model_name} on {data_name} data{suffix}')
            for metric_name, metric_value in metrics.items():
                print(f'{metric_name}: {metric_value}')
            print()

            results.append({
                'Model': model_name,
                'Dataset': data_name,
                'Use-unlabeled-data': 'Yes' if use_unlabeled else 'No',
                **metrics
            })

# Convert the results list to a DataFrame
results_df = pd.DataFrame(results)

# Show the results DataFrame in descending order of Accuracy as a table
results_df.sort_values(by='Accuracy', ascending=False).reset_index(drop=True)

KNN on combined data
Accuracy: 0.2141623488773748
Precision: 0.20668085627261887
Recall: 0.2141623488773748
F1-score: 0.20568786889198232

KNN on combined data with self-training
Accuracy: 0.2251007484168106
Precision: 0.209483017465995
Recall: 0.2251007484168106
F1-score: 0.21075614902757725

Logistic Regression on combined data
Accuracy: 0.2274035693724813
Precision: 0.21598954604961518
Recall: 0.2274035693724813
F1-score: 0.21808175286307932

Logistic Regression on combined data with self-training
Accuracy: 0.23546344271732872
Precision: 0.2194381699617523
Recall: 0.23546344271732872
F1-score: 0.21339612748724743

SVM on combined data
Accuracy: 0.24870466321243523
Precision: 0.23618464970455344
Recall: 0.24870466321243523
F1-score: 0.2325197677033256

SVM on combined data with self-training
Accuracy: 0.25215889464594127
Precision: 0.24022487585927163
Recall: 0.25215889464594127
F1-score: 0.22527018413211408

Random Forest on combined data
Accuracy: 0.2504317789291883
Precision: 0.24

Unnamed: 0,Model,Dataset,Use-unlabeled-data,Accuracy,Precision,Recall,F1-score
0,Random Forest,combined,Yes,0.254462,0.25594,0.254462,0.217983
1,SVM,embeddings,No,0.253886,0.245809,0.253886,0.235848
2,SVM,combined,Yes,0.252159,0.240225,0.252159,0.22527
3,Random Forest,combined,No,0.250432,0.240302,0.250432,0.237468
4,Random Forest,tfidf,Yes,0.249856,0.239016,0.249856,0.212757
5,SVM,combined,No,0.248705,0.236185,0.248705,0.23252
6,Random Forest,tfidf,No,0.245826,0.228841,0.245826,0.227522
7,SVM,tfidf,Yes,0.244675,0.224344,0.244675,0.214895
8,Random Forest,embeddings,Yes,0.244675,0.232048,0.244675,0.202658
9,SVM,tfidf,No,0.243523,0.228687,0.243523,0.228357


# Kaggle submission

## AutoGluon

In [6]:
# AutoGluon - Without self-training
from autogluon.tabular import TabularPredictor

# Training frame: combined (embeddings + TF-IDF) labeled features plus the target column.
X_train = X_labeled_train_combined
train_data = pd.DataFrame(X_train)
train_data[label] = y_train_salary_bin

# Test frame: the same feature layout, embeddings first and TF-IDF second.
X_test = np.hstack((test_embeddings, test_tfidf))
test_data = pd.DataFrame(X_test)

# Fit AutoGluon on the labeled rows only (no pseudo-labels here).
predictor = TabularPredictor(label=label)
predictor = predictor.fit(train_data=train_data)

# Predict the test split and cast the predicted labels to int for the submission file.
predictions = predictor.predict(test_data).astype(int)

# One row per test job: its id and the predicted label.
submission = pd.DataFrame({'job_id': test_df['job_id'], label: predictions})
submission.to_csv('test_predictions_auto_gluon.csv', index=False)

No path specified. Models will be saved in: "AutogluonModels\ag-20230505_025252\"
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels\ag-20230505_025252\"
AutoGluon Version:  0.7.0
Python Version:     3.10.11
Operating System:   Windows
Platform Machine:   AMD64
Platform Version:   10.0.22000
Train Data Rows:    8000
Train Data Columns: 884
Label Column: salary_bin
Preprocessing data ...
AutoGluon infers your prediction problem is: 'multiclass' (because dtype of label-column == float, but few unique label-values observed and label-values can be converted to int).
	10 unique label values:  [9.0, 4.0, 6.0, 3.0, 1.0, 0.0, 2.0, 8.0, 7.0, 5.0]
	If 'multiclass' is not the correct problem_type, please manually specify the problem_type parameter during predictor i