In [1]:
#load standard libs
import csv
import pandas as pd 
import numpy as np

#load sklearn modules
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

In [16]:
# Paths of the tab-separated input files: occupations, interests and work values.
occupation_file, intrests_file, work_values_file = (
    "db_28_0_text/Occupation Data.txt",
    "db_28_0_text/Interests.txt",
    "db_28_0_text/Work Values.txt",
)



In [3]:
# Load the training data and normalise the element names to lower case.
def _load_element_table(path):
    """Read a tab-separated element file and pivot it to one row per occupation.

    The file must provide the columns "O*NET-SOC Code", "Element Name" and
    "Data Value". Element names are lower-cased so the resulting column
    labels are case-insensitive.

    Parameters
    ----------
    path : str
        Location of the tab-separated file.

    Returns
    -------
    pandas.DataFrame
        Indexed by "O*NET-SOC Code", with one column per lower-cased element
        name holding the corresponding "Data Value".
    """
    long_table = pd.read_csv(path, delimiter="\t")
    # Vectorised string accessor instead of a per-element lambda; unlike
    # x.lower() it also passes missing names through instead of raising.
    long_table["Element Name"] = long_table["Element Name"].str.lower()
    return long_table.pivot(
        index="O*NET-SOC Code", columns="Element Name", values="Data Value"
    )


df_occ = pd.read_csv(occupation_file, delimiter="\t")
df_intrests = _load_element_table(intrests_file)
df_work_values = _load_element_table(work_values_file)


In [4]:
# Inner-join the occupation table with the interests and work-values tables on
# the SOC code, so only occupations present in all three tables are kept.
data = (
    df_occ
    .join(df_intrests, on="O*NET-SOC Code", how="inner", rsuffix="_intrests")
    .join(df_work_values, on="O*NET-SOC Code", how="inner", rsuffix="_work_values")
)

In [7]:
# Feature columns: every distinct column of `data` except the title, the SOC
# code and the description.
training_cols = []
for column in data.columns.unique():
    if column in ("Title", "O*NET-SOC Code", "Description"):
        continue
    training_cols.append(column)


In [8]:
# Prepare the data for training.
# Targets: occupation titles encoded as integer class labels.
label_encoder = LabelEncoder().fit(data["Title"])
y_train = label_encoder.transform(data["Title"])

# Features: rename the one column containing a space, then order the columns
# alphabetically.
x_train = (
    data[training_cols]
    .rename(columns={"working conditions": "working_conditions"})
    .pipe(lambda frame: frame.reindex(sorted(frame.columns), axis=1))
)

In [9]:
# Define the model: standardise the features, then classify with k-nearest
# neighbours (11 neighbours, Minkowski distance with p=2).
clf = Pipeline(
    steps=[
        ("scaler", StandardScaler()),
        ("knn", KNeighborsClassifier(n_neighbors=11, p=2, leaf_size=1)),
    ]
)

In [11]:
# Fit the pipeline and report its accuracy on the training data itself.
# fit() returns the fitted pipeline, so the score can be chained onto it.
clf.fit(x_train, y_train).score(x_train, y_train)

0.10983981693363844

In [13]:
# Measure recall@10 on the training data: the share of rows whose true label is
# among the 10 classes with the highest predicted probability.
res = clf.predict_proba(x_train)

# argpartition raises when asked for more entries than a row has, so cap k at
# the number of classes.
top_k = min(10, res.shape[1])

# NOTE(review): with n_neighbors=11 and (presumably default) uniform weights,
# more than 10 classes can be tied at the same probability; which of them end
# up in the top 10 is then arbitrary. Consider ranking by neighbour distance.
recall_10 = 0
for i in range(len(x_train)):
    # Columns of predict_proba follow clf.classes_, so map column positions to
    # encoded labels explicitly instead of assuming position == label.
    top_10 = clf.classes_[np.argpartition(res[i], -top_k)[-top_k:]]
    # Compare encoded labels directly; decoding both sides back to titles on
    # every row is unnecessary.
    if np.any(top_10 == y_train[i]):
        recall_10 += 1
print(recall_10 / len(x_train))


0.8935926773455377


In [17]:
import pickle

# Persist the fitted pipeline and the label encoder as separate pickle files.
for pickle_path, artifact in (
    ("model.pickle", clf),
    ("label_encoder.pickle", label_encoder),
):
    with open(pickle_path, "wb") as fh:
        pickle.dump(artifact, fh, protocol=pickle.HIGHEST_PROTOCOL)