In [1]:
import pandas as pd
import pickle
import matplotlib.pyplot as plt
import numpy as np
from sklearn.manifold import TSNE
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, precision_score, recall_score
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neural_network import MLPClassifier

In [2]:
with open("name2emb.pickle", "rb") as f:
    name2emb = pickle.load(f)

In [3]:
df = pd.read_csv("lab2_oil_gas_field_construction_data.csv")

In [4]:
names = []
gwcs = []
for _, row in df[["work_name", "generalized_work_class"]].iterrows():
    work_name = row["work_name"]
    generalized_work_class = row["generalized_work_class"]
    if work_name not in names and generalized_work_class is not np.nan:
        names.append(work_name)
        gwcs.append(generalized_work_class)

In [5]:
gwc2id = {gwc: i for i, gwc in enumerate(set(gwcs))}
id2gwc = {i: gwc for i, gwc in enumerate(set(gwcs))}

gwc_ids = np.asarray([gwc2id[gwc] for gwc in gwcs])

In [6]:
embeddings = [name2emb[n] for n in names]   
embeddings = np.stack(embeddings)

In [7]:
vc = pd.Series(gwc_ids).value_counts()
single_appearings = vc[vc == 1].index.to_list()
to_keep = [i for i, e in enumerate(gwc_ids) if e not in single_appearings]
embeddings = embeddings[to_keep]
gwc_ids = gwc_ids[to_keep]

In [8]:
train_x, test_x, train_y, test_y = train_test_split(embeddings, gwc_ids, test_size=0.15, stratify=gwc_ids, random_state=1)

In [9]:
model = RandomForestClassifier(n_jobs=4, random_state=1, class_weight="balanced_subsample")
model.fit(train_x, train_y)

In [10]:
pred_y = model.predict(train_x)
print("Precision: ", precision_score(train_y, pred_y, average="macro"))
print("Recall: ", recall_score(train_y, pred_y, average="macro"))

Precision:  1.0
Recall:  1.0


In [11]:
pred_y = model.predict(test_x)
print("Precision: ", precision_score(test_y, pred_y, average="macro"))
print("Recall: ", recall_score(test_y, pred_y, average="macro"))

Precision:  0.6234373602920458
Recall:  0.4465564289637802


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [12]:
model = LogisticRegression(penalty="l2", class_weight="balanced", random_state=1, n_jobs=4)
model.fit(train_x, train_y)

In [13]:
pred_y = model.predict(train_x)
print("Precision: ", precision_score(train_y, pred_y, average="macro"))
print("Recall: ", recall_score(train_y, pred_y, average="macro"))

Precision:  0.38073694505047956
Recall:  0.6385104362618624


  _warn_prf(average, modifier, msg_start, len(result))


In [14]:
pred_y = model.predict(test_x)
print("Precision: ", precision_score(test_y, pred_y, average="macro"))
print("Recall: ", recall_score(test_y, pred_y, average="macro"))

Precision:  0.2891260491163791
Recall:  0.38585056857639416


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [15]:
model = SGDClassifier(penalty="elasticnet",  random_state=1, n_jobs=4)
model.fit(train_x, train_y)

In [16]:
pred_y = model.predict(train_x)
print("Precision: ", precision_score(train_y, pred_y, average="macro"))
print("Recall: ", recall_score(train_y, pred_y, average="macro"))

Precision:  0.2899980548845073
Recall:  0.21298193488792114


  _warn_prf(average, modifier, msg_start, len(result))


In [17]:
pred_y = model.predict(test_x)
print("Precision: ", precision_score(test_y, pred_y, average="macro"))
print("Recall: ", recall_score(test_y, pred_y, average="macro"))

Precision:  0.2860073125923038
Recall:  0.22773500094424276


  _warn_prf(average, modifier, msg_start, len(result))


In [18]:
model = MLPClassifier((512, 512), learning_rate="adaptive", random_state=1, batch_size=2048)
model.fit(train_x, train_y)



In [19]:
pred_y = model.predict(train_x)
print("Precision: ", precision_score(train_y, pred_y, average="macro"))
print("Recall: ", recall_score(train_y, pred_y, average="macro"))

Precision:  0.9360218707635168
Recall:  0.819832878352313


  _warn_prf(average, modifier, msg_start, len(result))


In [20]:
pred_y = model.predict(test_x)
print("Precision: ", precision_score(test_y, pred_y, average="macro"))
print("Recall: ", recall_score(test_y, pred_y, average="macro"))

Precision:  0.6022330137044871
Recall:  0.5356522797730416


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
