In [137]:
import pandas as pd
import pickle
import numpy as np
import json
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm

In [123]:
class Predictor:
    def __init__(
        self, 
        name2_emb_pickle: str,
        generalized_work_class2idx_json: str,
        generalized2global_mapping_json: str,
        classifier
    ):
        with open(name2_emb_pickle, "rb") as f:
            self.name2_emb = pickle.load(f)
            
        with open(generalized_work_class2idx_json, "r") as f:
            self.generalized_work_class2idx = json.load(f)
            self.idx2generalized_work_class = {i: gwc for gwc, i in self.generalized_work_class2idx.items()}
                
        with open(generalized2global_mapping_json, "r") as f:
            self.generalized2global_mapping = json.load(f)
        
        self.classifier = classifier
        
        # Can be filled in `fit` method.
        self.workname2generalized_work_classes_single_entries = {}

    def fit(self, df: pd.DataFrame):
        df_ =  df[["work_name", "generalized_work_class"]].dropna()
        work_names = df_.work_name.to_list()
        generalized_work_classes = df_.generalized_work_class.to_list()
                
        generalized_work_classes_idxes = np.asarray([self.generalized_work_class2idx[gen_work_cls] for gen_work_cls in generalized_work_classes])
        
        embeddings = [self.name2_emb[n] for n in work_names]   
        embeddings = np.stack(embeddings)
        
        # generalized_work classes with single occurence are excluded from the classifier scope and stored separately.
        generalized_work_classes_occurences = pd.Series(generalized_work_classes_idxes).value_counts()
        single_appearings = generalized_work_classes_occurences[generalized_work_classes_occurences == 1].index.to_list()
        kept_samples_idxes = []
        for sample_idx, gen_work_cls_idx in enumerate(generalized_work_classes_idxes):
            if gen_work_cls_idx in single_appearings:
                gen_work_cls = self.idx2generalized_work_class[gen_work_cls_idx]
                work_name = work_names[sample_idx]
                self.workname2generalized_work_classes_single_entries[work_name] = gen_work_cls
            else:
                kept_samples_idxes.append(sample_idx)
        embeddings = embeddings[kept_samples_idxes]
        generalized_work_classes_idxes = generalized_work_classes_idxes[kept_samples_idxes]
               
        train_x, test_x, train_y, test_y = train_test_split(
            embeddings, 
            generalized_work_classes_idxes, 
            test_size=0.15, 
            stratify=generalized_work_classes_idxes, 
            random_state=1,
        )
        
        print(f"Train size: {len(train_x)}, Test size: {len(test_x)}")
        
        print("Fitting classifier...")
        self.classifier.fit(train_x, train_y)
        
        print("Validating classifier...")
        pred_y = self.classifier.predict(test_x)
        print("Precision: ", precision_score(test_y, pred_y, average="macro"))
        print("Recall: ", recall_score(test_y, pred_y, average="macro"))
    
    def predict(self, df: pd.DataFrame) -> pd.DataFrame:
        assert df.work_name.notnull().all(), "remove nullable `work_name` entries"
                
        prediction = df.copy()
                
        special_case_idxes = df.work_name.apply(lambda x: x in self.workname2generalized_work_classes_single_entries.keys())
        prediction.generalized_work_class[special_case_idxes] = prediction.work_name[special_case_idxes].apply(lambda x: self.workname2generalized_work_classes_single_entries[x])
        
        usual_case_idxes = ~special_case_idxes
        embeddings = prediction.work_name.apply(lambda x: self.name2_emb[x]).to_numpy()
        embeddings = np.stack(embeddings)
        generalized_work_class_idxes = self.classifier.predict(embeddings)
        generalized_work_classes = pd.Series([self.idx2generalized_work_class[i] for i in generalized_work_class_idxes], index=special_case_idxes.index)
        prediction.generalized_work_class[usual_case_idxes] = generalized_work_classes[usual_case_idxes]
        
        idxes_to_restore_generalized_work_class = df.generalized_work_class.notna()
        prediction.generalized_work_class[idxes_to_restore_generalized_work_class] = df.generalized_work_class[idxes_to_restore_generalized_work_class]
        
        prediction.global_work_class = prediction.generalized_work_class.apply(lambda x: self.generalized2global_mapping[x])
        
        idxes_to_resore_global_work_class = df.global_work_class.notna()
        prediction.global_work_class[idxes_to_resore_global_work_class] = df.global_work_class[idxes_to_resore_global_work_class]
        
        return prediction
    
    def save(self, pickle_path: str):
        with open(pickle_path, "wb") as f:
            pickle.dump(self, f)
            
    @staticmethod
    def load(pickle_path: str):
        with open(pickle_path, "rb") as f:
            return pickle.load(f)
        

In [124]:
classifier = RandomForestClassifier(
    n_estimators=100,
    n_jobs=-1, 
    random_state=1, 
    class_weight="balanced_subsample",
)

predictor = Predictor(
    "./name2emb.pickle",
    "./generalized_work_class2idx.json",
    "./generalized2global_mapping.json",
    classifier
)

In [92]:
df = pd.read_csv("lab2_oil_gas_field_construction_data.csv")

In [46]:
predictor.fit(df)
predictor.save("./random_forest_clf.pickle")

Train size: 252326, Test size: 44529
Fitting classifier...
Validating classifier...


  _warn_prf(average, modifier, msg_start, len(result))


Precision:  0.98057797468956
Recall:  0.9476099083574409


In [139]:
test_df = pd.read_csv("./lab2_test_dataset.csv", sep=";")
test_df.drop(columns=["index",], inplace=True)
test_df.dropna(inplace=True)

In [140]:
test_df.head()

Unnamed: 0,work_name,generalized_work_class,global_work_class
0,"монтаж шаровых кранов, дроссельной шайбы, запо...",Монтаж мк,Монтаж мк
1,монтаж кипиа и зра с электроприводом,Монтаж ЗРА,Монтаж
2,монтаж оборудования и приборов,Монтаж приборов,Монтаж
3,тх./поз.2.13.1-2/монтаж арматуры,Монтаж арматуры,Монтаж
4,"монтаж шаровых кранов, огнепреградителя, дросс...",Монтаж мк,Монтаж мк


In [141]:
test_df_for_model = test_df.copy()
test_df_for_model["generalized_work_class"] = np.nan
test_df_for_model["global_work_class"] = np.nan

In [142]:
test_df_for_model.head()

Unnamed: 0,work_name,generalized_work_class,global_work_class
0,"монтаж шаровых кранов, дроссельной шайбы, запо...",,
1,монтаж кипиа и зра с электроприводом,,
2,монтаж оборудования и приборов,,
3,тх./поз.2.13.1-2/монтаж арматуры,,
4,"монтаж шаровых кранов, огнепреградителя, дросс...",,


In [143]:
prediction = predictor.predict(test_df_for_model)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction.generalized_work_class[special_case_idxes] = prediction.work_name[special_case_idxes].apply(lambda x: self.workname2generalized_work_classes_single_entries[x])
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  prediction.generalized_work_class[usual_case_idxes] = generalized_work_classes[usual_case_idxes]


In [144]:
prediction.head()

Unnamed: 0,work_name,generalized_work_class,global_work_class
0,"монтаж шаровых кранов, дроссельной шайбы, запо...",Монтаж мк,Монтаж мк
1,монтаж кипиа и зра с электроприводом,Монтаж ЗРА,Монтаж
2,монтаж оборудования и приборов,Монтаж приборов,Монтаж
3,тх./поз.2.13.1-2/монтаж арматуры,Монтаж арматуры,Монтаж
4,"монтаж шаровых кранов, огнепреградителя, дросс...",Монтаж мк,Монтаж мк


In [149]:
print("Ratio of correctly predicted generalized_work_class", (prediction.generalized_work_class == test_df.generalized_work_class).sum() / len(prediction))
print("Ratio of correctly predicted global_work_class", (prediction.global_work_class == test_df.global_work_class).sum() / len(prediction))

Ratio of correctly predicted generalized_work_class 0.9932845549817586
Ratio of correctly predicted global_work_class 0.993709007488555


In [148]:
print("Precision (generalized_work_class): ", precision_score(test_df.generalized_work_class, prediction.generalized_work_class, average="macro"))
print("Recall (generalized_work_class): ", recall_score(test_df.generalized_work_class, prediction.generalized_work_class, average="macro"))
print()
print("Precision (global_work_class): ", precision_score(test_df.global_work_class, prediction.global_work_class, average="macro"))
print("Recall (global_work_class): ", recall_score(test_df.global_work_class, prediction.global_work_class, average="macro"))

  _warn_prf(average, modifier, msg_start, len(result))


Precision (generalized_work_class):  0.9763076038884346
Recall (generalized_work_class):  0.930060970778018



  _warn_prf(average, modifier, msg_start, len(result))


Precision (global_work_class):  0.9782718595314528
Recall (global_work_class):  0.9496714258703001
