In [3]:
import pandas as pd
df = pd.read_csv("labels_and_filenames.csv",names=["label","filename"],header=1)
print(df.shape)
df.head(3)

(180, 2)


Unnamed: 0,label,filename
0,process_plan,G36 & 5.56 S4Hana STE Impl Delivery Costing Sh...
1,process_plan,G36 & 5.56 S4Hana STE Impl Delivery Costing Sh...
2,process_plan,G36 & 5.56 STE Implementation Project - V1.1 D...


In [4]:
df.label.value_counts()

process_plan           30
project_charter_doc    30
project_plan           30
solution_design_doc    30
technical_spec_doc     30
training_material      30
Name: label, dtype: int64

In [5]:
df.dropna(inplace=True)
df.shape

(180, 2)

In [6]:
df.label.replace("technical specification","tech_spec_document",inplace=True)
df.label.replace("user manuals","user_manual",inplace=True)
df.label.unique()

array(['process_plan', 'project_charter_doc', 'project_plan',
       'solution_design_doc', 'technical_spec_doc', 'training_material'],
      dtype=object)

In [7]:
df["label"] = "__label__" + df['label'].astype(str)
df.head(3)

Unnamed: 0,label,filename
0,__label__process_plan,G36 & 5.56 S4Hana STE Impl Delivery Costing Sh...
1,__label__process_plan,G36 & 5.56 S4Hana STE Impl Delivery Costing Sh...
2,__label__process_plan,G36 & 5.56 STE Implementation Project - V1.1 D...


In [8]:
df['label_description'] = df['label'] + ' ' + df['filename']
df.head(3)


Unnamed: 0,label,filename,label_description
0,__label__process_plan,G36 & 5.56 S4Hana STE Impl Delivery Costing Sh...,__label__process_plan G36 & 5.56 S4Hana STE Im...
1,__label__process_plan,G36 & 5.56 S4Hana STE Impl Delivery Costing Sh...,__label__process_plan G36 & 5.56 S4Hana STE Im...
2,__label__process_plan,G36 & 5.56 STE Implementation Project - V1.1 D...,__label__process_plan G36 & 5.56 STE Implement...


In [9]:
import re
def preprocess(text):
    text = re.sub(r'[^\w\s\']',' ', text)
    text = re.sub(' +', ' ', text)
    return text.strip().lower()

In [10]:
df['label_description'] = df['label_description'].map(preprocess)
df.head()

Unnamed: 0,label,filename,label_description
0,__label__process_plan,G36 & 5.56 S4Hana STE Impl Delivery Costing Sh...,__label__process_plan g36 5 56 s4hana ste impl...
1,__label__process_plan,G36 & 5.56 S4Hana STE Impl Delivery Costing Sh...,__label__process_plan g36 5 56 s4hana ste impl...
2,__label__process_plan,G36 & 5.56 STE Implementation Project - V1.1 D...,__label__process_plan g36 5 56 ste implementat...
3,__label__process_plan,G36 & 5.56 STE Implementation Project - V1.2 D...,__label__process_plan g36 5 56 ste implementat...
4,__label__process_plan,G36 & 5.56 STE Implementation Project -Risk Re...,__label__process_plan g36 5 56 ste implementat...


In [11]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.2)

In [12]:
train.shape, test.shape

((144, 3), (36, 3))

In [13]:
train.to_csv("doctype.train", columns=["label_description"], index=False, header=False)
test.to_csv("doctype.test", columns=["label_description"], index=False, header=False)

In [14]:
import fasttext

model = fasttext.train_supervised(input="doctype.train")
model.test("doctype.test")

Read 0M words
Number of words:  381
Number of labels: 6
Progress: 100.0% words/sec/thread: 2986274 lr:  0.000000 avg.loss:  1.795664 ETA:   0h 0m 0s


(36, 0.4722222222222222, 0.4722222222222222)

In [22]:
model.predict(preprocess("AVON_SDD_BD9-01-Sales for authorized customers (Credit Sales)_V1.0_20200811"))

(('__label__solution_design_doc',), array([0.16671246]))

In [23]:
# Save the model to a binary file
model.save_model("fasttext_model.bin")

In [36]:
loaded_model = fasttext.load_model("../models/fasttext_model.bin")

# Predict with the loaded model
prediction = loaded_model.predict(preprocess("MIC_G36&5.56_SDD_FICO_Accounts Payable_V 1.0_06062021.pdf"))



In [37]:
prediction

(('__label__project_charter_doc',), array([0.16674384]))