In [2]:
import pandas as pd
df = pd.read_csv("labels_and_filenames.csv",names=["label","filename"],header=1)
print(df.shape)
df.head(3)

(179, 2)


Unnamed: 0,label,filename
0,discover,G36 & 5.56 Project Kick-Off_Internal - V1.0 20...
1,discover,G36 & 5.56 S4Hana STE Impl & Roll Out - LE Pro...
2,discover,G36 & 5.56 S4Hana STE Impl Delivery Costing Sh...


In [3]:
df.label.value_counts()

frs                        20
fts                        20
mom                        20
Prepare                    20
realize                    20
sdd                        20
technical specification    20
user manuals               20
discover                   19
Name: label, dtype: int64

In [4]:
df.dropna(inplace=True)
df.shape

(179, 2)

In [5]:
df.label.replace("technical specification","tech_spec_document",inplace=True)
df.label.replace("user manuals","user_manual",inplace=True)
df.label.unique()

array(['discover', 'frs', 'fts', 'mom', 'Prepare', 'realize', 'sdd',
       'tech_spec_document', 'user_manual'], dtype=object)

In [6]:
df["label"] = "__label__" + df['label'].astype(str)
df.head(3)

Unnamed: 0,label,filename
0,__label__discover,G36 & 5.56 Project Kick-Off_Internal - V1.0 20...
1,__label__discover,G36 & 5.56 S4Hana STE Impl & Roll Out - LE Pro...
2,__label__discover,G36 & 5.56 S4Hana STE Impl Delivery Costing Sh...


In [7]:
df['label_description'] = df['label'] + ' ' + df['filename']
df.head(3)


Unnamed: 0,label,filename,label_description
0,__label__discover,G36 & 5.56 Project Kick-Off_Internal - V1.0 20...,__label__discover G36 & 5.56 Project Kick-Off_...
1,__label__discover,G36 & 5.56 S4Hana STE Impl & Roll Out - LE Pro...,__label__discover G36 & 5.56 S4Hana STE Impl &...
2,__label__discover,G36 & 5.56 S4Hana STE Impl Delivery Costing Sh...,__label__discover G36 & 5.56 S4Hana STE Impl D...


In [8]:
import re
def preprocess(text):
    text = re.sub(r'[^\w\s\']',' ', text)
    text = re.sub(' +', ' ', text)
    return text.strip().lower()

In [9]:
df['label_description'] = df['label_description'].map(preprocess)
df.head()

Unnamed: 0,label,filename,label_description
0,__label__discover,G36 & 5.56 Project Kick-Off_Internal - V1.0 20...,__label__discover g36 5 56 project kick off_in...
1,__label__discover,G36 & 5.56 S4Hana STE Impl & Roll Out - LE Pro...,__label__discover g36 5 56 s4hana ste impl rol...
2,__label__discover,G36 & 5.56 S4Hana STE Impl Delivery Costing Sh...,__label__discover g36 5 56 s4hana ste impl del...
3,__label__discover,G36 & 5.56 S4Hana STE Impl Delivery Costing Sh...,__label__discover g36 5 56 s4hana ste impl del...
4,__label__discover,G36 & 5.56 S4Hana STE Impl Delivery Costing Sh...,__label__discover g36 5 56 s4hana ste impl del...


In [10]:
from sklearn.model_selection import train_test_split

train, test = train_test_split(df, test_size=0.2)

In [11]:
train.shape, test.shape

((143, 3), (36, 3))

In [12]:
train.to_csv("doctype.train", columns=["label_description"], index=False, header=False)
test.to_csv("doctype.test", columns=["label_description"], index=False, header=False)

In [13]:
import fasttext

model = fasttext.train_supervised(input="doctype.train")
model.test("doctype.test")

Read 0M words
Number of words:  378
Number of labels: 9
Progress: 100.0% words/sec/thread: 2115148 lr:  0.000000 avg.loss:  2.212981 ETA:   0h 0m 0s


(36, 0.5277777777777778, 0.5277777777777778)

In [15]:
model.predict(preprocess("MIC_G36_FRS_BL-QM-002-Customized scrap report print.pdf"))

(('__label__user_manual',), array([0.11115469]))

In [17]:
# Save the model to a binary file
model.save_model("fasttext_model.bin")

In [18]:
loaded_model = fasttext.load_model("fasttext_model.bin")

# Predict with the loaded model
prediction = loaded_model.predict(preprocess("MIC_G36_FRS_BL-QM-002-Customized scrap report print.pdf"))



In [19]:
prediction

(('__label__user_manual',), array([0.11115469]))