In [233]:
# Importing libraries

import pandas as pd
import numpy as np
from collections import OrderedDict
import string
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics

In [182]:
# Load the source table from a path relative to the notebook's working directory.
# Later cells read its `input` column and treat every column except
# `input`, `company` and `plant` as an attribute column.
data=pd.read_csv('./data/AI_DATA.csv')

In [183]:
# Function for computing the Levenshtein metric.
# The metric value is divided by the average length of the token and the attribute.

def levenshtein(seq1, seq2):
    """Return the Levenshtein distance of two sequences, normalised by their mean length.

    The edit distance (insertion, deletion and substitution each cost 1) is
    divided by ``(len(seq1) + len(seq2)) / 2``. Identical sequences give 0.0;
    the largest possible value is 2.0 (one sequence empty, the other not).

    Parameters
    ----------
    seq1, seq2 : sequence
        Anything supporting ``len`` and integer indexing (strings in this
        notebook).

    Returns
    -------
    numpy.float64
        The normalised distance. Two empty sequences are identical, so 0.0 is
        returned for them instead of dividing by a zero mean length.
    """
    len_1, len_2 = len(seq1), len(seq2)
    if len_1 + len_2 == 0:
        # Both empty: avoid 0 / 0, which would yield nan.
        return np.float64(0.0)

    # Only the previous row of the dynamic-programming table is ever read,
    # so two plain lists replace the full (len_1 + 1) x (len_2 + 1) matrix.
    previous = list(range(len_2 + 1))
    for x in range(1, len_1 + 1):
        current = [x] + [0] * len_2
        item = seq1[x - 1]
        for y in range(1, len_2 + 1):
            # A substitution is free when the two items already match.
            substitution = previous[y - 1] + (item != seq2[y - 1])
            current[y] = min(previous[y] + 1, current[y - 1] + 1, substitution)
        previous = current

    # Keep the numpy scalar type so that downstream rounding is unchanged.
    return np.float64(previous[len_2]) / ((len_1 + len_2) / 2)

In [184]:
# The function calculates the match table for each token with each attribute using the Levenshtein metric. 
# Calculate the table for each product.

def lev_prepr(data,columns,index):
    """Build the attribute-vs-token distance table for one description.

    Parameters
    ----------
    data : pandas.DataFrame
        Table with an ``input`` column (the raw description string) and the
        attribute columns named in ``columns``.
    columns : list-like of column labels
        Attribute columns to compare the tokens against.
    index : int
        Position of the row to process.

    Returns
    -------
    pandas.DataFrame
        One row per non-missing attribute value of the row and one column per
        token of the description (commas are treated as separators). Cells
        hold the normalised Levenshtein distance rounded to two decimals.
        A final ``attrib`` column holds the attribute values, in the order
        of ``columns`` with missing values skipped.
    """
    # Read the description by position, like the attribute row below, so both
    # refer to the same record whatever the frame's index labels are.
    tokens = data['input'].iloc[index].replace(',', ' ').split()
    # Honour the `columns` argument; the previous version silently read the
    # notebook-level `col` variable instead.
    attributes = [value for value in data[columns].iloc[index] if str(value) != 'nan']
    distances = [
        [round(levenshtein(str(attribute), str(tok)), 2) for tok in tokens]
        for attribute in attributes
    ]
    lev_frame = pd.DataFrame(distances, columns=tokens)
    lev_frame['attrib'] = attributes
    return lev_frame

In [185]:
# Attribute names: every column except the raw description and the two
# columns that are not matched against tokens.
non_attribute_columns = ['input', 'company', 'plant']
col = data.columns.drop(non_attribute_columns)

In [186]:
# Mark tokens of all descriptions. 
# If the token has a minimum value of the Lowenstein metric below the threshold, we mark this token as an attribute.

%%time
token=[]
label_tok=[]
tresh_hold_att=0.7
for i in range(len(data)):
    a=lev_prepr(data,col,i)
    for j in range(len(a.columns)-1):
        index=list(a.iloc[:,j]).index(min(list(a.iloc[:,j])))
        if min(list(a.iloc[:,j]))<tresh_hold_att and min(a.iloc[index,:-1].values.tolist())==min(list(a.iloc[:,j])):
            token.append(a.columns[j])
            index_atr=data.iloc[i].values.tolist().index(a.attrib[index])
            label_tok.append(data.columns[index_atr])

CPU times: user 1min 21s, sys: 0 ns, total: 1min 21s
Wall time: 1min 21s


In [187]:
# Marked tokens that do not belong to the attribute. 
# If the token has a minimum value of the Levenshtein metric above the threshold,
# then this token will be marked as nothing

%%time
rubish = []
tresh_hold_rubish = 0.95
for i in range(len(data)):
    a = lev_prepr(data,col,i)
    for j in range(len(a.columns)-1):
        if min(list(a.iloc[:,j]))>=tresh_hold_rubish:
            rubish.append(a.columns[j])

CPU times: user 51.3 s, sys: 16 ms, total: 51.3 s
Wall time: 51.7 s


In [188]:
# `rubish_uniq` and `nothing` were used here without being defined in the
# notebook, so a fresh kernel raised NameError. They are rebuilt from `rubish`.
# NOTE(review): order-preserving de-duplication and the constant 'nothing'
# label are reconstructed from the variable names and the class list in the
# report below - confirm against the original intent.
rubish_uniq=list(OrderedDict.fromkeys(rubish))  # unique tokens, first-seen order
nothing=['nothing']*len(rubish_uniq)            # one label per unique rubbish token

attr=np.concatenate((token,rubish_uniq))  # made one list of all flagged tokens
label=np.concatenate((label_tok,nothing)) # made one list of all label

# Make Features for tokens

In [189]:
# Alphabet of counted characters: uppercase letters, then digits, then symbols.

symb=['/','.',' ',',',':','!','#','?','[',']','{','}','~','-','_','|','"']
dict_for_word=list(string.ascii_uppercase)+list(string.digits)+symb

In [190]:

def calculate_vowel(token):
    """Return how many items of ``token`` are uppercase vowels (A, E, I, O, U)."""
    uppercase_vowels=('A','E','I','O','U')
    return sum(1 for symbol in token if symbol in uppercase_vowels)
    
def dict_feat(word):
    """Count, for each entry of ``dict_for_word``, its occurrences in ``word``.

    Returns a list as long as ``dict_for_word``; characters of ``word`` that
    are not in ``dict_for_word`` are ignored.
    """
    # Position of the first occurrence of every known symbol.
    position={}
    for idx,symbol in enumerate(dict_for_word):
        position.setdefault(symbol,idx)

    word_feat=[0]*len(dict_for_word)
    for symbol in word:
        idx=position.get(symbol)
        if idx is not None:
            word_feat[idx]+=1
    return word_feat

def calculate_all_features(token):
    """Return the feature vector of one token.

    The vector starts with five counts - length, digits, letters, uppercase
    vowels and letters that are not uppercase vowels - followed by the
    per-symbol counts produced by ``dict_feat``.
    """
    digits=len([c for c in token if c.isdigit()])
    letters=len([c for c in token if c.isalpha()])
    vowels=calculate_vowel(token)
    features=[len(token),digits,letters,vowels,letters-vowels]
    features.extend(dict_feat(token))
    return features

In [191]:
# Feature names, in the same order as the vector built by calculate_all_features
basic_feature_names=[
    'Len_of_token',
    'amount_of_digit',
    'amount_of_letter',
    'amount_of_vowel',
    'amount_of_const',
]
features_name=basic_feature_names+dict_for_word

In [192]:
# Feature vector for every labelled token
feat_attr=[calculate_all_features(tok) for tok in attr]

In [214]:
# Stratified 75 % / 25 % split of the labelled tokens into training and test samples
x_train, x_test, y_train, y_test = train_test_split(
    feat_attr,
    label,
    test_size=0.25,
    stratify=label,
    random_state=0,
)

In [215]:
# Train the random forest and predict the test sample.
# The seed is fixed (same value as the train/test split) so that the scores
# reported below are reproducible from a fresh kernel.
rand_for=RandomForestClassifier(random_state=0)
rand_for.fit(x_train,y_train)
predict_rand_forest=rand_for.predict(x_test)

In [216]:
# Per-class precision / recall / F1 on the test sample
test_report=metrics.classification_report(y_test,predict_rand_forest)
print(test_report)

             precision    recall  f1-score   support

   amperage       0.90      0.91      0.90       129
    current       0.99      1.00      0.99       427
  enclosure       0.98      0.96      0.97       321
      frame       0.93      0.96      0.94       806
      hertz       0.85      0.96      0.90       221
manufacture       0.93      0.96      0.95      1031
   modifier       1.00      0.99      1.00       968
    nothing       0.79      0.64      0.70       976
       noun       1.00      1.00      1.00      1878
 partnumber       0.87      0.94      0.90      1399
      phase       0.80      0.90      0.85       268
      power       0.93      0.89      0.91       867
        rpm       0.96      0.96      0.96      1366
    voltage       0.96      0.96      0.96       735

avg / total       0.93      0.93      0.93     11392



# Marking tokens manually for ten random inputs

In [217]:
# Ten row labels of `data`, hard-coded so that the manually written labels
# below stay aligned with the same descriptions on every run.
# NOTE(review): described as randomly chosen; the sampling code is not in the notebook.
index=[8503, 7516, 594, 6194, 2042, 3944, 5950, 2017, 4336, 1364]

In [218]:
# Alias for the loaded table (same object, not a copy).
df=data

In [219]:
# Containers for the hand-labelled sample: token lists and label lists, one per description
tok_for_test, label_for_test = [], []

In [220]:
# Tokenise the selected descriptions (commas are treated as separators).
# Built in one expression from `index` instead of ten copy-pasted appends, and
# rebound rather than appended to, so re-running the cell does not add the
# same token lists a second time.

tok_for_test=[df.input[row].replace(',',' ').split() for row in index]

In [221]:
# Hand-written labels: one inner list per selected description, one label per token

label_for_test.extend([
    ['power','rpm','voltage','nothing','nothing','nothing','nothing','nothing','nothing','nothing'],
    ['manufacture','partnumber','noun'],
    ['manufacture','partnumber','noun','power','nothing','rpm','frame','nothing','nothing','nothing','nothing'],
    ['partnumber','nothing'],
    ['manufacture','partnumber','noun','modifier','power','nothing','rpm','nothing','current','nothing','hertz','nothing','frame','nothing','enclosure','nothing'],
    ['partnumber','nothing','voltage','hertz','power'],
    ['noun','power','rpm','voltage'],
    ['manufacture','noun','modifier','power','nothing','rpm','nothing','current','voltage','phase','hertz','amperage','nothing','nothing','nothing','nothing','nothing','nothing','nothing','nothing'],
    ['manufacture','nothing','noun','nothing','nothing','nothing','nothing','partnumber','nothing','nothing'],
    ['manufacture','partnumber','noun','modifier','power','nothing','voltage','phase','frame','nothing'],
])

In [222]:
# Flatten the per-description token lists into one list
tok_for_test_onelist=[tok for description in tok_for_test for tok in description]

In [223]:
# Flatten the per-description label lists into one list
lab_for_test_onelist=[lab for description in label_for_test for lab in description]

In [224]:
# Feature vectors for the hand-labelled tokens
test_tok_feat=[calculate_all_features(tok) for tok in tok_for_test_onelist]

In [225]:
# Predict a class for every hand-labelled token with the trained forest
# (plain `predict`, no probability threshold applied here).
pr_test=rand_for.predict(test_tok_feat)

In [226]:
# Per-class precision / recall / F1 against the manual labels
manual_report=metrics.classification_report(lab_for_test_onelist,pr_test)
print(manual_report)

             precision    recall  f1-score   support

   amperage       1.00      1.00      1.00         1
    current       0.67      1.00      0.80         2
  enclosure       0.50      1.00      0.67         1
      frame       0.75      1.00      0.86         3
      hertz       0.75      1.00      0.86         3
manufacture       0.42      0.83      0.56         6
   modifier       1.00      1.00      1.00         3
    nothing       0.74      0.51      0.61        39
       noun       0.88      1.00      0.93         7
 partnumber       0.56      0.71      0.63         7
      phase       0.40      1.00      0.57         2
      power       0.50      0.29      0.36         7
        rpm       0.80      0.80      0.80         5
    voltage       1.00      0.80      0.89         5

avg / total       0.71      0.68      0.67        91



# With threshold

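### If the maximum probability of belonging to a class is not greater than 0.7, the token is assigned to the class 'nothing'. A token above the threshold keeps its class only if no other token of the same description has a higher probability for that class; otherwise it is also assigned to 'nothing'.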

In [227]:
# Thresholded prediction, one description at a time.
# A token keeps its most probable class only when
#   1. that probability is above `proba_threshold`, and
#   2. no other token of the same description has a higher probability for
#      that class (ties keep the class for every tied token).
# Every other token is labelled 'nothing'.
# Loop-local names are used so that the notebook-level `index` list and
# `test_tok_feat` are no longer overwritten by this cell.
proba_threshold=0.7
token_predict=[]
for description_tokens in tok_for_test:
    if not description_tokens:
        # Nothing to classify; an empty sample list is not passed to the model.
        token_predict.append([])
        continue

    description_feat=[calculate_all_features(tok) for tok in description_tokens]
    proba_input=np.asarray(rand_for.predict_proba(description_feat))

    best_class=proba_input.argmax(axis=1)   # most probable class per token
    best_proba=proba_input.max(axis=1)      # ...and its probability
    class_best=proba_input.max(axis=0)      # highest probability per class in this description

    token_predict_array=[]
    for class_idx,proba in zip(best_class,best_proba):
        if proba>proba_threshold and proba==class_best[class_idx]:
            token_predict_array.append(rand_for.classes_[class_idx])
        else:
            token_predict_array.append('nothing')
    token_predict.append(token_predict_array)

In [228]:
# Flatten the per-description predictions into one list
token_predict_one_list=[pred for description in token_predict for pred in description]

In [229]:
# Per-class precision / recall / F1 of the thresholded predictions against the manual labels
threshold_report=metrics.classification_report(lab_for_test_onelist,token_predict_one_list)
print(threshold_report)

             precision    recall  f1-score   support

   amperage       1.00      1.00      1.00         1
    current       1.00      1.00      1.00         2
  enclosure       0.50      1.00      0.67         1
      frame       0.75      1.00      0.86         3
      hertz       1.00      1.00      1.00         3
manufacture       0.83      0.83      0.83         6
   modifier       1.00      1.00      1.00         3
    nothing       0.79      0.79      0.79        39
       noun       1.00      1.00      1.00         7
 partnumber       0.71      0.71      0.71         7
      phase       0.50      1.00      0.67         2
      power       0.50      0.29      0.36         7
        rpm       0.80      0.80      0.80         5
    voltage       1.00      0.80      0.89         5

avg / total       0.80      0.80      0.80        91



### Building a table that contains each token, its true class and its predicted class

In [231]:
# One row per hand-labelled token: manual label, thresholded prediction, token text.
# `columns=` pins the column order explicitly.
data91=pd.DataFrame(
    {
        'true_class': lab_for_test_onelist,
        'predicted_class': token_predict_one_list,
        'tokens': tok_for_test_onelist,
    },
    columns=['true_class','predicted_class','tokens'],
)

In [232]:
# Show the whole table: the row/column display limits are lifted only inside
# this `with` block and restored afterwards.
with pd.option_context('display.max_rows', None, 'display.max_columns', None):
    display(data91)

Unnamed: 0,true_class,predicted_class,tokens
0,power,power,50HP
1,rpm,rpm,1800RPM
2,voltage,voltage,230/460
3,nothing,nothing,COOLING
4,nothing,nothing,TOWER
5,nothing,nothing,DUTY
6,nothing,nothing,BASED
7,nothing,nothing,OFF
8,nothing,nothing,OF
9,nothing,nothing,E10-01003999-GT-04
