In [37]:
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import numpy as np
import warnings
import os
from skimage import io
from sklearn import svm
import sklearn
# Alternative warning policy kept for reference (currently disabled).
#warnings.filterwarnings(action='once')
# Silence every warning issued through the `warnings` module from here on.
warnings.filterwarnings('ignore')

# NOTE(review): a width of 0 presumably makes pandas fall back to the detected console width - confirm.
pd.options.display.width = 0
from sklearn import svm
    
def train_classifier(clf, data_dir=None, random_state=None):
    """Fit ``clf`` on text-box features computed from an annotated figure folder.

    The folder is scanned file by file. Every file that is neither a ``.png``
    nor ends with ``pred1-texts.csv`` is read as an annotation table with the
    columns ``x``, ``y``, ``width``, ``height`` and ``type``. Every ``.png`` is
    combined with the most recently read annotation table: the image supplies
    the figure size, the table supplies the boxes and their labels.

    Parameters
    ----------
    clf : estimator
        Object exposing ``fit(X, y)``; it is fitted in place.
    data_dir : str, optional
        Folder containing the annotation tables and images. Defaults to
        ``os.path.join("your_path", "academic")``.
    random_state : int, optional
        Seed for the row shuffle that precedes the split. When ``None`` the
        global NumPy random state is used, exactly like a bare
        ``np.random.shuffle`` call.

    Returns
    -------
    tuple
        ``(clf, x_train, x_test, y_train, y_test)`` where the first 80 % of
        the shuffled rows form the training set and the rest the test set.
        Labels are integer codes: the index of each ``type`` value in the
        sorted list of distinct values.

    Raises
    ------
    ValueError
        If no annotated text box could be collected from ``data_dir``.
    """
    if data_dir is None:
        # this is the path to the data from https://drive.google.com/drive/folders/1Bg9hyxlt2szXj6CBWIIt3yInIjKEqPFx
        data_dir = os.path.join("your_path", "academic")

    feature_blocks = []
    text_labels = []
    annotations = None

    # os.listdir returns names in arbitrary order; sorting makes the
    # "latest table belongs to the next image" pairing deterministic.
    # NOTE(review): this assumes each annotation CSV sorts before its image
    # (e.g. "<name>-texts.csv" < "<name>.png") - confirm against the dataset.
    for file_name in sorted(os.listdir(data_dir)):
        file_path = os.path.join(data_dir, file_name)

        if not file_name.endswith(".png"):
            if not file_name.endswith("pred1-texts.csv"):
                annotations = pd.read_csv(file_path)
            continue

        # An image without a (non-empty) annotation table contributes no samples.
        if annotations is None or annotations.empty:
            continue

        img = io.imread(file_path)
        left = annotations["x"].to_numpy()
        top = annotations["y"].to_numpy()
        box_w = annotations["width"].to_numpy()
        box_h = annotations["height"].to_numpy()

        #calculate the features we need with our own function
        feature_blocks.append(
            calculate_text_class_features(
                img.shape[1], img.shape[0],
                left, left + box_w,
                top, top + box_h,
            )
        )
        text_labels.extend(annotations["type"].to_numpy())

    if not feature_blocks:
        raise ValueError(f"no annotated text boxes found in {data_dir!r}")

    features = np.vstack(feature_blocks)

    # Encode each label as its position among the sorted distinct labels.
    _, label_ids = np.unique(np.array(text_labels), return_inverse=True)

    #create the final table (features + label in the last column) and split the data
    data = np.column_stack([features, label_ids]).astype(float)

    rng = np.random if random_state is None else np.random.RandomState(random_state)
    rng.shuffle(data)

    n_train = int(len(data) * 0.8)
    x_train = data[:n_train, :-1]
    x_test = data[n_train:, :-1]

    y_train = data[:n_train, -1].astype("int")
    y_test = data[n_train:, -1].astype("int")

    #train the classifier
    clf.fit(x_train, y_train)
    return clf, x_train, x_test, y_train, y_test

In [38]:
def calculate_text_class_features(fig_width, fig_height, x_mins, x_maxs, y_mins, y_maxs):
    """Compute the 10 text-classification features for every box of one figure.

    Parameters
    ----------
    fig_width, fig_height : number or one-element sequence
        Size of the figure the boxes belong to.
    x_mins, x_maxs, y_mins, y_maxs : array-like of equal length
        Box edges, one entry per box.

    Returns
    -------
    numpy.ndarray of shape ``(n_boxes, 10)`` with the columns

    0. box centre x divided by ``fig_width``
    1. box centre y divided by ``fig_height``
    2. aspect ratio ``width / height``
    3. quadrant of the normalised centre: 0 (x < 0.5, y < 0.5),
       1 (x >= 0.5, y < 0.5), 2 (x >= 0.5, y >= 0.5), 3 (x < 0.5, y >= 0.5);
       NaN when the centre itself is NaN
    4. height of the span covered by all boxes divided by ``fig_height``
    5. width of the span covered by all boxes divided by ``fig_width``
    6. centre x relative to that span, in units of the span width
    7. centre y relative to that span, in units of the span height
    8. ``(h + 1) / n_boxes`` where ``h`` counts the boxes (the box itself
       included) whose x-extent strictly contains this box's centre x and
       that have the same orientation
    9. ``(v + 1) / n_boxes``, the same count using y-extents and centre y

    Two boxes share an orientation when both satisfy ``width >= height`` or
    both satisfy ``width <= height``; a pair of square boxes satisfies both
    tests and is therefore counted twice.

    An empty input yields an empty ``(0, 10)`` array. Zero-height boxes or a
    zero-sized span still produce inf/NaN entries through the divisions.
    """
    x_mins = np.asarray(x_mins, dtype=float)
    x_maxs = np.asarray(x_maxs, dtype=float)
    y_mins = np.asarray(y_mins, dtype=float)
    y_maxs = np.asarray(y_maxs, dtype=float)
    fig_width = np.asarray(fig_width, dtype=float)
    fig_height = np.asarray(fig_height, dtype=float)

    n_boxes = len(x_mins)
    features = np.zeros([n_boxes, 10])
    if n_boxes == 0:
        # Nothing to describe; the min/max reductions below need >= 1 box.
        return features

    widths = x_maxs - x_mins
    heights = y_maxs - y_mins
    x_centers = widths / 2 + x_mins
    y_centers = heights / 2 + y_mins

    #norm_x_coordinate / norm_y_coordinate
    features[:, 0] = x_centers / fig_width
    features[:, 1] = y_centers / fig_height

    #aspect_ratio
    features[:, 2] = widths / heights

    #quadrant
    norm_x = features[:, 0]
    norm_y = features[:, 1]
    low_x, high_x = norm_x < 0.5, norm_x >= 0.5
    low_y, high_y = norm_y < 0.5, norm_y >= 0.5
    features[:, 3] = np.select(
        [low_x & low_y, high_x & low_y, high_x & high_y, low_x & high_y],
        [0, 1, 2, 3],
        default=np.nan,  # only reached when a coordinate is NaN
    )

    #norm_cont_h / norm_cont_w
    cont_top = y_mins.min()
    cont_left = x_mins.min()
    cont_h = y_maxs.max() - cont_top
    cont_w = x_maxs.max() - cont_left
    features[:, 4] = cont_h / fig_height
    features[:, 5] = cont_w / fig_width

    #normalized center coordinates relative to the container
    features[:, 6] = (x_centers - cont_left) / cont_w
    features[:, 7] = (y_centers - cont_top) / cont_h

    #vertical_score and horizontal_score
    # Pairwise matrices: row i is the box being scored, column j the other box.
    # Memory is O(n_boxes ** 2), which is fine for the boxes of a single figure.
    wide = widths >= heights
    tall = widths <= heights
    same_orientation = (
        (wide[:, None] & wide[None, :]).astype(int)
        + (tall[:, None] & tall[None, :]).astype(int)
    )
    covers_center_x = (x_mins[None, :] < x_centers[:, None]) & (x_centers[:, None] < x_maxs[None, :])
    covers_center_y = (y_mins[None, :] < y_centers[:, None]) & (y_centers[:, None] < y_maxs[None, :])

    h_score = (covers_center_x * same_orientation).sum(axis=1)
    v_score = (covers_center_y * same_orientation).sum(axis=1)
    features[:, 8] = (h_score + 1) / n_boxes
    features[:, 9] = (v_score + 1) / n_boxes

    return features

In [41]:
# Train an SVM (C=100, gamma=0.1) on a fresh shuffled split and score it on the held-out rows.
clf, x_train, x_test, y_train, y_test = train_classifier(svm.SVC(C=100, gamma=0.1))
pred = clf.predict(x_test)

# Accuracy: share of test rows whose predicted class equals the true class.
svm_hits = pred == y_test
print(f"svm accuracy on testset: {svm_hits.sum() / len(y_test)}")

# F1 averaged over classes, weighted by class support.
svm_f1 = sklearn.metrics.f1_score(y_test, pred, average='weighted')
print(f"svm f1-score on testset: {svm_f1}")

svm accuracy on testset: 0.9744744744744744
svm f1-score on testset: 0.9712534541706195


In [42]:
# Same evaluation with a random forest using scikit-learn's default hyper-parameters.
# train_classifier reshuffles the data on every call, so this train/test split
# differs from the one used in the SVM cell unless the NumPy seed is reset.
clf = RandomForestClassifier()
clf, x_train, x_test, y_train, y_test = train_classifier(clf)
pred = clf.predict(x_test)
print(f"RF accuracy on testset: {np.sum(pred==y_test)/len(y_test)}")
print(f"RF f1-score on testset: {sklearn.metrics.f1_score(y_test,pred, average='weighted')}")

RF accuracy on testset: 0.9737237237237237
RF f1-score on testset: 0.9726712734076394
