In [2]:
import cv2
import numpy as np
import pandas as pd
import pytesseract # need to install tesseract and set system PATH first
from pytesseract import Output
from PIL import Image

INF = float('inf')

# cv2.imread does not raise when the file is missing or unreadable; it returns
# None. Fail here with a clear message instead of later inside cv2.cvtColor.
IMAGE_PATH = "example.png"
src = cv2.imread(IMAGE_PATH)
if src is None:
    raise FileNotFoundError(f"Could not read image: {IMAGE_PATH!r}")

In [2]:
## Otsu binarization followed by connected-component labelling
gray = cv2.cvtColor(src, cv2.COLOR_BGR2GRAY)
# THRESH_BINARY_INV produces the inverted mask directly: pixels at or below the
# Otsu threshold become 255 (identical to 255 - the THRESH_BINARY result).
ret, binary_ = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV | cv2.THRESH_OTSU)
# stats holds one row per label; centers is recomputed from stats further down.
num_labels, labels, stats, centers = cv2.connectedComponentsWithStats(
    binary_,
    connectivity=4,
    ltype=cv2.CV_32S,
)

In [3]:
## Filtering using aspect ratio and area
# stats columns are (left, top, width, height, area).
centers_x = stats[:, 0] + stats[:, 2] / 2
centers_y = stats[:, 1] + stats[:, 3] / 2
centers = np.array(list(zip(centers_x, centers_y)))

# Build ONE boolean mask over the unfiltered rows and apply it to both arrays.
# The previous version re-used masks computed on arrays of a different length
# (e.g. a mask from the already-filtered stats applied to the centroids), which
# raises IndexError or mis-selects rows as soon as a filter removes anything.
areas = stats[:, 4]
aspect_ratio = stats[:, 2] / stats[:, 3]
keep = (areas > 4) & (areas < 1000) & (aspect_ratio > (1 / 15)) & (aspect_ratio < 15)
filtered_stats = stats[keep]
filtered_centroids = centers[keep]
print("BEFORE filtering: " + str(stats.shape), "AFTER filtering: " + str(filtered_stats.shape))

# Most frequent box side length (width or height); used later as the distance scale.
widths = filtered_stats[:, 2]
heights = filtered_stats[:, 3]
length_set = np.concatenate([widths, heights])
mode = np.argmax(np.bincount(length_set))
print("MODE: " + str(mode))

BEFORE filtering: (100, 5) AFTER filtering: (98, 5)
MODE: 16


In [4]:
## Visualize the filtered components, one randomly coloured rectangle per box
colors = [
    tuple(np.random.randint(0, 256) for _ in range(3))  # (b, g, r)
    for _ in range(filtered_stats.shape[0])
]

image = np.copy(src)
# Start at 0: after filtering, row 0 is an ordinary component (it takes part in
# the edge list built later), so skipping it would leave one box undrawn.
for t in range(filtered_stats.shape[0]):
    # Plain Python ints keep the OpenCV drawing calls independent of the numpy dtype.
    x, y, w, h, area = (int(v) for v in filtered_stats[t])
    cv2.rectangle(image, (x, y), (x + w, y + h), colors[t], 1, 8, 0)
    cv2.putText(image, str(t), (x, y), cv2.FONT_HERSHEY_SIMPLEX, .35, (0, 0, 255), 1)

cv2.imshow("colored labels", image)
cv2.waitKey(0)
cv2.destroyAllWindows()

In [5]:
def calculate_distance(center1, center2):
    """Return the Euclidean distance between two (x, y) points."""
    dx = center1[0] - center2[0]
    dy = center1[1] - center2[1]
    return np.sqrt(dx ** 2 + dy ** 2)

def calculate_closest_edge_distance(stats, id1, id2):
    """
    Gap between the facing edges of two bounding boxes.

    INPUTS:
        stats: rows of (left, top, width, height, area)
        id1, id2: row indices of the two boxes
    OUTPUTS:
        For 'vertical' alignment the gap is measured along x, for 'horizontal'
        alignment along y. If the boxes overlap along that axis the result is
        the smaller (negative) of the two edge differences. INF is returned
        when check_alignment reports 'neither'.
    """
    left1, left2 = stats[id1, 0], stats[id2, 0]
    top1, top2 = stats[id1, 1], stats[id2, 1]
    right1, right2 = left1 + stats[id1, 2], left2 + stats[id2, 2]
    bottom1, bottom2 = top1 + stats[id1, 3], top2 + stats[id2, 3]

    orientation = check_alignment(stats, id1, id2)
    if orientation == 'vertical':
        start1, end1, start2, end2 = left1, right1, left2, right2
    elif orientation == 'horizontal':
        start1, end1, start2, end2 = top1, bottom1, top2, bottom2
    else:
        return INF

    # Box 1 lies entirely after box 2 along the measured axis.
    if start1 >= end2:
        return start1 - end2
    # Box 2 lies entirely after box 1.
    if start2 >= end1:
        return start2 - end1
    # The boxes overlap along this axis: both differences are negative.
    return min(start1 - end2, start2 - end1)

def check_alignment(stats, id1, id2):
    '''
    Classify how two bounding boxes are aligned with each other.

    INPUTS:
        stats: (left, top, width, height, area);
        id1: id of the first bounding box
        id2: id of the second bounding box
    OUTPUTS:
        mode: 'vertical' / 'horizontal' / 'neither'
        'horizontal' means the x-extents of the two boxes overlap by more than
        half of the narrower box's width (checked first).
        'vertical' means the y-extents overlap by more than half of the shorter
        box's height.
        'neither' is returned otherwise.
    '''
    l1, l2 = stats[id1, 0], stats[id2, 0]
    t1, t2 = stats[id1, 1], stats[id2, 1]
    w1, w2 = stats[id1, 2], stats[id2, 2]
    h1, h2 = stats[id1, 3], stats[id2, 3]
    r1, r2 = l1 + w1, l2 + w2
    b1, b2 = t1 + h1, t2 + h2
    # Length of the intersection of two intervals: min(ends) - max(starts),
    # clamped at 0. The previous max(b1 - t2, b2 - t1) measured the combined
    # span instead, so even a 1px overlap passed the "more than half" test.
    vertical_overlap = max(0, min(b1, b2) - max(t1, t2))
    horizontal_overlap = max(0, min(r1, r2) - max(l1, l2))
    if horizontal_overlap > min(w1, w2) / 2:
        return 'horizontal'
    if vertical_overlap > min(h1, h2) / 2:
        return 'vertical'
    return 'neither'

def graph_initialization(stats, centers, mode):
    """
    Build a dense adjacency matrix (list of lists) over the box centers.

    Entry [i][j] is 0 on the diagonal, the center-to-center distance when that
    distance is below 2 * mode, and INF otherwise. `stats` is accepted but not
    used.
    """
    cutoff = 2 * mode
    node_count = len(centers)
    graph = []
    for src_idx in range(node_count):
        row = []
        for dst_idx in range(node_count):
            if src_idx == dst_idx:
                row.append(0)
                continue
            gap = calculate_distance(centers[src_idx], centers[dst_idx])
            row.append(gap if gap < cutoff else INF)
        graph.append(row)
    return graph

def edges_initialization(stats, centers, mode):
    """
    Build the list of candidate edges between neighbouring bounding boxes.

    INPUTS:
        stats: rows of (left, top, width, height, area)
        centers: one entry per box; only its length is used
        mode: length scale; two boxes are linked when their closest-edge
              distance is below mode / 3
    OUTPUTS:
        list of (i, j, distance) tuples with i < j, ordered by i then j.
    """
    edges = []
    node_count = len(centers)
    threshold = mode / 3
    for i in range(node_count):
        # Visit each unordered pair once. The distance is symmetric in (i, j),
        # so this yields the same list as scanning all ordered pairs and
        # rejecting mirrored duplicates with a linear `in edges` search.
        for j in range(i + 1, node_count):
            distance = calculate_closest_edge_distance(stats, i, j)
            # Pairs that are not aligned get distance INF and fail this test,
            # so no separate check_alignment call is needed.
            if distance < threshold:
                edges.append((i, j, distance))
    return edges

In [6]:
## Sanity check: show two boxes and how check_alignment classifies the pair
for box_id in (9, 19):
    print(filtered_stats[box_id])
check_alignment(filtered_stats, 9, 19)

[419  56  19  22 157]
[401  63  11  11  97]


'vertical'

In [7]:
## Build the candidate edge list between nearby, aligned bounding boxes
edges = edges_initialization(stats=filtered_stats, centers=filtered_centroids, mode=mode)
print(len(edges))
edges

76


[(0, 1, 3),
 (0, 4, 2),
 (1, 3, 1),
 (1, 4, 3),
 (2, 4, 4),
 (5, 8, 3),
 (6, 7, 1),
 (8, 18, 1),
 (9, 12, 1),
 (10, 14, 2),
 (10, 15, 1),
 (11, 15, 0),
 (11, 16, 0),
 (12, 13, 0),
 (13, 14, 1),
 (16, 17, 3),
 (18, 20, 3),
 (20, 27, 2),
 (21, 25, 1),
 (22, 25, 2),
 (22, 26, 2),
 (23, 24, 2),
 (24, 26, 1),
 (27, 33, 1),
 (28, 29, 1),
 (30, 34, 1),
 (31, 32, 2),
 (31, 36, 2),
 (32, 36, 2),
 (32, 37, 2),
 (33, 39, 2),
 (34, 35, 1),
 (35, 36, 2),
 (40, 42, 1),
 (41, 45, 2),
 (42, 43, 2),
 (43, 44, 1),
 (44, 45, 1),
 (45, 46, 1),
 (48, 62, 1),
 (49, 53, 2),
 (50, 54, 2),
 (51, 52, 1),
 (51, 59, 2),
 (52, 59, 1),
 (52, 60, 1),
 (53, 54, 2),
 (54, 55, 1),
 (55, 56, 1),
 (56, 57, 1),
 (57, 58, 0),
 (58, 59, 3),
 (63, 64, 1),
 (65, 66, 0),
 (66, 67, 1),
 (67, 68, 1),
 (68, 69, 1),
 (69, 70, 2),
 (72, 73, 3),
 (73, 74, 2),
 (74, 77, 5),
 (77, 78, 3),
 (78, 79, 3),
 (81, 84, 1),
 (82, 85, 1),
 (83, 86, 1),
 (87, 88, 5),
 (88, 89, 5),
 (89, 90, 3),
 (90, 96, 1),
 (91, 97, 2),
 (92, 93, 3),
 (92, 95

In [8]:
## Kruskal algorithm
class Edge:
    """Weighted edge between the nodes `x` and `y` with weight `length`."""

    def __init__(self, x, y, length):
        self.x, self.y, self.length = x, y, length

class UnionFindSet:
    """
    Disjoint-set (union-find) over the integer node ids start..n (inclusive).

    Uses union by rank and path compression. Call init() before any other
    operation so that every node in start..n becomes its own root.
    """

    def __init__(self, start, n):
        self.start = start  # first node id of the set
        self.n = n          # last node id of the set (inclusive)
        # Both arrays are indexed directly by node id, so they must reach
        # index n. The original size (n - start + 2) is too short whenever
        # start > 1; it is kept as a lower bound so callers with start <= 1
        # get exactly the same array length as before.
        size = max(self.n + 1, self.n - self.start + 2)
        self.pre = [0] * size   # pre[i] is the parent of node i
        self.rank = [0] * size  # rank[i] keeps the trees shallow when uniting

    def init(self):
        """Make every node in start..n a singleton set."""
        for i in range(self.start, self.n + 1):
            self.pre[i] = i
            self.rank[i] = 1

    def find_pre(self, x):
        """Return the root of x's set, compressing the path on the way."""
        root = x
        while self.pre[root] != root:
            root = self.pre[root]
        # Second pass: point every node on the path straight at the root.
        # Done iteratively so the depth is not bounded by the recursion limit.
        while x != root:
            parent = self.pre[x]
            self.pre[x] = root
            x = parent
        return root

    def is_same(self, x, y):
        """Return True when x and y belong to the same set."""
        return self.find_pre(x) == self.find_pre(y)

    def unite(self, x, y):
        """Merge the sets of x and y; return False if they were already merged."""
        x = self.find_pre(x)
        y = self.find_pre(y)
        if x == y:
            return False
        if self.rank[x] > self.rank[y]:
            self.pre[y] = x
        else:
            if self.rank[x] == self.rank[y]:
                self.rank[y] += 1
            self.pre[x] = y
        return True

    def is_one(self):
        """Return True when all nodes start..n share a single root."""
        temp = self.find_pre(self.start)
        for i in range(self.start + 1, self.n + 1):
            if self.find_pre(i) != temp:
                return False
        return True


class Kruskal:
    """
    Kruskal's algorithm over an edge list of (x, y, length) triples.

    Usage: construct, call graphy() to load and sort the edges, run() to pick
    the spanning edges, then read them from `s` (or call print()).
    """

    def __init__(self, n, m, edges):
        self.n = n  # number of input nodes
        self.m = m  # number of input edges to read from `edges`
        # Keep the edge list that was passed in. It used to be ignored, and
        # graphy() silently read a global variable named `edges` instead.
        self.edges = edges
        self.e = []  # all input edges as Edge objects, sorted by length
        self.s = []  # edges selected for the minimum spanning tree / forest
        # Union-find used to join nodes and to test whether everything is connected.
        # NOTE(review): it is initialised for ids 1..n; an id of 0 only works
        # because the parent array is zero-filled, so pre[0] == 0 already.
        self.u = UnionFindSet(1, self.n)

    def graphy(self):
        """Load the first m edges, sort them by length and reset the union-find."""
        # Rebuild the list instead of appending, so calling graphy() twice
        # does not duplicate edges.
        self.e = []
        for i in range(self.m):
            x, y, length = map(int, self.edges[i])
            self.e.append(Edge(x, y, length))
        self.e.sort(key=lambda e: e.length)
        self.u.init()

    def run(self):
        """Select the spanning edges from the sorted edge list."""
        for i in range(self.m):
            candidate = self.e[i]
            if self.u.unite(candidate.x, candidate.y):
                self.s.append(candidate)
            if self.u.is_one():
                # A single connected component means the spanning tree is complete.
                break

    def print(self):
        """Print every selected edge followed by the total weight."""
        print(f'构成最小生成树的边为：')
        edge_sum = 0
        for chosen in self.s:
            print(f'边 <{chosen.x},{chosen.y}> = {chosen.length}')
            edge_sum += chosen.length
        print(f'最小生成树的权值为：{edge_sum}')

In [9]:
## Run Kruskal over the boxes (nodes) and the candidate edges
n = len(filtered_centroids)
m = len(edges)
kruskal = Kruskal(n, m, edges)
kruskal.graphy()
kruskal.run()
kruskal.print()

构成最小生成树的边为：
边 <11,15> = 0
边 <11,16> = 0
边 <12,13> = 0
边 <57,58> = 0
边 <65,66> = 0
边 <1,3> = 1
边 <6,7> = 1
边 <8,18> = 1
边 <9,12> = 1
边 <10,15> = 1
边 <13,14> = 1
边 <21,25> = 1
边 <24,26> = 1
边 <27,33> = 1
边 <28,29> = 1
边 <30,34> = 1
边 <34,35> = 1
边 <40,42> = 1
边 <43,44> = 1
边 <44,45> = 1
边 <45,46> = 1
边 <48,62> = 1
边 <51,52> = 1
边 <52,59> = 1
边 <52,60> = 1
边 <54,55> = 1
边 <55,56> = 1
边 <56,57> = 1
边 <63,64> = 1
边 <66,67> = 1
边 <67,68> = 1
边 <68,69> = 1
边 <81,84> = 1
边 <82,85> = 1
边 <83,86> = 1
边 <90,96> = 1
边 <0,4> = 2
边 <10,14> = 2
边 <20,27> = 2
边 <22,25> = 2
边 <22,26> = 2
边 <23,24> = 2
边 <31,32> = 2
边 <31,36> = 2
边 <32,37> = 2
边 <33,39> = 2
边 <35,36> = 2
边 <41,45> = 2
边 <42,43> = 2
边 <49,53> = 2
边 <50,54> = 2
边 <53,54> = 2
边 <69,70> = 2
边 <73,74> = 2
边 <91,97> = 2
边 <93,94> = 2
边 <0,1> = 3
边 <5,8> = 3
边 <16,17> = 3
边 <18,20> = 3
边 <58,59> = 3
边 <72,73> = 3
边 <77,78> = 3
边 <78,79> = 3
边 <89,90> = 3
边 <92,93> = 3
边 <92,95> = 3
边 <95,97> = 3
边 <96,97> = 3
边 <2,4> = 4
边 <74,77> = 5
边 <87,88

In [10]:
## Combine all connected components
# Pass 1: grow node lists from the selected spanning edges (kruskal.s).
# An edge extends EVERY existing list that already holds one of its endpoints,
# so a single component can temporarily be spread over several overlapping lists.
trees = []
for edge in kruskal.s:
    fail = 0  # number of existing lists that contain neither endpoint
    id1 = edge.x
    id2 = edge.y
    for i in range(len(trees)):
        if(id1 in trees[i]):
            trees[i].append(id2)
        elif(id2 in trees[i]):
            trees[i].append(id1)
        else:
            fail += 1
    # No list knows either endpoint: start a new one.
    if(fail == len(trees)):
        trees.append([id1, id2])

# Pass 2: merge lists that share at least one node.
b = len(trees)
for i in range(b):
    for j in range(b):
        x = list(set(trees[i]+trees[j]))  # de-duplicated union of both lists
        y = len(trees[j])+len(trees[i])   # combined length if they were disjoint
        # The break at i == j means only earlier lists (j < i) are compared.
        # NOTE(review): trees[i] / trees[j] are lists, so the `== 0` tests are
        # never true; merged-away lists are marked with [0], not 0.
        if i == j or trees[i] == 0 or trees[j] == 0:
            break
        elif len(x) < y:
            # A shared node exists: keep the union in slot i and leave the
            # sentinel [0] in slot j.
            trees[i] = x
            trees[j] = [0]
# Drop the sentinel entries; each remaining list is one group of boxes.
word_trees = [i for i in trees if i != [0]]
word_trees

[[9, 10, 11, 12, 13, 14, 15, 16, 17],
 [65, 66, 67, 68, 69, 70],
 [6, 7],
 [21, 22, 23, 24, 25, 26],
 [33, 5, 39, 8, 18, 20, 27],
 [28, 29],
 [40, 41, 42, 43, 44, 45, 46],
 [48, 62],
 [63, 64],
 [81, 84],
 [82, 85],
 [83, 86],
 [0, 1, 2, 3, 4],
 [32, 34, 35, 36, 37, 30, 31],
 [49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60],
 [72, 73, 74, 77, 78, 79],
 [96, 97, 87, 88, 89, 90, 91, 92, 93, 94, 95]]

In [11]:
## Draw integrated textboxes.
temp_stats = filtered_stats.copy()
for tree in word_trees:
    # Fold every box into its successor: the earlier row is zeroed and the
    # later row receives the union, so the last id of the tree ends up holding
    # the bounding box of the whole group.
    for id1, id2 in zip(tree, tree[1:]):
        l1, t1, w1, h1 = temp_stats[id1, :4]
        l2, t2, w2, h2 = temp_stats[id2, :4]
        new_left = min(l1, l2)
        new_top = min(t1, t2)
        new_width = max(l1 + w1, l2 + w2) - new_left
        new_height = max(t1 + h1, t2 + h2) - new_top
        temp_stats[id1] = [0, ] * 5
        temp_stats[id2] = (new_left, new_top, new_width, new_height, new_width * new_height)

# Rows that were folded away are all-zero; remove them.
indexes = np.where(~temp_stats.any(axis=1))
print("BEFORE filtering: ", temp_stats.shape)
print("Removed edges: ", indexes)
temp_stats = np.delete(temp_stats, indexes, axis=0)
print("AFTER filtering: ", temp_stats.shape)

# One random (b, g, r) colour per original label; the first entry is forced to black.
colors = [tuple(np.random.randint(0, 256) for _ in range(3)) for _ in range(num_labels)]
colors[0] = (0, 0, 0)

image = np.copy(src)
for t, (x, y, w, h, area) in enumerate(temp_stats):
    # draw the bounding rectangle and its index
    cv2.rectangle(image, (x, y), (x+w, y+h), colors[t], 1, 8, 0)
    cv2.putText(image, str(t), (x, y), cv2.FONT_HERSHEY_SIMPLEX, .35, (0, 0, 255), 1)

cv2.imshow("combined", image)
cv2.waitKey(0)
cv2.destroyAllWindows()

BEFORE filtering:  (98, 5)
Removed edges:  (array([ 0,  1,  2,  3,  5,  6,  8,  9, 10, 11, 12, 13, 14, 15, 16, 18, 20,
       21, 22, 23, 24, 25, 28, 30, 32, 33, 34, 35, 36, 37, 39, 40, 41, 42,
       43, 44, 45, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 63, 65,
       66, 67, 68, 69, 72, 73, 74, 77, 78, 81, 82, 83, 87, 88, 89, 90, 91,
       92, 93, 94, 96, 97], dtype=int64),)
AFTER filtering:  (25, 5)


In [12]:
# cv2.imwrite('./bounded.png', image)

## Tesseract OCR

In [13]:
## Filter out all the textual type data, assign None to other non-detectable textboxes.
normal_binary = 255 - binary_
text_list = []
conf_list = []
for i in range(temp_stats.shape[0]):
    best_conf = -INF
    best_word = None
    l, t, w, h = temp_stats[i, :4]
    # Crop the box with a 1px margin, clamped to the image bounds.
    text_box = normal_binary[max(0, min(normal_binary.shape[0], t-1)):t+h+1, max(0, l-1):min(normal_binary.shape[1], l+w+1)]
    # Try the crop as-is, rotated 90 degrees clockwise and rotated 180 degrees.
    rotate_text_boxes = text_box, cv2.rotate(text_box, cv2.ROTATE_90_CLOCKWISE), cv2.rotate(text_box, cv2.ROTATE_180)
    for candidate in rotate_text_boxes:
        enlarged = cv2.resize(candidate, dsize=None, fx=3, fy=3, interpolation=cv2.INTER_LINEAR)
        # The thresholded image is single-channel, and COLOR_BGR2RGB only
        # accepts 3- or 4-channel input, so pick the conversion by shape.
        to_rgb = cv2.COLOR_GRAY2RGB if enlarged.ndim == 2 else cv2.COLOR_BGR2RGB
        textbox = Image.fromarray(cv2.cvtColor(enlarged, to_rgb))
        text = pytesseract.image_to_data(textbox, output_type=Output.DICT, config='--psm 6')
        # Keep (word, confidence) pairs together. Removing '' and '-1' from the
        # two lists independently breaks when confidences are numeric rather
        # than strings, and can misalign words and confidences.
        words = [(word, float(conf)) for word, conf in zip(text['text'], text['conf']) if word != '']
        # Only accept a rotation for which exactly one word was recognized.
        if len(words) == 1:
            word, conf = words[0]
            if best_conf < conf:
                best_conf = conf
                best_word = word
    print(best_word)
    text_list.append((temp_stats[i, 0], temp_stats[i, 1], temp_stats[i, 2], temp_stats[i, 3], best_word))
    conf_list.append(best_conf)

site
40
Crookston
@
Duluth
Records
30
Rapids
@
Morris
@
University
@
of
20
Waseca
@
1
O
Number
0
20
40
60
BIN(yield)


In [14]:
## Filter numerical type data from the data that were marked as None.
config = r'-c tessedit_char_whitelist=0123456789 --psm 6'
# Indices of the boxes whose text is still None. A plain scan is used instead
# of comparing np.array(text_list) with None, whose result depends on the dtype
# numpy infers for the mixed tuples.
unresolved = [idx for idx, entry in enumerate(text_list) if entry[4] is None]
for i in unresolved:
    best_conf = -INF
    best_word = None
    l, t, w, h = temp_stats[i, :4]
    # Crop the box with a 1px margin, clamped to the image bounds.
    text_box = normal_binary[max(0, min(normal_binary.shape[0], t-1)):t+h+1, max(0, l-1):min(normal_binary.shape[1], l+w+1)]
    # Try all four orientations of the crop.
    rotate_text_boxes = text_box, cv2.rotate(text_box, cv2.ROTATE_90_CLOCKWISE), cv2.rotate(text_box, cv2.ROTATE_90_COUNTERCLOCKWISE), cv2.rotate(text_box, cv2.ROTATE_180)
    for candidate in rotate_text_boxes:
        enlarged = cv2.resize(candidate, dsize=None, fx=3, fy=3, interpolation=cv2.INTER_LINEAR)
        # The thresholded image is single-channel, and COLOR_BGR2RGB only
        # accepts 3- or 4-channel input, so pick the conversion by shape.
        to_rgb = cv2.COLOR_GRAY2RGB if enlarged.ndim == 2 else cv2.COLOR_BGR2RGB
        textbox = Image.fromarray(cv2.cvtColor(enlarged, to_rgb))
        text = pytesseract.image_to_data(textbox, output_type=Output.DICT, config=config)
        # Keep (word, confidence) pairs together. Removing '' and '-1' from the
        # two lists independently breaks when confidences are numeric rather
        # than strings, and can misalign words and confidences.
        words = [(word, float(conf)) for word, conf in zip(text['text'], text['conf']) if word != '']
        # Only accept a rotation for which exactly one word was recognized.
        if len(words) == 1:
            word, conf = words[0]
            if best_conf < conf:
                best_conf = conf
                best_word = word
    print(best_word)
    text_list[i] = (temp_stats[i, 0], temp_stats[i, 1], temp_stats[i, 2], temp_stats[i, 3], best_word)
    conf_list[i] = best_conf

In [15]:
## Final detected text list: one (x, y, width, height, text) tuple per merged text box
text_list

[(395, 3, 80, 36, 'site'),
 (49, 32, 37, 27, '40'),
 (419, 56, 125, 22, 'Crookston'),
 (401, 63, 11, 11, '@'),
 (420, 91, 78, 21, 'Duluth'),
 (0, 16, 33, 163, 'Records'),
 (49, 126, 37, 27, '30'),
 (421, 126, 84, 27, 'Rapids'),
 (401, 133, 11, 11, '@'),
 (420, 161, 76, 21, 'Morris'),
 (401, 168, 11, 11, '@'),
 (421, 196, 120, 27, 'University'),
 (401, 203, 11, 10, '@'),
 (0, 194, 33, 39, 'of'),
 (49, 220, 37, 26, '20'),
 (419, 231, 99, 22, 'Waseca'),
 (401, 238, 11, 11, '@'),
 (51, 312, 10, 26, '1'),
 (68, 312, 18, 26, 'O'),
 (0, 245, 33, 156, 'Number'),
 (68, 405, 18, 27, '0'),
 (157, 432, 26, 37, '20'),
 (297, 432, 26, 37, '40'),
 (437, 432, 26, 37, '60'),
 (214, 480, 192, 40, 'BIN(yield)')]

In [16]:
## Final detected confidence level list: best confidence per text box, in the same order as text_list
## (an entry stays at -inf when no rotation produced exactly one word)
conf_list

[96.226944,
 96.144012,
 96.159698,
 82.355499,
 96.278496,
 85.696266,
 96.639877,
 96.570541,
 91.821976,
 95.056419,
 20.708656,
 96.60305,
 55.109879,
 89.049118,
 96.117836,
 94.641388,
 48.042492,
 95.638756,
 71.542084,
 57.900421,
 81.672318,
 94.406685,
 96.540146,
 57.313164,
 90.315033]

In [17]:
## Drop the boxes that were recognized as '@'
# Converting the tuples to one ndarray coerces every field to a common dtype.
text_arrays = np.array(text_list)
is_not_at_sign = text_arrays[:, 4] != '@'
text_arrays = text_arrays[is_not_at_sign]
text_arrays

array([['395', '3', '80', '36', 'site'],
       ['49', '32', '37', '27', '40'],
       ['419', '56', '125', '22', 'Crookston'],
       ['420', '91', '78', '21', 'Duluth'],
       ['0', '16', '33', '163', 'Records'],
       ['49', '126', '37', '27', '30'],
       ['421', '126', '84', '27', 'Rapids'],
       ['420', '161', '76', '21', 'Morris'],
       ['421', '196', '120', '27', 'University'],
       ['0', '194', '33', '39', 'of'],
       ['49', '220', '37', '26', '20'],
       ['419', '231', '99', '22', 'Waseca'],
       ['51', '312', '10', '26', '1'],
       ['68', '312', '18', '26', 'O'],
       ['0', '245', '33', '156', 'Number'],
       ['68', '405', '18', '27', '0'],
       ['157', '432', '26', '37', '20'],
       ['297', '432', '26', '37', '40'],
       ['437', '432', '26', '37', '60'],
       ['214', '480', '192', '40', 'BIN(yield)']], dtype='<U11')

In [18]:
## Tabulate the detected texts: id, box geometry and recognized string
text_infos = pd.DataFrame(text_arrays, columns=['x', 'y', 'width', 'height', 'text'])
# The geometry columns arrive as strings (text_arrays is a single-dtype array);
# cast them back to integers so they can be used numerically.
text_infos = text_infos.astype({'x': int, 'y': int, 'width': int, 'height': int})
# Put a running id in the first column.
text_infos.insert(0, "id", text_infos.index)
text_infos

Unnamed: 0,id,x,y,width,height,text
0,0,395,3,80,36,site
1,1,49,32,37,27,40
2,2,419,56,125,22,Crookston
3,3,420,91,78,21,Duluth
4,4,0,16,33,163,Records
5,5,49,126,37,27,30
6,6,421,126,84,27,Rapids
7,7,420,161,76,21,Morris
8,8,421,196,120,27,University
9,9,0,194,33,39,of


In [19]:
#text_infos.to_csv('./bounded-pred2-texts.csv')

In [20]:
# NOTE(review): `rev` is not imported in any visible cell — presumably it was imported in a
# cell that is not shown; confirm, otherwise this raises NameError on a fresh kernel.
# NOTE(review): 'bounded.png' is only written by the commented-out cv2.imwrite call above.
chart = rev.Chart(fn='bounded.png', text_from=2)
chart

<rev.chart.Chart at 0x13b68d49898>

In [21]:
# text role classification
# NOTE(review): the recorded output of this cell is "KeyError: 232", i.e. the
# classifier step currently fails; the cause is not visible from this notebook.
text_clf = rev.text.TextClassifier('default')
text_type_preds = text_clf.classify(chart)
print(text_type_preds)

KeyError: 232

In [6]:
from sklearn.externals import joblib
from sklearn import svm
import classifier

ImportError: attempted relative import with no known parent package

In [26]:
# Paths of the serialized text-role classifiers, keyed by model name.
model_files = {
    'default': 'models/text_role_classifier/text_type_classifier.pkl',
    'testing': 'models/text_role_classifier/text_type_classifier_new.pkl'
}
model_file = model_files['default']
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code — only load model files from a trusted source.
joblib.load(model_file)



Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('svc', SVC(C=100, cache_size=200, class_weight='balanced', coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma=0.1, kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False))])

In [29]:
# NOTE(review): the recorded output of this cell is "KeyError: 232"; constructing the
# classifier fails here and the cause is not visible from this notebook.
text_clf = rev.text.classifier.TextClassifier('default')

KeyError: 232