# Word detection

## Libraries required

In [1]:
# Optional one-time setup: uncomment the lines below to install the Python
# packages this notebook imports.
# !pip install Pillow
# !pip install pytesseract
# !pip install opencv-python

# The bare string below is the last expression of the cell, so the notebook
# echoes it back as the cell output.
# NOTE(review): a later cell points pytesseract at
# C:\Program Files\Tesseract-OCR\tesseract.exe rather than at this folder --
# confirm which install location is actually intended.
'''
For pytesseract please install exe file in the same folder
'''

'\nFor pytesseract please install exe file in the same folder\n'

In [2]:
import os
import urllib.request

import numpy as np
import pandas as pd
import pytesseract
from pytesseract import Output

try:
    from PIL import Image
except ImportError:
    import Image

In [3]:
# Location of the Tesseract executable used by pytesseract. Set the
# TESSERACT_CMD environment variable to point at a different install; when it
# is unset the original Windows default path is used unchanged.
pytesseract.pytesseract.tesseract_cmd = os.environ.get(
    'TESSERACT_CMD',
    'C:\\Program Files\\Tesseract-OCR\\tesseract.exe',
)
# Last expression of the cell: displays the detected Tesseract version, which
# doubles as a check that the executable path above is usable.
pytesseract.get_tesseract_version()

LooseVersion ('5.0.0.20190623')

In [4]:
# from os import listdir
# from os.path import isfile, join
# onlyfiles = [f for f in listdir('./') if isfile(join('./', f))]
# print(onlyfiles)

In [5]:
import cv2

# get grayscale image
def get_grayscale(image):
    """Return ``image`` converted with OpenCV's BGR-to-grayscale colour code.

    Args:
        image: Image array handed straight to ``cv2.cvtColor``.

    Returns:
        The converted image returned by ``cv2.cvtColor``.
    """
    conversion_code = cv2.COLOR_BGR2GRAY
    gray = cv2.cvtColor(image, conversion_code)
    return gray

# noise removal
def remove_noise(image, ksize=5):
    """Reduce noise in ``image`` with a median blur.

    Args:
        image: Image array handed straight to ``cv2.medianBlur``.
        ksize: Aperture size of the median filter. Defaults to 5, the value
            that used to be hard-coded, so existing callers are unaffected.
            The OpenCV documentation requires an odd value greater than 1.

    Returns:
        The blurred image returned by ``cv2.medianBlur``.
    """
    return cv2.medianBlur(image, ksize)
 
#thresholding
def thresholding(image):
    """Binarise ``image`` using a binary threshold combined with Otsu's flag.

    Args:
        image: Image array handed straight to ``cv2.threshold``.

    Returns:
        The thresholded image, i.e. the second element of the pair returned
        by ``cv2.threshold`` (the first element is discarded).
    """
    mode = cv2.THRESH_BINARY + cv2.THRESH_OTSU
    result = cv2.threshold(image, 0, 255, mode)
    return result[1]

#dilation
def dilate(image, kernel_size=5, iterations=1):
    """Dilate ``image`` with a square structuring element of ones.

    Args:
        image: Image array handed straight to ``cv2.dilate``.
        kernel_size: Side length of the square ``uint8`` kernel. Defaults to
            5, the size that used to be hard-coded.
        iterations: Number of times the dilation is applied. Defaults to 1,
            the value that used to be hard-coded.

    Returns:
        The dilated image returned by ``cv2.dilate``.
    """
    kernel = np.ones((kernel_size, kernel_size), np.uint8)
    return cv2.dilate(image, kernel, iterations=iterations)
    
#erosion
def erode(image, kernel_size=5, iterations=1):
    """Erode ``image`` with a square structuring element of ones.

    Args:
        image: Image array handed straight to ``cv2.erode``.
        kernel_size: Side length of the square ``uint8`` kernel. Defaults to
            5, the size that used to be hard-coded.
        iterations: Number of times the erosion is applied. Defaults to 1,
            the value that used to be hard-coded.

    Returns:
        The eroded image returned by ``cv2.erode``.
    """
    kernel = np.ones((kernel_size, kernel_size), np.uint8)
    return cv2.erode(image, kernel, iterations=iterations)

#opening - erosion followed by dilation
def opening(image, kernel_size=5):
    """Apply a morphological opening (erosion followed by dilation).

    Args:
        image: Image array handed straight to ``cv2.morphologyEx``.
        kernel_size: Side length of the square ``uint8`` kernel of ones.
            Defaults to 5, the size that used to be hard-coded.

    Returns:
        The opened image returned by ``cv2.morphologyEx`` with
        ``cv2.MORPH_OPEN``.
    """
    kernel = np.ones((kernel_size, kernel_size), np.uint8)
    return cv2.morphologyEx(image, cv2.MORPH_OPEN, kernel)

#canny edge detection
def canny(image, threshold1=100, threshold2=200):
    """Run Canny edge detection on ``image``.

    Args:
        image: Image array handed straight to ``cv2.Canny``.
        threshold1: First threshold passed to ``cv2.Canny``. Defaults to 100,
            the value that used to be hard-coded.
        threshold2: Second threshold passed to ``cv2.Canny``. Defaults to 200,
            the value that used to be hard-coded.

    Returns:
        The edge map returned by ``cv2.Canny``.
    """
    return cv2.Canny(image, threshold1, threshold2)

#skew correction
def deskew(image):
    """Rotate ``image`` about its centre to undo the skew of its non-zero pixels.

    The angle is taken from the minimum-area rectangle around every pixel
    whose value is greater than zero; the output keeps the input's width and
    height.

    Args:
        image: Image array; its first two dimensions are read as
            (height, width).

    Returns:
        The rotated image returned by ``cv2.warpAffine``.
    """
    # Coordinates of all non-zero pixels, stacked one point per row.
    foreground_points = np.column_stack(np.where(image > 0))
    rect_angle = cv2.minAreaRect(foreground_points)[-1]

    # NOTE(review): the -45 cut-off presumes minAreaRect reports angles in
    # [-90, 0) -- confirm against the installed OpenCV version.
    rotation = -(90 + rect_angle) if rect_angle < -45 else -rect_angle

    height, width = image.shape[:2]
    pivot = (width // 2, height // 2)
    rotation_matrix = cv2.getRotationMatrix2D(pivot, rotation, 1.0)
    return cv2.warpAffine(
        image,
        rotation_matrix,
        (width, height),
        flags=cv2.INTER_CUBIC,
        borderMode=cv2.BORDER_REPLICATE,
    )

#template matching
def match_template(image, template):
    """Match ``template`` against ``image`` using ``cv2.TM_CCOEFF_NORMED``.

    Args:
        image: Image array to search, passed to ``cv2.matchTemplate``.
        template: Patch to look for, passed to ``cv2.matchTemplate``.

    Returns:
        The result returned by ``cv2.matchTemplate``.
    """
    method = cv2.TM_CCOEFF_NORMED
    scores = cv2.matchTemplate(image, template, method)
    return scores

In [6]:
def url_to_image(url):
    """Download an image and decode it into an OpenCV colour array.

    Args:
        url: Address handed to ``urllib.request.urlopen``.

    Returns:
        The value returned by ``cv2.imdecode(..., cv2.IMREAD_COLOR)``, or
        ``None`` when downloading or decoding raises an exception.
    """
    try:
        # Keep the response under its own name rather than rebinding ``url``.
        with urllib.request.urlopen(url) as response:
            payload = response.read()
        # Wrap the raw bytes in a uint8 array, the buffer form imdecode takes.
        encoded = np.asarray(bytearray(payload), dtype="uint8")
        return cv2.imdecode(encoded, cv2.IMREAD_COLOR)
    except Exception:
        # Deliberately best-effort: a poster that cannot be fetched or decoded
        # is reported as None. Catching Exception instead of using a bare
        # ``except`` lets KeyboardInterrupt/SystemExit propagate, so a long
        # download loop can still be interrupted.
        return None

# Load the poster metadata (the file is read as ISO-8859-1) and pull the
# 'Poster' column out as a plain Python list of URLs.
movie_poster_data = pd.read_csv(
    './MovieGenre.csv',
    encoding="ISO-8859-1",
)
movie_poster_url = movie_poster_data.loc[:, 'Poster']
movie_poster_url_list = movie_poster_url.tolist()

In [7]:
# Number of posters to download; only the head of the URL list is fetched.
POSTER_LIMIT = 100

# One entry per requested URL, in order. url_to_image yields None for posters
# that could not be downloaded, so positions stay aligned with the URL list.
image_list = [
    url_to_image(url)
    for url in movie_poster_url_list[:POSTER_LIMIT]
]

In [8]:
# Start the results table from the identifying columns of the metadata; the
# copy keeps later column additions from touching movie_poster_data.
movie_poster_words = movie_poster_data[
    ['imdbId', 'Imdb Link', 'Title', 'Poster']
].copy()


In [9]:
def _detect_words(image):
    """OCR ``image`` and four preprocessed variants; return the unique tokens.

    Tesseract is run on the raw image, its grayscale version, and the
    thresholded, opened and Canny-edge versions of the grayscale image.

    Args:
        image: Poster image as produced by ``url_to_image``.

    Returns:
        List of distinct 'text' entries reported by
        ``pytesseract.image_to_data``, in first-seen order.
    """
    grayed = get_grayscale(image)
    variants = (
        image,
        grayed,
        thresholding(grayed),
        opening(grayed),
        canny(grayed),
    )
    tokens = []
    for variant in variants:
        tokens += pytesseract.image_to_data(variant, output_type=Output.DICT)['text']
    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # result is reproducible between runs (iterating a set of strings is not).
    return list(dict.fromkeys(tokens))


movie_poster_detected_words = []
for image in image_list:
    if image is None:
        # Download failed earlier: record the same placeholder used for OCR
        # failures so the result list stays aligned with image_list.
        movie_poster_detected_words.append([''])
        continue
    try:
        word_list = _detect_words(image)
    except Exception:
        # Best-effort: one unreadable poster must not abort the whole run.
        # Exception (not a bare except) keeps Ctrl-C / kernel interrupt working.
        movie_poster_detected_words.append([''])
    else:
        print(word_list)
        movie_poster_detected_words.append(word_list)


['', 't', 'as', 'PTY,', 'Hy', 'i', 'xe', '|', 'gh', 'Pty,', '2', 'a', 'x.', 'oA', 'S%', 'ale', 'wee', 'ey', 'SY', '»', 'os', 'CAE']
['']
['', 'OLD', 'a', 'MEN', 'i“', ' ', 'GRUMPIER', 'lee']
['', 'oF', 'oEnbole', 'OD', 'Dhritiog', 'Whitney', 'Bassett', 'Houston', 'Whitnoy', 'Angelo', 'Kiting', '9&', '‘Whitney', 'Mbiting']
['']
['', 'ar', 'a', 'PO', 'an', '£', '“Mins', 'au', 'i', 'ae', '©', '|', 'E']
['', 'has', 'RUIN', 'I)', 'Mera', 'nc', 'E']
['', '   ', 'Ait', 'cies', ' ']
['']
['', 'Destin', 'Saat', ' ']
['', 'ia', 'LA', 'a', 'DRACULA', 'oan', 'ed']
['', 'Pe', '2']
['', 'Lr', 'a', 'ae', '3', 'ital']
['', 'Gem', 'Mette', 'Drvis', 'Modine']
['']
['', 'wilin', 'SENSIBILITY', 'vib', 'SYNSIBILILY,', ' ', 'Ne', 'w', 'D4', 'i', 'SENSE3SENSIBILITY', 'nition']
['']
['', 'Sh', 'de']
['', 'ak)', '|', 'ales']
['', 'ri', 'iat', 'a', 'TL', 'eras', 'A', ' ', ':', 'ea', 'i', 'ee)', 'ees', '5']
['', 'pn', 'Pe', 'Fey', 'ao', 'ew', ' ']
['', 'at', 'vd']
['', ',', 'if}', ':']
['', 'aNd', 'a)', 'ee', 'C

In [10]:
# Keep exactly as many rows as there are OCR results. Deriving the count from
# the result list (instead of a hard-coded 100) prevents a length-mismatch
# error when a different number of posters was processed.
movie_poster_words_temp = movie_poster_words.head(len(movie_poster_detected_words))
# Work on a copy so adding the column does not touch movie_poster_words.
movie_poster_words_trimmed = movie_poster_words_temp.copy()
movie_poster_words_trimmed['detected_words'] = movie_poster_detected_words

In [11]:
# Preview the first 50 rows of the table, including the detected_words column.
movie_poster_words_trimmed.head(n=50)

Unnamed: 0,imdbId,Imdb Link,Title,Poster,detected_words
0,114709,http://www.imdb.com/title/tt114709,Toy Story (1995),https://images-na.ssl-images-amazon.com/images...,"[, t, as, PTY,, Hy, i, xe, |, gh, Pty,, 2, a, ..."
1,113497,http://www.imdb.com/title/tt113497,Jumanji (1995),https://images-na.ssl-images-amazon.com/images...,[]
2,113228,http://www.imdb.com/title/tt113228,Grumpier Old Men (1995),https://images-na.ssl-images-amazon.com/images...,"[, OLD, a, MEN, i“, , GRUMPIER, lee]"
3,114885,http://www.imdb.com/title/tt114885,Waiting to Exhale (1995),https://images-na.ssl-images-amazon.com/images...,"[, oF, oEnbole, OD, Dhritiog, Whitney, Bassett..."
4,113041,http://www.imdb.com/title/tt113041,Father of the Bride Part II (1995),https://images-na.ssl-images-amazon.com/images...,[]
5,113277,http://www.imdb.com/title/tt113277,Heat (1995),https://images-na.ssl-images-amazon.com/images...,"[, ar, a, PO, an, £, “Mins, au, i, ae, ©, |, E]"
6,114319,http://www.imdb.com/title/tt114319,Sabrina (1995),https://images-na.ssl-images-amazon.com/images...,"[, has, RUIN, I), Mera, nc, E]"
7,112302,http://www.imdb.com/title/tt112302,Tom and Huck (1995),https://images-na.ssl-images-amazon.com/images...,[]
8,114576,http://www.imdb.com/title/tt114576,Sudden Death (1995),https://images-na.ssl-images-amazon.com/images...,"[, , Ait, cies, ]"
9,113189,http://www.imdb.com/title/tt113189,GoldenEye (1995),https://images-na.ssl-images-amazon.com/images...,[]


In [12]:
# Persist the OCR results. The target folder is created first because to_csv
# does not create missing directories and would fail on a fresh checkout.
output_csv_path = './pytesseract/movie_poster_detected_words.csv'
os.makedirs(os.path.dirname(output_csv_path), exist_ok=True)
movie_poster_words_trimmed.to_csv(output_csv_path)