# Analysis of getting text from instagram images

Note: 
- Need to use home brew on mac to get tessaract for the pytesseract package. To get the executable location, use `homebrew info tesseract` command

In [1]:
# text recognition
from matplotlib import pyplot as plt
import cv2
import pytesseract
from pytesseract import Output
import re
import os 
import pandas as pd
# configurations
config = ('-l eng --oem 1 --psm 3')
# pytesserct executable location 
pytesseract.pytesseract.tesseract_cmd = r'/opt/homebrew/bin/tesseract'

In [2]:
# crop image to just the text box of the image for processing
def preprocess_image(img):
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    img = img_rgb[160:950,100:875]
    return img

In [3]:
# Clean text after extraction from dictionary output into one string line
def cleaning_text(text):
    # Remove all instances of empty strings in the list 
    combined_string = [x for x in text if x != '']
    # Combine the list
    text = ' '.join(combined_string)
    # Spit the string again to standardize the spacing of the data
    result2 = text.split()
    # Join the data back together
    return ' '.join(result2)

In [4]:
# assign directory
directory = 'yikyakyeo/image_type_1/'
data_dict = {"image_name":[], "text":[], "image_type_folder":[]}

In [5]:
# iterate over files in
# that directory
for filename in os.listdir(directory):
    f = os.path.join(directory, filename)
    # checking if it is a file
    if os.path.isfile(f):
        print(f)
        img = cv2.imread(f)
        img = preprocess_image(img)
        d = pytesseract.image_to_data(img, output_type=Output.DICT)
        result = cleaning_text(d["text"])
        data_dict["image_name"].append(filename)
        data_dict["text"].append(result)
        data_dict["image_type_folder"].append(directory)


yikyakyeo/image_type_1/2022-01-23_02-08-42_UTC_5.jpg
yikyakyeo/image_type_1/2022-03-15_17-38-12_UTC_8.jpg
yikyakyeo/image_type_1/2022-03-12_01-47-55_UTC_2.jpg
yikyakyeo/image_type_1/2022-10-06_03-53-13_UTC_6.jpg
yikyakyeo/image_type_1/2022-03-02_19-15-28_UTC_3.jpg
yikyakyeo/image_type_1/2022-05-20_20-41-13_UTC_9.jpg
yikyakyeo/image_type_1/2022-05-24_18-54-30_UTC_2.jpg
yikyakyeo/image_type_1/2022-02-23_17-38-33_UTC_4.jpg
yikyakyeo/image_type_1/2022-05-22_12-24-15_UTC_3.jpg
yikyakyeo/image_type_1/2022-01-30_01-06-22_UTC_9.jpg
yikyakyeo/image_type_1/2022-09-28_01-12-39_UTC_7.jpg
yikyakyeo/image_type_1/2022-03-27_04-28-56_UTC_4.jpg
yikyakyeo/image_type_1/2022-09-07_22-04-39_UTC_2.jpg
yikyakyeo/image_type_1/2022-05-11_02-26-58_UTC_2.jpg
yikyakyeo/image_type_1/2022-03-09_03-19-19_UTC_4.jpg
yikyakyeo/image_type_1/2022-03-21_19-11-37_UTC_8.jpg
yikyakyeo/image_type_1/2022-03-03_16-01-26_UTC_2.jpg
yikyakyeo/image_type_1/2022-01-29_00-53-24_UTC_6.jpg
yikyakyeo/image_type_1/2022-03-30_20-13-53_UTC

In [7]:
def preprocess_image2(img):
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    img = img_rgb[300:,:]
    return img

In [8]:
# iterate over files in
# that directory
# assign directory
directory = 'yikyakyeo/image_type_2/'
for filename in os.listdir(directory):
    f = os.path.join(directory, filename)
    # checking if it is a file
    if os.path.isfile(f):
        print(f)
        img = cv2.imread(f)
        img = preprocess_image2(img)
        d = pytesseract.image_to_data(img, output_type=Output.DICT)
        result = cleaning_text(d["text"])
        data_dict["image_name"].append(filename)
        data_dict["text"].append(result)
        data_dict["image_type_folder"].append(directory)

yikyakyeo/image_type_2/2022-04-08_01-03-18_UTC_6.jpg
yikyakyeo/image_type_2/2022-04-08_01-03-18_UTC_10.jpg
yikyakyeo/image_type_2/2022-04-05_00-57-50_UTC_1.jpg
yikyakyeo/image_type_2/2022-04-07_13-10-21_UTC_1.jpg
yikyakyeo/image_type_2/2022-01-28_12-12-52_UTC_4.jpg
yikyakyeo/image_type_2/2022-01-28_12-12-52_UTC_5.jpg
yikyakyeo/image_type_2/2022-04-08_01-03-18_UTC_7.jpg
yikyakyeo/image_type_2/2022-04-08_01-03-18_UTC_5.jpg
yikyakyeo/image_type_2/2022-04-07_13-10-21_UTC_2.jpg
yikyakyeo/image_type_2/2022-04-05_00-57-50_UTC_2.jpg
yikyakyeo/image_type_2/2022-01-28_12-12-52_UTC_6.jpg
yikyakyeo/image_type_2/2022-04-05_00-57-50_UTC_3.jpg
yikyakyeo/image_type_2/2022-04-08_01-03-18_UTC_4.jpg
yikyakyeo/image_type_2/2022-04-07_13-10-21_UTC_7.jpg
yikyakyeo/image_type_2/2022-04-05_00-57-50_UTC_7.jpg
yikyakyeo/image_type_2/2022-04-07_13-10-21_UTC_10.jpg
yikyakyeo/image_type_2/2022-01-28_12-12-52_UTC_2.jpg
yikyakyeo/image_type_2/2022-01-28_12-12-52_UTC_3.jpg
yikyakyeo/image_type_2/2022-04-05_20-52-04_U

In [9]:
def preprocess_image3(img):
    img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    img = img_rgb[300:,:]
    return img

In [10]:
# iterate over files in
# that directory
# assign directory
directory = 'yikyakyeo/image_type_3/'
for filename in os.listdir(directory):
    f = os.path.join(directory, filename)
    # checking if it is a file
    if os.path.isfile(f):
        print(f)
        img = cv2.imread(f)
        img = preprocess_image3(img)
        d = pytesseract.image_to_data(img, output_type=Output.DICT)
        result = cleaning_text(d["text"])
        data_dict["image_name"].append(filename)
        data_dict["text"].append(result)
        data_dict["image_type_folder"].append(directory)

yikyakyeo/image_type_3/2022-09-27_01-22-09_UTC_6.jpg
yikyakyeo/image_type_3/2022-10-07_01-43-46_UTC_4.jpg
yikyakyeo/image_type_3/2022-09-25_01-14-40_UTC_8.jpg
yikyakyeo/image_type_3/2022-10-10_06-07-58_UTC_7.jpg
yikyakyeo/image_type_3/2022-10-02_00-10-52_UTC_4.jpg
yikyakyeo/image_type_3/2022-09-23_01-16-00_UTC_8.jpg
yikyakyeo/image_type_3/2022-09-23_01-16-00_UTC_9.jpg
yikyakyeo/image_type_3/2022-10-02_00-10-52_UTC_5.jpg
yikyakyeo/image_type_3/2022-10-10_06-07-58_UTC_6.jpg
yikyakyeo/image_type_3/2022-09-25_01-14-40_UTC_9.jpg
yikyakyeo/image_type_3/2022-10-07_01-43-46_UTC_5.jpg
yikyakyeo/image_type_3/2022-09-27_01-22-09_UTC_7.jpg
yikyakyeo/image_type_3/2022-09-27_01-22-09_UTC_5.jpg
yikyakyeo/image_type_3/2022-09-29_02-37-33_UTC_4.jpg
yikyakyeo/image_type_3/2022-10-07_01-43-46_UTC_7.jpg
yikyakyeo/image_type_3/2022-02-19_04-28-01_UTC_9.jpg
yikyakyeo/image_type_3/2022-10-02_00-10-52_UTC_7.jpg
yikyakyeo/image_type_3/2022-10-10_06-07-58_UTC_4.jpg
yikyakyeo/image_type_3/2022-10-10_01-30-44_UTC

In [12]:
df = pd.DataFrame(data_dict)

In [14]:
df.count()

image_name           2342
text                 2342
image_type_folder    2342
dtype: int64

In [11]:
df.to_csv("yikyakyeo/data_sets/yikyakyeo.csv")