# Data Prep

In [1]:
!pip install scikit-image



In [2]:
import numpy as np
import os
from PIL import Image
import skimage.transform as st
import pandas as pd

In [3]:
base_path = "data"
words_list = []

words = open(f"{base_path}/words.txt", "r").readlines()
for line in words:
    if line[0] == "#":
        continue
    if line.split(" ")[1] != "err":  # We don't need to deal with errored entries.
        words_list.append(line)

np.random.shuffle(words_list)

# don't wanna be here all night
split_idx = int(0.9 * len(words_list))
train_samples = words_list[:split_idx]
test_samples = words_list[split_idx:]


val_split_idx = int(0.5 * len(test_samples))
validation_samples = test_samples[:val_split_idx]
test_samples = test_samples[val_split_idx:]

#assert len(words_list) == len(train_samples) + len(validation_samples) + len(
#    test_samples
#)

print(f"Total training samples: {len(train_samples)}")
print(f"Total validation samples: {len(validation_samples)}")
print(f"Total test samples: {len(test_samples)}")


Total training samples: 86810
Total validation samples: 4823
Total test samples: 4823


# Data Prep

In [4]:
base_image_path = os.path.join(base_path, "words")


def get_image_paths_and_labels(samples):
    paths = []
    corrected_samples = []
    for (i, file_line) in enumerate(samples):
        line_split = file_line.strip()
        line_split = line_split.split(" ")

        # Each line split will have this format for the corresponding image:
        # part1/part1-part2/part1-part2-part3.png
        image_name = line_split[0]
        partI = image_name.split("-")[0]
        partII = image_name.split("-")[1]
        img_path = os.path.join(
            base_image_path, partI, partI + "-" + partII, image_name + ".png"
        )
        if os.path.getsize(img_path):
            paths.append(img_path)
            corrected_samples.append(file_line.split("\n")[0])

    return paths, corrected_samples


train_img_paths, train_labels = get_image_paths_and_labels(train_samples)
validation_img_paths, validation_labels = get_image_paths_and_labels(validation_samples)
test_img_paths, test_labels = get_image_paths_and_labels(test_samples)

"""
Then we prepare the ground-truth labels.
"""

# Find maximum length and the size of the vocabulary in the training data.
train_labels_cleaned = []
characters = set()
max_len = 0

for label in train_labels:
    label = label.split(" ")[-1].strip()
    for char in label:
        characters.add(char)

    max_len = max(max_len, len(label))
    train_labels_cleaned.append(label)

characters = sorted(list(characters))

print("Maximum length: ", max_len)
print("Vocab size: ", len(characters))

# Check some label samples.
train_labels_cleaned[:10]

Maximum length:  21
Vocab size:  78


['should', '.', 'by', 'a', 'not', 'she', 'warm-hearted', 'in', 'who', 'he']

In [5]:
def clean_labels(labels):
    cleaned_labels = []
    for label in labels:
        label = label.split(" ")[-1].strip()
        cleaned_labels.append(label)
    return cleaned_labels

validation_labels_cleaned = clean_labels(validation_labels)
test_labels_cleaned = clean_labels(test_labels)

In [6]:
"""most popular labels"""

dict = {}
for x in train_labels_cleaned:
    if x in dict:        
        dict[x] += 1
    else:
        dict[x] = 1
sorted_dict = {key: value for (key, value) in sorted(dict.items(), key=lambda x: x[1], reverse=True)}
most_popular = list(sorted_dict.keys())[:15]
most_popular

['the',
 ',',
 '.',
 'of',
 'to',
 'and',
 'a',
 'in',
 '"',
 'that',
 'was',
 'is',
 'he',
 'for',
 'had']

In [7]:
indexes_popular = [i for i in range(len(train_labels_cleaned)) if train_labels_cleaned[i] in most_popular][:1000]

X_train_preprocessing = np.take(train_img_paths, indexes_popular)
Y_train = np.take(train_labels_cleaned, indexes_popular)

In [8]:
"""resizing image"""
def resize(im):
    size = 50, 50
    im = st.resize(im, (8, 8))
    return im


"""converting images to numpy arrays"""
def convert_image(array_image_paths):
    output = np.empty((len(array_image_paths), 64)) 
    
    for i in range(len(array_image_paths)):
        img = Image.open(array_image_paths[i])
        numpydata = np.asarray(img)
        numpydata = resize(numpydata)
        numpydata = numpydata.reshape(1, 64)
        output[i] = numpydata
        break
    return output

X_train = convert_image(X_train_preprocessing)
X_train

array([[7.56617647e-001, 6.45710784e-001, 7.70098039e-001, ...,
        6.30514706e-001, 7.44485294e-001, 8.83088235e-001],
       [2.31297541e-312, 2.20687562e-312, 2.05833592e-312, ...,
        0.00000000e+000, 2.20687562e-312, 5.48412867e-322],
       [0.00000000e+000, 0.00000000e+000, 0.00000000e+000, ...,
        4.79243676e-322, 0.00000000e+000, 0.00000000e+000],
       ...,
       [2.20687562e-312, 2.44029516e-312, 0.00000000e+000, ...,
        0.00000000e+000, 0.00000000e+000, 1.78247646e-312],
       [2.35541533e-312, 2.18565567e-312, 5.13828272e-322, ...,
        0.00000000e+000, 2.05833592e-312, 0.00000000e+000],
       [0.00000000e+000, 0.00000000e+000, 0.00000000e+000, ...,
        1.54905693e-312, 0.00000000e+000, 0.00000000e+000]])

In [9]:
from pycaret.classification import *

df = pd.DataFrame(X_train)
df["target"] = Y_train
s = setup(df, target = 'target')

Unnamed: 0,Description,Value
0,session_id,1379
1,Target,target
2,Target Type,Multiclass
3,Label Encoded,""": 0, ,: 1, .: 2, a: 3, and: 4, for: 5, had: 6, he: 7, in: 8, is: 9, of: 10, that: 11, the: 12, to: 13, was: 14"
4,Original Data,"(1000, 65)"
5,Missing Values,False
6,Numeric Features,64
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


In [10]:
compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
lr,Logistic Regression,0.1645,0.5,0.0667,0.0271,0.0465,0.0,0.0,1.425
dt,Decision Tree Classifier,0.1645,0.5,0.0667,0.0271,0.0465,0.0,0.0,0.042
ridge,Ridge Classifier,0.1645,0.0,0.0667,0.0271,0.0465,0.0,0.0,0.038
rf,Random Forest Classifier,0.1645,0.5,0.0667,0.0271,0.0465,0.0,0.0,0.454
ada,Ada Boost Classifier,0.1645,0.5,0.0667,0.0271,0.0465,0.0,0.0,0.313
gbc,Gradient Boosting Classifier,0.1645,0.5,0.0667,0.0271,0.0465,0.0,0.0,2.471
et,Extra Trees Classifier,0.1645,0.5,0.0667,0.0271,0.0465,0.0,0.0,0.312
lightgbm,Light Gradient Boosting Machine,0.1645,0.5,0.0667,0.0271,0.0465,0.0,0.0,0.128
dummy,Dummy Classifier,0.1645,0.5,0.0667,0.0271,0.0465,0.0,0.0,0.038
knn,K Neighbors Classifier,0.1302,0.5,0.0667,0.0182,0.0318,0.0,0.0,0.121


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=1379, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [11]:
import warnings
warnings.filterwarnings('ignore')

  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


 ** On entry to DGESDDM parameter number 10 had an illegal value
 ** On entry to DGESDDM parameter number 10 had an illegal value
 ** On entry to DGESDDM parameter number 10 had an illegal value
 ** On entry to DGESDDM parameter number 10 had an illegal value
 ** On entry to DGESDDM parameter number 10 had an illegal value


  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


 ** On entry to DGESDDM parameter number 10 had an illegal value
 ** On entry to DGESDDM parameter number 10 had an illegal value
 ** On entry to DGESDDM parameter number 10 had an illegal value
 ** On entry to DGESDDM parameter number 10 had an illegal value
 ** On entry to DGESDDM parameter number 10 had an illegal value


  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)
  _warn_prf(average, modifier, msg_start, len(result))
  mcc = cov_ytyp / np.sqrt(cov_ytyt * cov_ypyp)


 ** On entry to DGESDDM parameter number 10 had an illegal value
 ** On entry to DGESDDM parameter number 10 had an illegal value
 ** On entry to DGESDDM parameter number 10 had an illegal value
 ** On entry to DGESDDM parameter number 10 had an illegal value
 ** On entry to DGESDDM parameter number 10 had an illegal value


Traceback (most recent call last):
  File "/opt/conda/envs/pycaret/lib/python3.8/site-packages/sklearn/model_selection/_validation.py", line 531, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/opt/conda/envs/pycaret/lib/python3.8/site-packages/pycaret/internal/pipeline.py", line 118, in fit
    result = super().fit(X, y=y, **fit_kwargs)
  File "/opt/conda/envs/pycaret/lib/python3.8/site-packages/imblearn/pipeline.py", line 281, in fit
    self._final_estimator.fit(Xt, yt, **fit_params)
  File "/opt/conda/envs/pycaret/lib/python3.8/site-packages/sklearn/discriminant_analysis.py", line 464, in fit
    self._solve_svd(X, y)
  File "/opt/conda/envs/pycaret/lib/python3.8/site-packages/sklearn/discriminant_analysis.py", line 394, in _solve_svd
    _, S, V = linalg.svd(X, full_matrices=0)
  File "/opt/conda/envs/pycaret/lib/python3.8/site-packages/scipy/linalg/decomp_svd.py", line 121, in svd
    lwork = _compute_lwork(gesXd_lwork, a1.shape[0], a1.shape[1],
  Fil

 ** On entry to DGESDDM parameter number 10 had an illegal value
 ** On entry to DGESDDM parameter number 10 had an illegal value
 ** On entry to DGESDDM parameter number 10 had an illegal value
 ** On entry to DGESDDM parameter number 10 had an illegal value
 ** On entry to DGESDDM parameter number 10 had an illegal value
