In [1]:
# ML Project Logistic Regression Models

import sklearn
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split 
import numpy as np

import os
import os.path
import glob
from PIL import Image, ImageOps
from numpy import asarray
import random

CAPTCHA_IMAGE_FOLDER = "datasets" # the correct folder to store the dataset
SAMPLE_SIZE = 1000  # upper bound on how many captcha images are loaded
SAMPLE_SEED = 42    # fixed seed so the same subset is drawn on every run

# Get a list of all the captcha images we need to process.
# The list is sorted because glob gives no ordering guarantee; together with
# the seeded sampler below this makes the chosen subset reproducible.
captcha_image_files = sorted(glob.glob(os.path.join(CAPTCHA_IMAGE_FOLDER, "*")))

# random.sample raises ValueError when asked for more items than exist, so the
# sample size is capped at the number of files actually found.
captcha_image_files = random.Random(SAMPLE_SEED).sample(
    captcha_image_files, min(SAMPLE_SIZE, len(captcha_image_files))
)

captchas = []
captchas_y = []
for captcha_image_file in captcha_image_files:
    # The context manager releases the underlying file handle after each image.
    with Image.open(captcha_image_file) as image:
        # Feature Transformation 1 (disabled):
        #   shrink the image by 2 horizontally and vertically
        #     image = image.resize((image.width//2,image.height//2))
        #
        # Feature Transformation 2 (disabled):
        #     gray_image = ImageOps.grayscale(image)

        # Force three channels so RGBA / palette / grayscale files flatten to
        # the same height*width*3 layout as plain RGB files.
        data = np.asarray(image.convert("RGB"))

    # One flat feature vector per image.
    # NOTE(review): np.array(captchas) below only yields a 2-D matrix if every
    # image has the same dimensions -- confirm for the dataset in use.
    captchas.append(data.reshape(-1))

    # The label is the file name up to its first dot, independent of the
    # path separator or how deeply the dataset folder is nested.
    captchas_y.append(os.path.basename(captcha_image_file).split(".")[0])
captchas = np.array(captchas)
captchas_y = np.array(captchas_y)

In [2]:
# Hold out 25% of the samples for testing; the fixed random_state makes the
# partition repeatable for a given input order.
split = train_test_split(captchas, captchas_y, test_size=0.25, random_state=42)
X_train, X_test, y_train, y_test = split

In [3]:
# One binary classifier per (character position, candidate character).
def logistic_regression(captchas_train_x, captchas_train_y):
    """Fit a binary logistic regression for every position/character pair.

    For each of the 5 character positions and each of the 61 candidate
    characters, a 0/1 target is built (1 where the label has that character
    at that position) and an L1-penalised ``LogisticRegression``
    (``C=0.01``, ``solver="liblinear"``) is fitted on ``captchas_train_x``.

    Other settings that were tried here: ``C=0.01`` with ``penalty="l2"``;
    ``C=10`` with ``penalty='l1'`` and ``solver="liblinear"``;
    ``penalty='none'``.

    Parameters
    ----------
    captchas_train_x
        Feature matrix, passed unchanged to ``fit``.
    captchas_train_y
        Sequence of label strings; each one is indexed at positions 0..4.

    Returns
    -------
    list
        5 lists of 61 entries each, ordered like the character string below.
        An entry is a fitted classifier, or ``-1`` when no training label
        carries that character at that position.
    """
    alphabet = "123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    models_by_position = []

    for position in range(5):
        print("Training the", position, "th character's model")

        # Character found at this position in every training label.
        chars_here = [label[position] for label in captchas_train_y]

        position_models = []
        for candidate in alphabet:
            targets = np.array([1 if seen == candidate else 0 for seen in chars_here])

            # No positive example: nothing to fit, store the sentinel instead.
            if sum(targets) == 0:
                position_models.append(-1)
                continue

            model = LogisticRegression(
                C=0.01,
                penalty='l1',
                solver="liblinear",
                random_state=0,
                max_iter=10000,
            )
            position_models.append(model.fit(captchas_train_x, targets))

        models_by_position.append(position_models)

    return models_by_position

In [4]:
def predict(X_train_1, clfs):
    """Predict one character per position for every sample.

    For each character position, every available classifier scores the whole
    batch in a single ``predict_proba`` call (instead of one call per sample),
    and the character whose classifier reports the highest positive-class
    probability wins. Ties go to the character that comes first in ``models``.

    Parameters
    ----------
    X_train_1
        2-D array-like with one feature vector per row.
    clfs
        Nested list where ``clfs[i][j]`` is the classifier for character
        ``models[j]`` at position ``i``, or ``-1`` when no classifier exists.
        The number of predicted positions equals ``len(clfs)``.

    Returns
    -------
    y_hat : list of str
        Predicted string for each sample, in input order.
    likelihood : float
        Sum, over all samples and positions, of the log of the winning
        probability. No true labels are involved, so this measures how
        confident the models are rather than the likelihood of the labels.
        It is ``-inf`` if any winning probability is 0 (for example when a
        position has no classifier at all). For empty input it is ``0``.
    """
    models = "123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"

    samples = np.asarray(X_train_1)
    n_samples = len(samples)
    if n_samples == 0:
        # Batched estimator calls reject zero-row input; nothing to predict.
        return [], 0

    likelihood = 0.0
    winners_by_position = []  # one array of winning character indices per position
    for position_clfs in clfs:
        # Column j holds P(character j) for every sample; it stays 0 where
        # no classifier was trained for that character.
        probs = np.zeros((n_samples, len(models)))
        for j in range(len(models)):
            clf = position_clfs[j]
            if clf == -1:
                continue
            # Column 1 of predict_proba is the positive ("is this character") class.
            probs[:, j] = clf.predict_proba(samples)[:, 1]

        winners = probs.argmax(axis=1)  # first maximum wins, like list.index
        winners_by_position.append(winners)
        likelihood += np.log(probs[np.arange(n_samples), winners]).sum()

    y_hat = [
        "".join(models[winners[row]] for winners in winners_by_position)
        for row in range(n_samples)
    ]
    return y_hat, likelihood

In [5]:
def accuracy(y_predicted, y_test):
    """Return the fraction of predictions that equal their reference value.

    Both arguments are converted to arrays and compared element-wise, so a
    prediction only counts when the whole entry matches.
    """
    predicted = np.array(y_predicted)
    expected = np.array(y_test)
    hits = np.where(predicted == expected, 1, 0)
    return hits.sum() / hits.shape[0]

In [6]:
# Fit the per-position, per-character classifiers on the training split;
# their coefficients and intercepts are inspected in the next cell.
clfs = logistic_regression(captchas_train_x=X_train, captchas_train_y=y_train)

Training the 0 th character's model
Training the 1 th character's model
Training the 2 th character's model
Training the 3 th character's model
Training the 4 th character's model


In [7]:
# Total of all coefficients and of all intercepts over every fitted
# classifier (5 positions x 61 characters); -1 entries are untrained slots.
coeff = 0
intercept = 0
for position in range(5):
    for char_idx in range(61):
        model = clfs[position][char_idx]
        if model == -1:
            continue
        coeff += sum(model.coef_[0])
        intercept += model.intercept_[0]
print("The sum of coeff:", coeff)
print("The sum of intercept", intercept)

The sum of coeff: -16.054795445804512
The sum of intercept 0.0


In [8]:
# Score the models on the same data they were fitted on.
y_hat_train, likelihood_train = predict(X_train_1=X_train, clfs=clfs)
train_accuracy = accuracy(y_hat_train, y_train)
print("accuracy", train_accuracy)
print("likelihood on the training set", likelihood_train)

accuracy 1.0
likelihood on the training set -884.9291212926576


In [9]:
# Score the models on the held-out split; the cell displays the share of
# captchas whose whole predicted string equals the label.
# NOTE(review): the recorded result is 0.0 here versus 1.0 on the training
# split, which looks like overfitting -- a per-character accuracy would show
# how far off the predictions really are.
y_hat, likelihood_test = predict(X_train_1=X_test, clfs=clfs)
accuracy(y_hat, y_test)

0.0