In [None]:
%matplotlib inline
%load_ext autoreload
%autoreload 2

import warnings
import json
import os
import pandas as pd
import numpy as np
import json
import util
import matplotlib.pyplot as plt
import skimage #require scikit-image pkg
import sys
import torchvision.models as models
import pickle as pickle
import urllib.request

import seaborn as sns

from PIL import Image
from torch.autograd import Variable
from torch import topk
from PIL import Image
from img2vec import img_to_vec
from torch.nn import functional as F
from sklearn import svm
from sklearn.metrics import confusion_matrix, r2_score
from docopt import docopt
from pprint import pprint
from os import walk
from sklearn.manifold import TSNE
from os import listdir
from os.path import isfile, join
from sklearn.decomposition import PCA
from sklearn.metrics.pairwise import cosine_similarity



from skimage import io as skio
import io

from svm_classifier import predict_test
from test_model import *
from constants import * 
from util import get_splits_csv, get_scores

In [82]:
def get_components(df, perplexity=40, random_state=None):
    """
    Projects the image feature vectors in a dataframe with PCA (four components) and t-SNE (two components).

    Takes as input a dataframe containing the columns image_name, features and rating. The features
    column is turned into a numeric array by features_to_array (presumably it holds the JSON-encoded
    vectors written by extract_image_features -- confirm against that helper).

    perplexity is forwarded to TSNE (default 40, the value this notebook has always used); lower it
    when the dataframe has few rows. random_state is forwarded to both PCA and TSNE; pass an integer
    to make the projection repeatable. The default None keeps the previous unseeded behaviour.

    Outputs a NEW dataframe (the input is not modified) with a default 0..n-1 index and the columns
    pca-one, pca-two, pca-three, pca-four, image_name, rating, features, tsne-2d-one, tsne-2d-two.
    Row i of the output describes row i (by position) of the input.
    Also prints the explained variance ratio of the four principal components.
    """
    feature_array = features_to_array(pd.DataFrame(df.features))

    pca = PCA(n_components=4, random_state=random_state)
    pca_result = pca.fit_transform(feature_array)
    print('Explained variation per principal component: {}'.format(pca.explained_variance_ratio_))

    # NOTE(review): newer scikit-learn releases rename TSNE's `n_iter` to `max_iter`;
    # confirm against the installed version before upgrading.
    tsne = TSNE(n_components=2, verbose=1, perplexity=perplexity, n_iter=300, random_state=random_state)
    tsne_results = tsne.fit_transform(feature_array)

    # The metadata columns are attached by POSITION (.values). Assigning the Series themselves
    # would align them on df's index, which no longer matches 0..n-1 once the caller has
    # filtered rows (e.g. with dropna()), silently pairing components with the wrong image.
    pca_df = pd.DataFrame({
        'pca-one': pca_result[:, 0],
        'pca-two': pca_result[:, 1],
        'pca-three': pca_result[:, 2],
        'pca-four': pca_result[:, 3],
        'image_name': df.image_name.values,
        'rating': df.rating.values,
        'features': df.features.values,
        'tsne-2d-one': tsne_results[:, 0],
        'tsne-2d-two': tsne_results[:, 1],
    })
    return pca_df

In [40]:
def get_feature_similarity(df, df2):
    """
    Computes the pairwise cosine similarities between the feature vectors of two dataframes.

    Both df and df2 must have a features column that features_to_array can convert.
    Prints the shape of the resulting similarity matrix and returns the matrix: its rows follow
    the feature array built from df and its columns follow the feature array built from df2.
    """
    # df2 is converted first, then df; the comparison itself is (df vectors) x (df2 vectors).
    other_vectors = features_to_array(pd.DataFrame(df2.features))
    own_vectors = features_to_array(pd.DataFrame(df.features))

    pairwise = cosine_similarity(own_vectors, other_vectors)
    print(pairwise.shape)
    return pairwise


In [112]:
def get_images_for_feature(df, feature, direction, n=10):
    """
    Displays, as a two-column image grid, the images with the most extreme values of a column.

    Given dataframe with a url column (location of each image, read with skimage.io.imread),
    the name of a column in the dataframe to sort by, and a direction that is passed as
    `ascending` to sort_values (True shows the lowest values, False shows the highest).
    At most n images are shown; fewer if the dataframe has fewer rows.

    The figure is rendered with plt.show(); nothing is returned. If no image is selected
    (empty dataframe or n == 0) the function returns without drawing anything.
    """
    image_urls = df.sort_values(by=feature, ascending=direction)[:n].url
    count = len(image_urls)
    if count == 0:
        # Nothing to draw; a zero-row subplot grid would raise.
        return

    columns = 2
    # Round up so an odd number of images still fits: with int(count / 2) rows the last
    # image of an odd-sized selection has no slot and add_subplot raises.
    rows = (count + columns - 1) // columns

    # Six inches of height per grid row, matching the previous n * 3 for even n.
    fig = plt.figure(figsize=(20, rows * 6))
    for position, image_url in enumerate(image_urls, start=1):
        ax = fig.add_subplot(rows, columns, position)
        ax.set_title(image_url)
        ax.imshow(skio.imread(image_url))
    plt.show()

    

In [75]:
def _read_image_bytes(location):
    """Returns the raw bytes stored at location, which is either a URL (contains '://') or a local file path."""
    if '://' in location:
        with urllib.request.urlopen(location) as response:
            return response.read()
    with open(location, 'rb') as handle:
        return handle.read()


def extract_image_features(image_csv_path, output_name, num_output_labels, model_name, url_path, quiet=False):
    """
    Extracts convolutional image features given a csv of images [image_csv_path], trained resnet model [model_name],
    and number of output labels, as well as the url path (or local directory) for images.

    The csv must contain an image_name column (a KeyError is raised otherwise). Each image is read
    from url_path + image_name -- plain string concatenation, so url_path must already end with the
    separator. The image is passed to Img2Vec.get_vec and Img2Vec.predict_image, and both results
    are stored as JSON strings in the new columns features and pred_resnet.

    If model_name is falsy, Img2Vec is constructed without a model argument.
    Unless quiet is True, the name of every image is printed as it is processed.

    Images that cannot be read or processed do not abort the run: the error is printed and that
    row keeps its original columns with features and pred_resnet left empty (None/NaN), so such
    rows can be removed afterwards with dropna().

    Returns the resulting dataframe and writes it to [output_name] as csv (without the index).
    """
    print(model_name)
    if model_name:
        img_2_vec = img_to_vec.Img2Vec(model=model_name, num_output_labels=num_output_labels)
    else:
        img_2_vec = img_to_vec.Img2Vec(num_output_labels=num_output_labels)
    # csv file image_name, trueskill_score
    data = pd.read_csv(image_csv_path)
    print(data.columns)

    print('extracting features for {} ({} images) using {}'.format(image_csv_path, len(data), model_name))

    # An explicit loop (instead of DataFrame.apply with a row-mutating callback) processes every
    # image exactly once and keeps the output shape independent of which rows fail.
    predictions = []
    features = []
    for image_name in data['image_name']:
        if not quiet:
            print(' EXTRACTING IMAGE FEATURES:', image_name)
        try:
            img = Image.open(io.BytesIO(_read_image_bytes(url_path + image_name)))
            vec = img_2_vec.get_vec(img)
            pred = img_2_vec.predict_image(img)
            features_json = json.dumps(vec.tolist())
            pred_json = json.dumps(pred.tolist())
        except Exception as e:
            # Best effort: report the failure and keep going with the remaining images.
            print('failed to extract features for {}: {}'.format(image_name, e))
            features_json = None
            pred_json = None
        features.append(features_json)
        predictions.append(pred_json)

    data['pred_resnet'] = predictions
    data['features'] = features
    data.to_csv(output_name, index=False)
    return data

In [65]:
image_url_path = 'WHERE IMAGES ARE STORED'
# This should be a URL prefix that points to where images are saved (a local directory can be
# given as a file:// URL). extract_image_features concatenates it directly with each image_name,
# without inserting a separator, so end it with '/' unless the image names already start with one.


Generate PCA components for images with discrepancies:

In [None]:
# Extract features for the images with discrepancies and write them to the output csv.
discrepancies_features = extract_image_features(
    image_csv_path='PATH TO CSV WITH IMAGES WITH DISCREPANCIES',
    output_name='PATH TO OUTPUT CSV NAME',
    num_output_labels=2,
    model_name='PATH TO TRAINED RESNET MODEL',
    url_path=image_url_path,
)

In [None]:
# Extract features for the full set of images and write them to the output csv.
all_features = extract_image_features(
    image_csv_path='PATH TO CSV WITH FULL SET OF IMAGES',
    output_name='PATH TO OUTPUT CSV NAME',
    num_output_labels=2,
    model_name='PATH TO TRAINED RESNET MODEL',
    url_path=image_url_path,
)

In [None]:
# NOTE(review): get_components calls features_to_array, which is not defined in the visible cells;
# presumably this wildcard import (or `from test_model import *` in the first cell) provides it --
# confirm and replace it with an explicit import.
from svm_classifier import *
# PCA and TSNE are already imported in the first cell; these two lines repeat those imports.
from sklearn.decomposition import PCA
from sklearn.manifold import TSNE

# Rows with missing values are dropped from the extracted features before the components are
# computed, and dropped once more from the resulting frame.
pca_df = get_components(discrepancies_features.dropna()).dropna()

Compute the cosine similarity between the images with discrepancies and all images:

In [None]:
# Load the two feature tables from csv (presumably the files written by extract_image_features
# in the cells above -- confirm the paths). Both need a `features` column for the similarity step.
dis_df = pd.read_csv('PATH TO DISCREPANCIES FEATURES')

full_df = pd.read_csv('PATH TO ALL IMAGES FEATURES')

In [None]:
# dis_df is passed as `df` and full_df as `df2`, so the call evaluates
# cosine_similarity(<dis_df feature array>, <full_df feature array>).
similarity = get_feature_similarity(dis_df,full_df)