# Downloading files from Drive


In [0]:
#import statements
import cv2 #opencv
from google.colab import files #Not sure what this does, copied this stuff
import urllib.request #Downloads images from urls.
import os #for chdir and such
import pandas as pd #reading csv data and manipulating it.

In [0]:
#Clear out all filler data populated by colab upon creating environment
#WARNING: this recursively deletes everything matched by /content/*, so running
#it again after the download cells removes the downloaded files as well.
!rm -r /content/*
#Show the working directory and its (now empty) listing to confirm the cleanup.
!pwd
!ls
#Output should display "/content" as pwd and that's it, since nothing in ls.

/content


In [0]:
#Allowing drive permissions
#Installs/upgrades PyDrive (-U) with quiet output (-q) before it is imported below.
!pip install -U -q PyDrive
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

# Authenticate and create the PyDrive client.
# NOTE(review): authenticate_user() presumably triggers Colab's interactive
# sign-in prompt - confirm against the google.colab docs.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# `drive` is the client object the download and upload cells call CreateFile on.
drive = GoogleDrive(gauth)

In [0]:
#Downloading the stuff
#CHECK THE LINK IS ACCURATE AND POINTS TO THE DESIRED FILES
def drive_file_id(link):
    """Return the value of the ``id`` query parameter of a Drive share link.

    Parsing the query string (instead of ``link.split('=')``) keeps this
    working when the link carries extra parameters, e.g.
    ``https://drive.google.com/open?id=<ID>&usp=sharing``.

    Args:
        link: share URL of the form ``...?id=<ID>``.

    Returns:
        The file id as a string.

    Raises:
        ValueError: if the link has no non-empty ``id`` parameter.
    """
    from urllib.parse import parse_qs, urlparse

    ids = parse_qs(urlparse(link).query).get('id')
    if not ids:
        raise ValueError("No 'id' parameter found in Drive link: " + link)
    return ids[0]


urls_link = "https://drive.google.com/open?id=1FrWI-OzdwrEFUxzmMPl12Hp6XfU6iRo9" #URL file.  Maps name to url
urls_id = drive_file_id(urls_link)

data_link = "https://drive.google.com/open?id=1XZr2j9WwiLll3Ww4VrhBYNez-dn_ecuE" #image data file.
data_id = drive_file_id(data_link)

labels_link = "https://drive.google.com/open?id=1uHQi7XvLXNX7WLNhQ3j3WGhYxzX7zefo" #Gives us key of "LabelName" to actual object name. (i.e. "/m/01g317" -> "Human person" or whatever it is.)
labels_id = drive_file_id(labels_link)

pos_zip_link = 'https://drive.google.com/open?id=1Sty72byXuXDhTAo4wki41bUmSho9dzJV'
pos_zip_id = drive_file_id(pos_zip_link)

pos_txt_link = 'https://drive.google.com/open?id=1l5E4tFiOQY7LBLmRuf2T5Nwqt50zPupF'
pos_txt_id = drive_file_id(pos_txt_link)

print(urls_id+"\n"+data_id+"\n"+labels_id+"\n"+pos_zip_id+"\n"+pos_txt_id)
#Verify the IDs match the bit after = in the above urls.

1FrWI-OzdwrEFUxzmMPl12Hp6XfU6iRo9
1XZr2j9WwiLll3Ww4VrhBYNez-dn_ecuE
1uHQi7XvLXNX7WLNhQ3j3WGhYxzX7zefo
1Sty72byXuXDhTAo4wki41bUmSho9dzJV
1l5E4tFiOQY7LBLmRuf2T5Nwqt50zPupF


In [0]:
#Actually download the file
print('Donloading URLs')
downloaded = drive.CreateFile({'id':urls_id}) 
downloaded.GetContentFile('urls.csv')

print('Downloading Data')
downloaded = drive.CreateFile({'id':data_id}) 
downloaded.GetContentFile('data.csv')

print('Downloading Labels')
downloaded = drive.CreateFile({'id':labels_id}) 
downloaded.GetContentFile('labels.csv')

print('Downloading pos images')
downloaded = drive.CreateFile({'id':pos_zip_id}) 
downloaded.GetContentFile('pos.zip')

print('Downloading pos txt')
downloaded = drive.CreateFile({'id':pos_txt_id}) 
downloaded.GetContentFile('pos.txt')

print('Downloads done!\n')
#The only files you sould see are "urls.csv", "data.csv", and "labels.csv".  The
#last one has something to do with the import procedure. #There may be "adc.json", which is irrelevant
!ls

Donloading URLs
Downloading Data
Downloading Labels
Downloads done!

adc.json  data.csv  labels.csv	urls.csv


In [0]:
#Unzip zipped files.
!mkdir pos
!unzip pos.zip

# Setting up Data
We downloaded three CSV files above (along with the pre-built `pos.zip` and `pos.txt`).  
`urls.csv` maps each image name to the URL from which the image can be downloaded.  
`data.csv` maps the image name to the data about each image.  
`labels.csv` lets us translate each label from its machine-generated ID to an English name.


In [0]:
#Load the three downloaded CSVs into dataframes
os.chdir("/content")

print("reading urls")
urls = pd.read_csv("urls.csv")

print("reading data")
data = pd.read_csv("data.csv")

print("reading labels")
#Column names are supplied explicitly: Label is the auto-gen id, Name is english.
label_columns = ['Label', 'Name']
labels = pd.read_csv("labels.csv", names=label_columns)

#Preview the first rows of each frame
for frame in (urls, data, labels):
    print(frame.head())

In [0]:
#Look up the auto-generated label whose English name is "Mobile phone"
phone_name = 'Mobile phone'
name_matches = labels.loc[labels['Name'] == phone_name]
phone_label = name_matches.iloc[0, 0]   #first matching row, first column
#The printed value should be "/m/050k8".
print(phone_label)

/m/050k8


In [0]:
#Keep only the rows of data whose LabelName is the phone label
is_phone = data['LabelName'] == phone_label
phone_data = data.loc[is_phone]

#Preview the selection and report its (rows, columns) size
print(phone_data.head())
print(phone_data.shape)

               ImageID  Source LabelName  ...  IsGroupOf  IsDepiction  IsInside
1149  0000b9115cdf1e54  xclick  /m/050k8  ...          0            0         0
2487  00016982f6086d39  xclick  /m/050k8  ...          0            0         0
3963  0002a1a755d730ca  xclick  /m/050k8  ...          0            0         0
5408  000384bb6da4764b  xclick  /m/050k8  ...          0            0         0
8040  0005829bfcf77ca3  xclick  /m/050k8  ...          0            0         0

[5 rows x 13 columns]
(6365, 13)


# Preparing Training Data Files
We need a negative/background text file that lists the file paths of negative images, one per line. There should be about half as many negative images as positive images.  
We need a positive text file with one line per image of the form `image_filepath num_objects x1 y1 x2 y2`.


In [0]:
#Setting up directories for positive/negative files.
#This cell only switches the working directory to /content; the pos/ and neg/
#directories themselves are created by the cells below.
os.chdir("/content")

rm: cannot remove 'neg': No such file or directory
rm: cannot remove 'pos': No such file or directory
rm: cannot remove 'neg.txt': No such file or directory


In [0]:
#@MATT DO NOT RUN THIS BLOCK.  WE HAVE IMPORTED POS.TXT AND POS.ZIP ABOVE
#DO NOT RUN THIS IF POS.TXT AND POS.ZIP WERE IMPORTED ABOVE
%%time
#Download positive images from urls and save them locally, grayscaled and scaled down.
#NOTE this imports a few images per second, and imports a total of 6k images.
#Thus, it will take 30-60 mins, maybe even longer.
#TODO: Refactor code, make phone be 100x100 instead of the image.  Requires slightly more complicated math.
!rm pos.txt
!rm -r pos
!mkdir pos
pos_file = open('pos.txt','w')
base_url = "https://requestor-proxy.figure-eight.com/figure_eight_datasets/open-images/train/"
height = 100  #Size of resized images.  Change if training time/acc suffers.
width = 100
numpics = 0
for index, row in phone_data.iterrows():
    try:
        id = row['ImageID']
        url = base_url + id + ".jpg"
        path = "pos/"+ id +".jpg"
        print(numpics,id)
        #Avoid downloading pic if it's already here.
        #File doesn't exist yet, download it.
        if not os.path.exists(path):
            urllib.request.urlretrieve(url, path)   #Downloads img from url to path
            img = cv2.imread(path,cv2.IMREAD_GRAYSCALE) #Grayscales image
            resized_image = cv2.resize(img, (height, width))  #resizes image.  
            cv2.imwrite(path,resized_image)

        pos_file.write(path + " 1 " + str(row['XMin']*width//1) + " " +   #Record the data to pos.txt
                       str(row["YMin"]*height//1) + " " + str(row["XMax"]*width//1) + " "  
                       + str(row["YMax"]*height//1) + "\n")
        numpics += 1
        
    except Exception as e:
        print(str(e))
pos_file.close()
#print(phone_data.head())

In [0]:
#@MATT RUN THIS BLOCK PLEASE.  IT WILL PROB TAKE AN HOUR BUT IT NEEDS TO BE DONE.
#Negative Data Generation.
#DO NOT RUN THIS IF NEG.TXT AND NEG.ZIP WERE IMPORTED ABOVE
!rm -r neg
!mkdir neg 
!rm neg.txt
#Get list of all ids of imgs with phones in them.
pos_set = set()
for filename in os.listdir('pos'):
    id, fluff = filename.split('.')
    pos_set.add(id)
#Find num pos examples in pos.txt
num_pos = 0
with open('pos.txt','r') as f:
    for line in f:
        num_pos += 1
print(num_pos)
#Gather num_pos/2 negative images that are not in pos_set
num_neg = 0
neg_file = open('neg.txt','w')
for index, row in urls.iterrows():
    id,fluff = row['image_name'].split('.')
    if id in pos_set:
        continue

    #image is not in positive set
    try:
        url = base_url + id + ".jpg"
        path = "neg/"+ id +".jpg"
        print(num_neg,id)
        #Avoid downloading pic if it's already here.
        #File is already downloaded, skip it.
        if os.path.exists(path):
            continue

        urllib.request.urlretrieve(url, path)   #Downloads img from url to path
        img = cv2.imread(path,cv2.IMREAD_GRAYSCALE) #Grayscales image
        resized_image = cv2.resize(img, (height, width))  #resizes image.  
        cv2.imwrite(path,resized_image)

        neg_file.write(path + "\n")
        num_neg += 1
        if num_neg >= num_pos/2:
            break
        
    except Exception as e:
        print(str(e))

neg_file.close()
#Sanity checks
print('pos examples:',num_pos)
print('neg examples:',len(os.listdir('neg')))
numnegim = 0
with open('neg.txt','r') as f:
    for line in f:
        numnegim += 1
print(numnegim)

6636 e43f0103723b20e4
6637 e6c095c72d9c1a56
6638 e901a331a70f0020
6639 ffb7c04d8c9b63f5
6640 ecee7c2b0564c8d9
6641 ef53c4f0d56c57d4
6642 e658cfe1b462af26


KeyboardInterrupt: ignored

In [0]:
#This block is a sanity check.  Please check it prints a number roughly 3000-3500.
#numnegim is the number of lines counted in neg.txt by the negative-generation cell.
print(numnegim)

Index(['image_name', 'image_url'], dtype='object')


In [0]:
#Zip and save the work because this took an hour and we're not doing this for every test.
#Only redo if we're changing something about the positive values.
!zip neg.zip neg/*
#Upload pos data to drive
upload = drive.CreateFile({'title': 'neg.zip'})
upload.SetContentFile('neg.zip')
upload.Upload()

upload = drive.CreateFile({'title': 'neg.txt'})
upload.SetContentFile('neg.txt')
upload.Upload()