<a href="https://colab.research.google.com/github/EleonoraBartolomucci/Fairness/blob/master/support_functions.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [65]:
!pip install clarifai



In [66]:
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials
from clarifai.rest import ClarifaiApp
from clarifai.rest import ApiError
import numpy as np
import time
import json
import csv
import random
import pandas as pd
import requests
from lxml import html
import shutil
import os
import json


# AUTHENTICATE IN GOOGLE DRIVE
def authenticate():
  """Authenticate the Colab user and build a PyDrive client.

  Returns:
    A GoogleDrive instance whose credentials are the application-default
    Google credentials.
  """
  auth.authenticate_user()
  google_auth = GoogleAuth()
  google_auth.credentials = GoogleCredentials.get_application_default()
  return GoogleDrive(google_auth)


# Module-level Drive client used by the cells below.
drive = authenticate()


def create_folder_in_drive(gdrive, folder_name, parent_folder_id):
  """Create a folder in Google Drive under the given parent folder.

  Args:
    gdrive: Drive client exposing CreateFile().
    folder_name: Title of the folder to create.
    parent_folder_id: Drive id of the parent folder.

  Returns:
    The Drive id of the newly created folder.
  """
  parent_link = {"kind": "drive#fileLink", "id": parent_folder_id}
  new_folder = gdrive.CreateFile({
      'title': folder_name,
      'mimeType': 'application/vnd.google-apps.folder',
      'parents': [parent_link],
  })
  new_folder.Upload()
  print(new_folder)
  # Report the title and id of the created folder.
  print('title: %s, id: %s' % (new_folder['title'], new_folder['id']))
  return new_folder['id']


def drop_unnamed(df):
  """Return `df` restricted to the columns whose name does not start with 'unnamed'.

  The comparison is case-insensitive, so the 'Unnamed: 0' column that appears
  when a CSV saved with its index is read back is removed.
  """
  kept_columns = []
  for column in df.columns:
    if not column.lower().startswith('unnamed'):
      kept_columns.append(column)
  return df[kept_columns]


def upload_file(filename, folder_id):
  """Upload a local file to a Drive folder, overwriting a same-titled file.

  Args:
    filename: Path of the local file; also used as the title in Drive.
    folder_id: Drive id of the destination folder.

  Returns:
    The Drive id of the uploaded file.
  """
  client = authenticate()
  listing_query = {'q': "'" + folder_id + "' in parents and trashed=false"}
  folder_entries = client.ListFile(listing_query).GetList()
  # Default target: a brand-new file inside the folder.
  target = client.CreateFile({'title': filename, 'parents': [{'id': folder_id}]})
  for entry in folder_entries:
      if entry['title'] != filename:
          continue
      # A file with this title already exists: bind to its id so the upload
      # overwrites it instead of creating a duplicate.
      existing_id = entry['id']
      target = client.CreateFile({'id': existing_id, 'title': filename, 'parents': [{'id': folder_id}]})
  target.SetContentFile(filename)  # local path providing the content
  target.Upload()
  return target['id']


def downloadUser(business_id, user_id, photo_folder, counter):
    """Download a Yelp user's profile picture and store it in a Drive folder.

    The picture is saved in Drive as '<user_id>.jpg'. If a file with that
    title is already present in the folder, nothing is downloaded.

    Args:
      business_id: Not used by this function; kept for call compatibility.
      user_id: Yelp user id, used for the profile URL and the file name.
      photo_folder: Drive id of the destination folder.
      counter: Not used by this function; kept for call compatibility.

    Returns:
      True if the picture is already in Drive or was uploaded; False if the
      profile page or the picture could not be fetched, or no picture was
      found on the page.
    """
    # Use the freshly authenticated client (as upload_file does) instead of
    # discarding it and relying on a module-level one.
    drive = authenticate()

    filename = user_id + '.jpg'
    url = 'https://www.yelp.com/user_details?userid=' + user_id
    folder_id = photo_folder

    # Skip the download when the picture is already in the folder.
    file_list = drive.ListFile({'q': "'" + folder_id + "' in parents and trashed=false"}).GetList()
    if any(entry['title'] == filename for entry in file_list):
        return True

    # Find the user picture on the profile page.
    page_content = requests.get(url)
    print(page_content)
    if not page_content.ok:
        return False
    tree = html.fromstring(page_content.content)
    image_sources = tree.xpath('//*[@id="wrap"]/div[2]/div[1]/div/div[2]/div[1]/div/div/div/a/img/@src')
    if not image_sources:
        print('PICTURE NOT FOUND')
        print(user_id)
        return False
    image_url = image_sources[0]

    response = requests.get(image_url, stream=True)
    if not response.ok:
        # Do not upload an error body as if it were a picture.
        response.close()
        return False

    drive_file = drive.CreateFile({'title': filename, 'parents': [{'id': folder_id}]})
    try:
        # Create a local copy of the picture, then upload it to Drive.
        with open(filename, 'wb') as out_file:
            shutil.copyfileobj(response.raw, out_file)
        drive_file.SetContentFile(filename)
        drive_file.Upload()
    finally:
        # Release the connection and remove the local copy even if the
        # download or the upload raised.
        response.close()
        if os.path.exists(filename):
            os.remove(filename)
    return True

In [None]:
################################################################################
######################### DOWNLOAD RANKINGS ####################################

def get_ranking_from_call(url_business, lang, sort, query):
    """Fetch every review of a Yelp business through its review_feed endpoint.

    Pages are requested with an increasing `start` offset until a page with
    no reviews is returned.

    Args:
      url_business: Base URL of the business page.
      lang: Value of the `rl` query parameter.
      sort: Value of the `sort_by` query parameter.
      query: Value of the `q` query parameter.

    Returns:
      DataFrame with columns position, user_id, user, comment, feedback, date.
      `position` is the 1-based order in which reviews were returned and
      `date` is `localizedDate` parsed as month/day/year.
    """
    # The notebook never imports datetime at file level, so import it here;
    # without it the strptime call below raises NameError.
    import datetime

    # NOTE(review): the Cookie header is a captured browser session value
    # committed in source — confirm it is still needed/valid.
    headers = [{"name": "Accept", "value": "*/*"}, {"name": "Accept-Encoding", "value": "gzip, deflate, br"},
               {"name": "Accept-Language", "value": "it-IT,it;q=0.8,en-US;q=0.5,en;q=0.3"},
               {"name": "Connection", "value": "keep-alive"},
               {"name": "Content-Type", "value": "application/x-www-form-urlencoded; charset=utf-8"}, {"name": "Cookie",
                                                                                                       "value": "qntcst=D; hl=en_US; wdi=1|3C26116D69138F61|0x1.78d019f71a444p+30|a7756ff94751d3a9; _ga=GA1.2.3C26116D69138F61; location=%7B%22city%22%3A+%22New+York%22%2C+%22state%22%3A+%22NY%22%2C+%22country%22%3A+%22US%22%2C+%22latitude%22%3A+40.713%2C+%22longitude%22%3A+-74.0072%2C+%22max_latitude%22%3A+40.8523%2C+%22min_latitude%22%3A+40.5597%2C+%22max_longitude%22%3A+-73.7938%2C+%22min_longitude%22%3A+-74.1948%2C+%22zip%22%3A+%22%22%2C+%22address1%22%3A+%22%22%2C+%22address2%22%3A+%22%22%2C+%22address3%22%3A+null%2C+%22neighborhood%22%3A+null%2C+%22borough%22%3A+null%2C+%22provenance%22%3A+%22YELP_GEOCODING_ENGINE%22%2C+%22display%22%3A+%22New+York%2C+NY%22%2C+%22unformatted%22%3A+%22New+York%2C+NY%2C+US%22%2C+%22accuracy%22%3A+4.0%2C+%22language%22%3A+null%7D; xcj=1|Ptt9P03gfc75x_PBT9zmqCkUuSuyB7PR-wWUBvABNi4; __qca=P0-60561249-1581956668708; G_ENABLED_IDPS=google; __cfduid=db8764ff59d8028a6c2e1b214867927d81583160194; _gid=GA1.2.2014867238.1583835527; bse=05dcd9d5de304ef0b1d9a76fa768b10f; sc=8a1ca0dbc2; pid=505721aa4569e7bb"},
               {"name": "Host", "value": "www.yelp.com"},
               {"name": "Referer", "value": "https://www.yelp.com/biz/noche-de-margaritas-new-york"},
               {"name": "TE", "value": "Trailers"}, {"name": "User-Agent",
                                                     "value": "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:73.0) Gecko/20100101 Firefox/73.0"},
               {"name": "X-Requested-By-React", "value": "true"},
               {"name": "X-Requested-With", "value": "XMLHttpRequest"}]
    # Flatten the name/value records into the mapping requests expects.
    headers_ok = {header['name']: header['value'] for header in headers}

    reviews_list = []
    position = 1
    url = url_business + "/review_feed?rl=" + lang + "&sort_by=" + sort + "&q=" + query

    offset = 0
    while True:
        # The first page is requested with an empty `start` value.
        start_value = '' if offset == 0 else str(offset)
        page_load = requests.get(url + '&start=' + start_value, headers=headers_ok)
        print(page_load)
        # The offset advances by 20 per request — presumably the endpoint's
        # page size; verify against the API.
        offset += 20
        reviews = page_load.json()['reviews']
        if not reviews:
            break
        for review in reviews:
            reviews_list.append((position, review['userId'], review['user'],
                                 review['comment'], review['feedback'],
                                 datetime.datetime.strptime(review['localizedDate'], '%m/%d/%Y')))
            position += 1
    df_reviews = pd.DataFrame(reviews_list, columns=["position", "user_id", "user",
                                                     "comment", "feedback", "date"])
    return df_reviews


def retrieve_rankings(business_id, folder_id):
    """Build the relevance, date and random rankings of a business and upload them.

    The relevance ranking is what get_ranking_from_call returns for
    `relevance_desc`; the date ranking is the same rows sorted by `date`; the
    random ranking is the same rows shuffled. Each one is written to
    '<kind>_ranking_<business_id>.csv' and uploaded to `folder_id`.
    """
    relevance = get_ranking_from_call("https://www.yelp.com/biz/" + business_id, "en", "relevance_desc", "")

    # Re-number positions after sorting by date.
    by_date = relevance.sort_values(by=['date']).reset_index(drop=True)
    by_date['position'] = by_date.index + 1

    # Re-number positions after shuffling all rows.
    shuffled = relevance.sample(frac=1).reset_index(drop=True)
    shuffled['position'] = shuffled.index + 1

    # (csv name, frame) pairs, in the order they are written and uploaded.
    outputs = [
        ("rel_ranking_" + business_id + ".csv", drop_unnamed(relevance)),
        ("date_ranking_" + business_id + ".csv", drop_unnamed(by_date)),
        ("rand_ranking_" + business_id + ".csv", drop_unnamed(shuffled)),
    ]

    for csv_name, frame in outputs:
        frame.to_csv(csv_name)

    #destination = set_file_destination('', 'ranking', business_id)
    for csv_name, _ in outputs:
        upload_file(csv_name, folder_id)


def create_folder_in_drive(gdrive, folder_name, parent_folder_id):
  """Create a Drive folder named `folder_name` inside `parent_folder_id`.

  Prints the created folder object and its title/id.

  Returns:
    The Drive id of the new folder.
  """
  metadata = dict(
      title=folder_name,
      mimeType='application/vnd.google-apps.folder',
      parents=[{"kind": "drive#fileLink", "id": parent_folder_id}],
  )
  created = gdrive.CreateFile(metadata)
  created.Upload()
  print(created)
  # Report the folder's title and id.
  summary = 'title: %s, id: %s' % (created['title'], created['id'])
  print(summary)
  return created['id']


# For every restaurant from number `numrest` (1-based position in
# restaurants_id_list) onward: create its Drive folder tree, record the folder
# ids in gtree.csv, and download its rankings.
# NOTE(review): restaurants_id_list is not defined above this cell; it is
# assigned in later cells, so one of those must have been run first.
numrest = 87
id_lista = restaurants_id_list[numrest-1:]
for id in id_lista:

  # Re-read gtree.csv on every iteration; it is re-uploaded at the end of each.
  download = drive.CreateFile({'id': '1MIWS9uI9GKxsgAlzB0wXFBiutE025u2c'}) # id file gtree.csv
  download.GetContentFile('gtree.csv')
  gtree = pd.read_csv('gtree.csv')

  gfolder_id = create_folder_in_drive(drive, str(numrest) + ' - ' + id, '1p9X-C0p1191dYlRnrGUZZUkTXLymJ49_')
  gfolder_rankings = create_folder_in_drive(drive, '1_rankings', gfolder_id)
  gfolder_groups = create_folder_in_drive(drive, '2_groups_exposure', gfolder_id)
  new_row = {'business_id':id, 'gfolder_id':gfolder_id, 'gfolder_rankings':gfolder_rankings,
                              'gfolder_groups':gfolder_groups, 'gfolder_#11':'', 'gfolder_#12':'',
                              'gfolder_#14':'', 'gfolder_userphoto':'', 'gfolder_clarifai':''}
  # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
  # way to add the row.
  gtree = pd.concat([gtree, pd.DataFrame([new_row])], ignore_index=True)
  # If the business already had a row, keep only the one just added.
  gtree = gtree.drop_duplicates('business_id',keep='last').reset_index(drop=True)
  gtree = drop_unnamed(gtree)
  gtree.to_csv('gtree.csv')
  upload_file('gtree.csv', '1eUt2wyCOULW0-LdL6vyRJi1mUs_U46DD') # id folder data
  retrieve_rankings(id, gfolder_rankings)
  numrest += 1
  print(id)

In [None]:
################################################################################
###################### DOWNLOAD USER PHOTOS ####################################


def scarica_tutti_utenti(business_id, df_utenti, photo_folder_id, startfrom):
  """Download the profile picture of every user in `df_utenti`, from row `startfrom` on.

  Args:
    business_id: Passed through to downloadUser.
    df_utenti: DataFrame with a 'user_id' column.
    photo_folder_id: Drive id of the folder receiving the pictures.
    startfrom: Number of leading rows to skip; also the first counter value printed.

  Returns:
    True when every download succeeded (including when there was nothing to
    download); False as soon as one download fails.
  """
  df_utenti = df_utenti[startfrom:]
  cnt = startfrom
  for _, user in df_utenti.iterrows():
    user_id = user['user_id']
    completed = downloadUser(business_id, user_id, photo_folder_id, cnt)
    if not completed:
      print('ULTIMO SCARICATO:', cnt)
      return False
    print(cnt)
    cnt = cnt + 1
  # Report success explicitly: callers test the result with `if not success`,
  # so falling off the end (None) would be read as a failure.
  return True


# Download restaurants.txt from Drive; each stripped line is used as a business id.
restaurants_dataset_id = '1BqMsph8flQMGmcE_WxYV0vSC_uqi-nge'  # Drive file id, taken from the file's sharing link
download = drive.CreateFile({'id': restaurants_dataset_id})
download.GetContentFile('restaurants.txt')
with open('restaurants.txt') as f:
    content = f.readlines()
restaurants_id_list = [x.strip() for x in content] 

print(restaurants_id_list)


# Restaurant to process (1-based position in restaurants_id_list) and the
# ranking row from which the photo download starts.
numrest = 18
id_lista = restaurants_id_list[(numrest-1):(numrest)]
starting_from_photo = 2299


# gtree.csv holds, per business_id, the ids of its Drive folders/files.
download = drive.CreateFile({'id': '1MIWS9uI9GKxsgAlzB0wXFBiutE025u2c'}) # id file gtree.csv
download.GetContentFile('gtree.csv')
gtree = pd.read_csv('gtree.csv')

for id in id_lista:
  print(id)
  user_ranking_folder = gtree.loc[gtree['business_id']==id, 'gfolder_rankings'].tolist()[0]
  # Uses the first file returned for the rankings folder; which ranking file
  # that is depends on the order of the Drive listing.
  ranking_file = drive.ListFile({'q': "'" + user_ranking_folder + "' in parents and trashed=false"}).GetList()[0]  
  download = drive.CreateFile({'id': ranking_file['id']})
  download.GetContentFile('ranking_' + id + '.csv')
  ranking = pd.read_csv('ranking_' + id + '.csv')
  if starting_from_photo == 0:
    # Starting from scratch: create the photo folder and record its id in gtree.
    id_cartella = create_folder_in_drive(drive, str(numrest) + ' - ' + id,
                                         '12OKc1s2VeJRv5kUS_fp9f4dFLn1LkSPG')  # folder face_data>Foto_User
    gtree.loc[gtree['business_id']==id, 'gfolder_userphoto'] = id_cartella
    gtree = gtree.drop_duplicates('business_id',keep='last').reset_index(drop=True)
    gtree = drop_unnamed(gtree)
    gtree.to_csv('gtree.csv')
    upload_file('gtree.csv', '1eUt2wyCOULW0-LdL6vyRJi1mUs_U46DD') # id folder data
  else:
    # Resuming: reuse the photo folder id already stored in gtree.
    id_cartella = gtree.loc[gtree['business_id']==id, 'gfolder_userphoto'].tolist()[0]
  
  success = scarica_tutti_utenti(id, ranking[['user_id']], id_cartella, starting_from_photo)
  if not success:
    # Stop at the first restaurant whose download did not report success.
    print('RISTORANTE:',id)
    print(numrest)
    break
  else:
    print(numrest)
    print('FINE DOWNLOAD RISTORANTE ' + id)
    numrest += 1


In [67]:
################################################################################
################## DOWNLOAD MISSING USER PHOTO #################################

import json
import csv
import random
import pandas as pd

def scarica_utenti_mancanti(business_id, df_utenti, photo_folder_id):
  """Download the profile picture of each user id in `df_utenti`.

  Args:
    business_id: Passed through to downloadUser.
    df_utenti: Iterable of user ids still missing a picture.
    photo_folder_id: Drive id of the folder receiving the pictures.

  Returns:
    False as soon as one download fails (after printing the user id),
    True when all of them succeed.
  """
  for position, missing_user in enumerate(df_utenti):
    if downloadUser(business_id, missing_user, photo_folder_id, position):
      print(position)
      continue
    print('ULTIMO NON SCARICATO:', missing_user)
    return False
  return True


# Download restaurants.txt from Drive; each stripped line is used as a business id.
restaurants_dataset_id = '1BqMsph8flQMGmcE_WxYV0vSC_uqi-nge'  # Drive file id, taken from the file's sharing link
download = drive.CreateFile({'id': restaurants_dataset_id})
download.GetContentFile('restaurants.txt')
with open('restaurants.txt') as f:
    content = f.readlines()
restaurants_id_list = [x.strip() for x in content] 

print(restaurants_id_list)

# Process restaurants numbered numrest..74 (1-based positions in restaurants_id_list).
numrest = 74
while numrest<75:

  id = restaurants_id_list[(numrest-1):(numrest)][0]
  # gtree.csv holds, per business_id, the ids of its Drive folders/files.
  download = drive.CreateFile({'id': '1MIWS9uI9GKxsgAlzB0wXFBiutE025u2c'}) # id file gtree.csv
  download.GetContentFile('gtree.csv')
  gtree = pd.read_csv('gtree.csv')

  print(id)

  # User ids that already have a photo: the first 22 characters of each file
  # title in the photo folder (titles are written as '<user_id>.jpg').
  user_photo_folder = gtree.loc[gtree['business_id']==id, 'gfolder_userphoto'].tolist()[0]
  photo_list = drive.ListFile({'q': "'" + user_photo_folder + "' in parents and trashed=false"}).GetList() 
  user_id_list2 = [photo['title'][0:22] for photo in photo_list]
  print(user_id_list2)

  # User ids in the ranking, read from the first file listed in the rankings folder.
  user_ranking_folder = gtree.loc[gtree['business_id']==id, 'gfolder_rankings'].tolist()[0]
  ranking_file = drive.ListFile({'q': "'" + user_ranking_folder + "' in parents and trashed=false"}).GetList()[0]  
  download = drive.CreateFile({'id': ranking_file['id']})
  download.GetContentFile('ranking_' + id + '.csv')
  ranking = pd.read_csv('ranking_' + id + '.csv')
  user_id_list = ranking['user_id'].values.tolist()
  print(user_id_list)

  # Ranked users without a photo in Drive.
  difference = (list(set(user_id_list) - set(user_id_list2)))

  print('Utenti mancanti:', len(difference))
  print(difference)
  if difference:
    success = scarica_utenti_mancanti(id, difference, user_photo_folder)
  else:
    # Nothing is missing for this restaurant.
    success = True
  if not success:
    # Stop at the first restaurant with a failed download.
    print('ERRORE RISTORANTE:',id)
    print(numrest)
    break
  else:
    print(numrest)
    print('FINE DOWNLOAD RISTORANTE ' + id)
    numrest += 1


['WbJ1LRQdOuYYlRLyTkuuxw', 'T2tEMLpTeSMxLKpxwFdS3g', 'ALwAlxItASeEs2vYAeLXHA', 'OVTZNSkSfbl3gVB9XQIJfw', 'Sovgwq-E-n6wLqNh3X_rXg', 'j5nPiTwWEFr-VsePew7Sjg', 'aiX_WP7NKPTdF9CfI-M-wg', 'e4NQLZynhSmvwl38hC4m-A', 'S-oLPRdhlyL5HAknBKTUcQ', 'VyVIneSU7XAWgMBllI6LnQ', 'pSQFynH1VxkfSmehRXlZWw', 'JzOp695tclcNCNMuBl7oxA', 'OgJ0KxwJcJ9R5bUK0ixCbg', '3l54GTr8-E3XPbIxnF_sAA', '9a3DrZvpYxVs3k_qwlCNSw', 'frCxZS7lPhEnQRJ3UY6m7A', 'yNPh5SO-7wr8HPpVCDPbXQ', '0FUtlsQrJI7LhqDPxLumEw', 'K-uQkfSUTwu5LIwPB4b_vg', 'L2p0vO3fsS2LC6hhQo3CzA', 'd10IxZPirVJlOSpdRZJczA', 'wUKzaS1MHg94RGM6z8u9mw', 'z6-reuC5BYf_Rth9gMBfgQ', 'aiX_WP7NKPTdF9CfI-M-wg', '3C5Z9homtzkWHouH2BHXYQ', 'C8D_GU9cDDjbOJfCaGXxDQ', 'Yl05MqCs9xRzrJFkGWLpgA', 'eS29S_06lvsDW04wVrIVxg', 'IsoLzudHC50oJLiEWpwV-w', '3N9U549Zse8UP-MwKZAjAQ', '_XN-GwzZwAyIqLKJsl2htg', 'r5PLDU-4mSbde5XekTXSCA', 'Iq7NqQD-sESu3vr9iEGuTA', 'u-SJ5QUwrNquL9VnXwl8cg', 'sJNcipFYElitBrtiJx0ezQ', '7m1Oa1VYV98UUuo_6i0EZg', 'e4NQLZynhSmvwl38hC4m-A', 'k1QpHAkzKTrFYfk6u--VgQ', '-6tvduBzjL

In [52]:
################################################################################
################ ELABORATE USER PHOTOS WITH CLARIFAI ###########################

def detect_clarifai(id, url_photo):
  """Run the Clarifai model on a photo URL and tabulate the detected regions.

  Args:
    id: User id written to the 'user_id' column of every returned row.
    url_photo: URL of the picture to analyse.

  Returns:
    DataFrame with columns user_id, age, gender, ethnicity: one row per
    region in the response, or a single row with NaN attributes when the
    response carries no region data.
  """
  # NOTE(review): the API key is committed in source — consider rotating it
  # and loading it from configuration instead.
  app = ClarifaiApp(api_key='2fdc162ad6534a79a2c562b27c357db1')
  model = app.models.get(model_id="c0c0ac362b03416da06ab3fa36fb58e3")
  response = model.predict_by_url(url = '%s' % url_photo)

  columns = ['user_id', 'age', 'gender', 'ethnicity']
  data = response['outputs'][0]['data']
  # An empty payload, or one without a 'regions' entry, means nothing usable
  # was detected.
  regions = data.get('regions') if data else None

  if regions is None:
    # np.nan rather than np.NaN: the capitalised alias was removed in NumPy 2.0.
    rows = [{'user_id': id, 'age': np.nan, 'gender': np.nan, 'ethnicity': np.nan}]
  else:
    rows = []
    for region in regions:
      concepts = region['data']['concepts']
      # presumably positions 0, 20 and 22 hold the top age, gender and
      # ethnicity concepts of this model's output — TODO confirm
      rows.append({'user_id': id,
                   'age': concepts[0]['name'],
                   'gender': concepts[20]['name'],
                   'ethnicity': concepts[22]['name']})

  # Build the frame in one go; DataFrame.append was removed in pandas 2.0.
  return pd.DataFrame(rows, columns=columns)


# Read gtree.csv, which holds, per business_id, the ids of its Drive folders/files.
download = drive.CreateFile({'id': '1MIWS9uI9GKxsgAlzB0wXFBiutE025u2c'}) # id file gtree.csv
download.GetContentFile('gtree.csv')
gtree = pd.read_csv('gtree.csv')
gtree = drop_unnamed(gtree)
id_clarifai_folder = '1wfZ0d3cTufosLRhPKkShf7kQbJlUbBvH' # folder data > Clarifai

# Download restaurants.txt from Drive; each stripped line is used as a business id.
restaurants_dataset_id = '1BqMsph8flQMGmcE_WxYV0vSC_uqi-nge'  # Drive file id, taken from the file's sharing link
download = drive.CreateFile({'id': restaurants_dataset_id})
download.GetContentFile('restaurants.txt')
with open('restaurants.txt') as f:
    content = f.readlines()
restaurants_id_list = [x.strip() for x in content]

print(restaurants_id_list)

# Restaurant to process (1-based position in restaurants_id_list).
numrest = 7
id_restaurant = restaurants_id_list[(numrest-1):(numrest)][0]
startfrom = 0  # BE CAREFUL, IF 0 OVERWRITE THE CSV IN DRIVE

id_photo_folder = gtree.loc[gtree['business_id'] == id_restaurant, 'gfolder_userphoto'].tolist()[0]
print(id_restaurant)
print(id_photo_folder)

#if startfrom == 0:
  #df_clarifai = pd.DataFrame(columns=['user_id', 'age', 'gender', 'ethnicity'])
  #df_clarifai.to_csv(str(numrest) + ' - ' + id_restaurant + '.csv')
  #id_new_file = upload_file(str(numrest) + ' - ' + id_restaurant + '.csv', id_clarifai_folder)
  #gtree.loc[gtree['business_id']==id_restaurant, 'gfolder_clarifai'] = id_new_file
  #gtree.to_csv('gtree.csv')
  #upload_file('gtree.csv', '1eUt2wyCOULW0-LdL6vyRJi1mUs_U46DD')
#else:
# Resume from the results CSV already stored in Drive for this restaurant.
id_clarifai_file = gtree.loc[gtree['business_id']==id_restaurant, 'gfolder_clarifai'].tolist()[0]
print(id_clarifai_file)
download = drive.CreateFile({'id': id_clarifai_file})
download.GetContentFile(str(numrest) + ' - ' + id_restaurant + '.csv')
df_clarifai = pd.read_csv(str(numrest) + ' - ' + id_restaurant + '.csv')

file_list = drive.ListFile({'q': "'" + id_photo_folder + "' in parents and trashed=false"}).GetList()    # folder the photos are taken from

# Counts photos seen since the last checkpoint.
count = 0

for photo in file_list[startfrom:]:
  # The first 22 characters of the file title are the user id.
  user_id = photo['title'][0:22]
  count = count + 1
  if user_id not in df_clarifai['user_id'].values:
    minidf_result = detect_clarifai(user_id, photo['thumbnailLink'])
    # DataFrame.append was removed in pandas 2.0; pd.concat is the supported
    # way to add the new rows.
    df_clarifai = pd.concat([df_clarifai, minidf_result], ignore_index=True)
    df_clarifai = drop_unnamed(df_clarifai)
    if count == 100:
      # Checkpoint: save and upload the partial results.
      count = 0
      df_clarifai.to_csv(str(numrest) + ' - ' + id_restaurant + '.csv')
      id_new_file = upload_file(str(numrest) + ' - ' + id_restaurant + '.csv', id_clarifai_folder)
      print('csv caricato: ' + id_new_file)

    print(startfrom)
    startfrom = startfrom + 1
  else:
    # Already processed: nothing new to save, only reset the counter.
    if count == 100:
      count = 0
    print(str(startfrom) + ' già fatto')
    startfrom = startfrom + 1

# Final save and upload of the complete results.
df_clarifai.to_csv(str(numrest) + ' - ' + id_restaurant + '.csv')
id_new_file = upload_file(str(numrest) + ' - ' + id_restaurant + '.csv', id_clarifai_folder)
print('csv caricato: ' + id_new_file)

['WbJ1LRQdOuYYlRLyTkuuxw', 'T2tEMLpTeSMxLKpxwFdS3g', 'ALwAlxItASeEs2vYAeLXHA', 'OVTZNSkSfbl3gVB9XQIJfw', 'Sovgwq-E-n6wLqNh3X_rXg', 'j5nPiTwWEFr-VsePew7Sjg', 'aiX_WP7NKPTdF9CfI-M-wg', 'e4NQLZynhSmvwl38hC4m-A', 'S-oLPRdhlyL5HAknBKTUcQ', 'VyVIneSU7XAWgMBllI6LnQ', 'pSQFynH1VxkfSmehRXlZWw', 'JzOp695tclcNCNMuBl7oxA', 'OgJ0KxwJcJ9R5bUK0ixCbg', '3l54GTr8-E3XPbIxnF_sAA', '9a3DrZvpYxVs3k_qwlCNSw', 'frCxZS7lPhEnQRJ3UY6m7A', 'yNPh5SO-7wr8HPpVCDPbXQ', '0FUtlsQrJI7LhqDPxLumEw', 'K-uQkfSUTwu5LIwPB4b_vg', 'L2p0vO3fsS2LC6hhQo3CzA', 'd10IxZPirVJlOSpdRZJczA', 'wUKzaS1MHg94RGM6z8u9mw', 'z6-reuC5BYf_Rth9gMBfgQ', 'aiX_WP7NKPTdF9CfI-M-wg', '3C5Z9homtzkWHouH2BHXYQ', 'C8D_GU9cDDjbOJfCaGXxDQ', 'Yl05MqCs9xRzrJFkGWLpgA', 'eS29S_06lvsDW04wVrIVxg', 'IsoLzudHC50oJLiEWpwV-w', '3N9U549Zse8UP-MwKZAjAQ', '_XN-GwzZwAyIqLKJsl2htg', 'r5PLDU-4mSbde5XekTXSCA', 'Iq7NqQD-sESu3vr9iEGuTA', 'u-SJ5QUwrNquL9VnXwl8cg', 'sJNcipFYElitBrtiJx0ezQ', '7m1Oa1VYV98UUuo_6i0EZg', 'e4NQLZynhSmvwl38hC4m-A', 'k1QpHAkzKTrFYfk6u--VgQ', '-6tvduBzjL

In [72]:
################################################################################
############################### CHECK NUMBERS ##################################

# Read gtree.csv, which holds, per business_id, the ids of its Drive folders/files.
download = drive.CreateFile({'id': '1MIWS9uI9GKxsgAlzB0wXFBiutE025u2c'}) # id file gtree.csv
download.GetContentFile('gtree.csv')
gtree = pd.read_csv('gtree.csv')
gtree = drop_unnamed(gtree)

# Download restaurants.txt from Drive; each stripped line is used as a business id.
restaurants_dataset_id = '1BqMsph8flQMGmcE_WxYV0vSC_uqi-nge'  # Drive file id, taken from the file's sharing link
download = drive.CreateFile({'id': restaurants_dataset_id})
download.GetContentFile('restaurants.txt')
with open('restaurants.txt') as f:
    content = f.readlines()
restaurants_id_list = [x.strip() for x in content] 

print(restaurants_id_list)

# Check restaurants numbered numero_risto..71 (1-based positions in restaurants_id_list).
numero_risto = 71

while numero_risto < 72:
  print(numero_risto)
  id_risto = restaurants_id_list[(numero_risto-1):(numero_risto)][0]
  print(id_risto)

  ########################## RANKING ##############################
  # User ids in the ranking, read from the first file listed in the rankings folder.
  user_ranking_folder = gtree.loc[gtree['business_id']==id_risto, 'gfolder_rankings'].tolist()[0]
  ranking_file = drive.ListFile({'q': "'" + user_ranking_folder + "' in parents and trashed=false"}).GetList()[0]  
  download = drive.CreateFile({'id': ranking_file['id']})
  download.GetContentFile('ranking_' + id_risto + '.csv')
  file_ranking = pd.read_csv('ranking_' + id_risto + '.csv')
  ranking_lista = file_ranking['user_id'].values.tolist()

  ########################## PHOTO ##############################
  # User ids that have a photo: the first 22 characters of each file title.
  user_photo_folder = gtree.loc[gtree['business_id']==id_risto, 'gfolder_userphoto'].tolist()[0]
  photo_lista = drive.ListFile({'q': "'" + user_photo_folder + "' in parents and trashed=false"}).GetList() 
  photo_lista = [photo['title'][0:22] for photo in photo_lista]
  # Print every user id that appears more than once among the photos.
  seen = set()
  print('duplicates:')
  for x in photo_lista:
    if x not in seen:
        seen.add(x)
    else:
      print(x)

  ########################## CLARIFAI ##############################
  # Clarifai results, reduced to one row per user for counting.
  user_clarifai_id = gtree.loc[gtree['business_id']==id_risto, 'gfolder_clarifai'].tolist()[0]
  download = drive.CreateFile({'id': user_clarifai_id})
  download.GetContentFile('clarifai_' + id_risto + '.csv')
  file_clarifai = pd.read_csv('clarifai_' + id_risto + '.csv')
  clarifai_without_duplicates = file_clarifai.drop_duplicates(subset=('user_id')).reset_index()
  clarifai_lista = clarifai_without_duplicates['user_id'].values.tolist()

  # Print the three totals side by side so they can be compared.
  print('tot ranking = tot foto = tot clarifai')
  tot_ranking = len(file_ranking.index)
  tot_foto = len(photo_lista)
  tot_clarifai = len(clarifai_without_duplicates.index)
  # Set when Clarifai rows for users outside the ranking are found and removed.
  toomuch = False
  print(str(tot_ranking) + ' = ' + str(tot_foto) + ' = ' + str(tot_clarifai))
  #if tot_ranking!=tot_foto or tot_ranking!=tot_clarifai or tot_foto!=tot_clarifai:
  # Photos whose user is not in the ranking.
  print('FOTO CHE NON STANNO NEL RANKING:')
  for unid in photo_lista:
    if unid not in ranking_lista:
      print(unid)
  # Ranked users whose photo has not been downloaded yet.
  print('FOTO ANCORA DA SCARICARE (CHE STANNO NEL RANKING)')
  for unid in ranking_lista:
    if unid not in photo_lista:
      print(unid)
  # Ranked users without a Clarifai result.
  print('CLARIFAI ANCORA DA ELABORARE (CHE STANNO NEL RANKING)')
  for unid in ranking_lista:
    if unid not in clarifai_lista:
      print(unid)
  # Clarifai results whose user is not in the ranking: print and drop them.
  print('CLARIFAI CHE NON STANNO NEL RANKING:')
  for unid in clarifai_lista:
    if unid not in ranking_lista:
      print(unid)
      toomuch = True
      file_clarifai = file_clarifai[file_clarifai['user_id'] != unid]
  if toomuch:
    # Upload the cleaned Clarifai results to the Clarifai folder.
    file_clarifai = drop_unnamed(file_clarifai)
    file_clarifai.to_csv(str(numero_risto) + ' - ' + id_risto + '.csv')
    upload_file(str(numero_risto) + ' - ' + id_risto + '.csv', '1wfZ0d3cTufosLRhPKkShf7kQbJlUbBvH')
  numero_risto += 1


['WbJ1LRQdOuYYlRLyTkuuxw', 'T2tEMLpTeSMxLKpxwFdS3g', 'ALwAlxItASeEs2vYAeLXHA', 'OVTZNSkSfbl3gVB9XQIJfw', 'Sovgwq-E-n6wLqNh3X_rXg', 'j5nPiTwWEFr-VsePew7Sjg', 'aiX_WP7NKPTdF9CfI-M-wg', 'e4NQLZynhSmvwl38hC4m-A', 'S-oLPRdhlyL5HAknBKTUcQ', 'VyVIneSU7XAWgMBllI6LnQ', 'pSQFynH1VxkfSmehRXlZWw', 'JzOp695tclcNCNMuBl7oxA', 'OgJ0KxwJcJ9R5bUK0ixCbg', '3l54GTr8-E3XPbIxnF_sAA', '9a3DrZvpYxVs3k_qwlCNSw', 'frCxZS7lPhEnQRJ3UY6m7A', 'yNPh5SO-7wr8HPpVCDPbXQ', '0FUtlsQrJI7LhqDPxLumEw', 'K-uQkfSUTwu5LIwPB4b_vg', 'L2p0vO3fsS2LC6hhQo3CzA', 'd10IxZPirVJlOSpdRZJczA', 'wUKzaS1MHg94RGM6z8u9mw', 'z6-reuC5BYf_Rth9gMBfgQ', 'aiX_WP7NKPTdF9CfI-M-wg', '3C5Z9homtzkWHouH2BHXYQ', 'C8D_GU9cDDjbOJfCaGXxDQ', 'Yl05MqCs9xRzrJFkGWLpgA', 'eS29S_06lvsDW04wVrIVxg', 'IsoLzudHC50oJLiEWpwV-w', '3N9U549Zse8UP-MwKZAjAQ', '_XN-GwzZwAyIqLKJsl2htg', 'r5PLDU-4mSbde5XekTXSCA', 'Iq7NqQD-sESu3vr9iEGuTA', 'u-SJ5QUwrNquL9VnXwl8cg', 'sJNcipFYElitBrtiJx0ezQ', '7m1Oa1VYV98UUuo_6i0EZg', 'e4NQLZynhSmvwl38hC4m-A', 'k1QpHAkzKTrFYfk6u--VgQ', '-6tvduBzjL

In [5]:
################################################################################
#################### MODIFY SOME VALUES IN GTREE ###############################

# Read gtree.csv, set one value by hand, and upload the result.
download = drive.CreateFile({'id': '1MIWS9uI9GKxsgAlzB0wXFBiutE025u2c'}) # id file gtree.csv
download.GetContentFile('gtree.csv')
df = pd.read_csv('gtree.csv')
df = drop_unnamed(df)

#BE CAREFUL AT THE INDEX
# NOTE(review): `indice` is only referenced by the commented-out assignments below.
indice = 31  # IF RESTAURANT 20, THEN 10 IN GTREE
#df.loc[indice, 'business_id'] = ''
#df.loc[indice, 'gfolder_id'] = ''
#df.loc[indice, 'gfolder_rankings'] = ''
#df.loc[indice, 'gfolder_groups'] = ''
#df.loc[indice, 'gfolder_userphoto'] = ''
#df.loc[indice, 'gfolder_clarifai'] = ''
# Set the Clarifai file id for one business, selected by business_id.
df.loc[df['business_id']=='K-uQkfSUTwu5LIwPB4b_vg', 'gfolder_clarifai'] = '1TwJ8l0tvpLStvHOl3nIz0s7qgXBmkQeF'

gtree = drop_unnamed(df)
gtree.to_csv('gtree.csv')
upload_file('gtree.csv', '1eUt2wyCOULW0-LdL6vyRJi1mUs_U46DD') # id folder data

'1MIWS9uI9GKxsgAlzB0wXFBiutE025u2c'

In [None]:
################################################################################
######################## INSERT ONE ROW IN GTREE ###############################

def Insert_row(row_number, df, row_value):
    """Return a copy of `df` with `row_value` inserted at position `row_number`.

    Rows are relabelled 0..n-1 before the insertion, so existing index labels
    are not preserved. The input frame is left untouched.

    Args:
      row_number: Position of the new row in the result, from 0 to the number
        of rows in `df`.
      df: Frame to insert into.
      row_value: Values of the new row, one per column.

    Returns:
      A new DataFrame with one more row than `df`, indexed 0..n.
    """
    # Work on a copy: relabelling and enlarging `df` itself would leave the
    # caller's frame with a rewritten index and an extra row.
    result = df.copy()
    row_count = result.shape[0]
    # Labels before the insertion point stay put; the others shift down by
    # one, leaving the label `row_number` free for the new row.
    result.index = list(range(row_number)) + list(range(row_number + 1, row_count + 1))
    result.loc[row_number] = row_value
    return result.sort_index()

# Read gtree.csv, insert one hand-written row at a given position, and upload the result.
download = drive.CreateFile({'id': '1MIWS9uI9GKxsgAlzB0wXFBiutE025u2c'}) # id file gtree.csv
download.GetContentFile('gtree.csv')
df = pd.read_csv('gtree.csv')
print(df)
df = drop_unnamed(df)
 
# Position of the new row and its values, one per gtree column.
row_number = 66
row_value = ['fL-b760btOaGa85OJ9ut3w', '1DhrpTWJygfaPrYkHNH1xuz-HxWZBZe__',
             '1Vm5kyWMQQCSMBLSGpDwVfHIR4bp5-JIt', '19bUEYHBUGKxt7DjStk0qJIlqq4NKslw8',
             '','','','','']
  
# The row can be placed at most one position past the current last index.
if row_number > df.index.max()+1: 
  print("Invalid row_number") 
else:
  df = Insert_row(row_number, df, row_value) 
  print(df[row_number-1:])

# The file is written and uploaded in both branches.
gtree = drop_unnamed(df)
gtree.to_csv('gtree.csv')
upload_file('gtree.csv', '1eUt2wyCOULW0-LdL6vyRJi1mUs_U46DD') # id folder data

In [None]:
################################################################################
##################### INSERT ROWS AT BEGIN IN GTREE ############################

# Read gtree.csv, put the hand-written rows below in front of it, and upload the result.
download = drive.CreateFile({'id': '1MIWS9uI9GKxsgAlzB0wXFBiutE025u2c'}) # id file gtree.csv
download.GetContentFile('gtree.csv')
gtree = pd.read_csv('gtree.csv')

# One list per restaurant, in the column order given to df1 below.
values = [['WbJ1LRQdOuYYlRLyTkuuxw','1DFANWg2Io81FaV_GxlwBWfpG9Bx-Dgr7','1U0Ml5puObfOk3qHCKhO9bQ06WyhJoffW','1Ba-IYi1icz5tuO_AYN-f-FvjXjA5NmJi','','','','19r7YGFp7oAqItv63TZOPyAt4hDa_UyRr',''],
          ['T2tEMLpTeSMxLKpxwFdS3g','1ENW6Pyiyt7v6p54Uhn9zrM9HQ28MB1-G','136FZ0Y90Zx-4UYDyW50cX8zUa_DhN3XS','1wAgWQ6Vmy3fe-RJ8_Ezwow75ftvOFP5_','','','','1f1NhSvHMwyJOxIrGbDix1lQ5NN4eHoG4',''],
          ['ALwAlxItASeEs2vYAeLXHA','1oEws_0F-s4hS6CpBxqxuzoJ2Wo-Nf8eL','12zXi3XyQaNgukGHW_805cyrkHhSdF7Df','1lZKqsV-sTAEOnLv-MQxpdao7eBWkjKP1','','','','1mWA3-mH9Kl0Xho24he8__4nZJBVPx9Qh',''],
          ['OVTZNSkSfbl3gVB9XQIJfw','1hgql_Ey5epZKj-SuB4F5Fu1eYOd2P8RX','1ZyNQavpG0akr3ca3PJjjl_D89IA5wlcc','1-drKKIaitJ0tjEY1W80B3kRXI5Ive5lC','','','','1OqlKSXZUvZCOOmTAONENS9Sq0twMCRM8',''],
          ['Sovgwq-E-n6wLqNh3X_rXg','10fJv8tOFOwBiAZBVEedsqxi3g6zYLxjD','1EZqvt9x5PN07BgUad1RtNIUXTVqujA0g','1QcSsJsvQne8a9jMvL9_1VAmeGIrsq8R8','','','','1ArjfNsamdhSpG7BJwnJZ_qULh_aklp62',''],
          ['j5nPiTwWEFr-VsePew7Sjg','1yRDL_lrQFsCijdyvDssRgdZ5a2wyjCXj','18bQVXYZ03vIpfEFLPh8I2i7cPTlokGxv','1VysI0VkliBEU8Y1sCX0PU9eKYE6GaCzC','','','','1KW9VPvY33rbkgPfzrREpxMfSQJg4xxoL',''],
          ['aiX_WP7NKPTdF9CfI-M-wg','1lQu_nXTCrzMFOm3uuXmCNVzx0PmYv0Of','1nl6A997UnuR5ceYRZS1242JJCvK-Sf_u','12Z4PEBlcTkz6kjeJ0JYRKI3WCrjRi4bF','','','','11H38dweaCKJ4-TD88nuzV70lMF5n4yzG',''],
          ['e4NQLZynhSmvwl38hC4m-A','1f3bGHoK6Hy44pKRqSsnhN8KM-W3AAChW','1K566Y5Q2N6Lw7S6yDicKC_R-zFVRqnFr','1reeSQL8MfkxOQg-NsVxazVdKfG1XNPEF','','','','1fVLmOBbg44H8q_UcwAOQiX6CeN6hf84Z',''],
          ['S-oLPRdhlyL5HAknBKTUcQ','1bG_50_Yyh9vq7YDQ3zSNTqS4acm1Cs7x','1FJqfJgaJinXS3oIvdA53FMuO9O7bCDVn','17QEFZUC_rhrLk9mSDXoegq7TSCI2_JLS','','','','1CMUT4RhSeuPi0j4VGoLCe-iP9-Aj5Fsy','']]
          
df1=pd.DataFrame(values, columns=['business_id', 'gfolder_id', 'gfolder_rankings', 
                              'gfolder_groups', 'gfolder_#11', 'gfolder_#12',
                              'gfolder_#14','gfolder_userphoto','gfolder_clarifai'])
df2=gtree

# New rows first, then the existing gtree rows; the index is renumbered from 0.
df3 = pd.concat([df1, df2]).reset_index(drop=True)
df3 = drop_unnamed(df3)
print(df3)
# The combined frame (df3) is what gets written and uploaded.
df3.to_csv('gtree.csv')
# `gtree` itself still holds only the rows that were downloaded; it is printed for comparison.
gtree = drop_unnamed(gtree)
print(gtree)
upload_file('gtree.csv', '1eUt2wyCOULW0-LdL6vyRJi1mUs_U46DD') # id folder data

    #restaurant  ... gfolder_clarifai
0             1  ...              NaN
1             2  ...              NaN
2             3  ...              NaN
3             4  ...              NaN
4             5  ...              NaN
..          ...  ...              ...
94           95  ...              NaN
95           96  ...              NaN
96           97  ...              NaN
97           98  ...              NaN
98           99  ...              NaN

[99 rows x 10 columns]


'1MIWS9uI9GKxsgAlzB0wXFBiutE025u2c'

In [None]:
################################################################################
##################### ADD AND REORDER COLUMNS IN GTREE #########################

# Read gtree.csv, add a 1-based '#restaurant' column as the first column, and
# upload the result.
download = drive.CreateFile({'id': '1MIWS9uI9GKxsgAlzB0wXFBiutE025u2c'}) # id file gtree.csv
download.GetContentFile('gtree.csv')
gtree = pd.read_csv('gtree.csv')
# Drop the saved index column before touching the column order.
gtree = drop_unnamed(gtree)

gtree['#restaurant'] = gtree.index + 1
cols = gtree.columns.tolist()
# Put '#restaurant' first by name rather than by rotating the last column:
# when the column already exists the assignment above updates it in place,
# and a rotation would move an unrelated column to the front.
cols = ['#restaurant'] + [c for c in cols if c != '#restaurant']
gtree = gtree[cols]

print(gtree)
# Write the modified frame before uploading; otherwise the file uploaded is
# the unmodified copy that was just downloaded.
gtree.to_csv('gtree.csv')
upload_file('gtree.csv', '1eUt2wyCOULW0-LdL6vyRJi1mUs_U46DD') # id folder data