<a href="https://colab.research.google.com/github/EleonoraBartolomucci/Fairness/blob/master/plots.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from lxml import html
import pandas as pd
import json
import csv
import numpy as np
import itertools
import os
import pickle
import random
import datetime
import math
from pydrive.auth import GoogleAuth
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials


import matplotlib.mlab as mlab
import matplotlib.pyplot as plt
import statistics as st
from scipy import stats
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.cluster import SpectralClustering
from sklearn.cluster import KMeans

from sklearn.cluster import DBSCAN
from sklearn import metrics
from sklearn.datasets import make_blobs
from sklearn.preprocessing import StandardScaler

#!pip install balanced_kmeans
#from balanced_kmeans import kmeans
#from balanced_kmeans import kmeans_equal

import networkx as nx
import time

# CONSTANTS
# Column names of a business table. The leading 'index' entry presumably
# matches the index column pandas writes with to_csv -- TODO confirm against
# the code that consumes this list (not visible in this part of the file).
business_headers = ['index', 'business_id', 'name', 'address', 'city', 'state', 'postal_code', 'latitude', 'longitude', 'stars', 'review_count', 'is_open', 'attributes', 'categories', 'hours']

In [None]:
# GOOGLE DRIVE AUTHENTICATION
def authenticate():
  """Authenticate the Colab user and return a PyDrive ``GoogleDrive`` client.

  The Colab login is triggered first; the application-default credentials
  are then attached to a ``GoogleAuth`` object that backs the client.
  """
  auth.authenticate_user()
  credentials_holder = GoogleAuth()
  credentials_holder.credentials = GoogleCredentials.get_application_default()
  return GoogleDrive(credentials_holder)


# Module-level Drive client used by the helpers that read from Drive.
drive = authenticate()

In [None]:
def upload_file(filename, folder_id):
  """Upload the local file ``filename`` into the Drive folder ``folder_id``.

  When the folder already contains a non-trashed file with the same title,
  that file is overwritten instead of creating a duplicate.

  Returns the Drive id of the uploaded file.
  """
  gdrive = authenticate()
  listing_query = "'" + folder_id + "' in parents and trashed=false"
  folder_entries = gdrive.ListFile({'q': listing_query}).GetList()

  metadata = {'title': filename, 'parents': [{'id': folder_id}]}
  same_title_ids = [entry['id'] for entry in folder_entries
                    if entry['title'] == filename]
  if same_title_ids:
    # With several matches, the last one listed is the one overwritten.
    metadata['id'] = same_title_ids[-1]

  drive_file = gdrive.CreateFile(metadata)
  drive_file.SetContentFile(filename)  # local path that provides the content
  drive_file.Upload()
  return drive_file['id']
  

def create_folder_in_drive(gdrive, folder_name, parent_folder_id):
  """Create the folder ``folder_name`` under ``parent_folder_id`` in Drive.

  ``gdrive`` is the Drive client used to create the folder.  The folder
  object and its title/id are printed; the new folder id is returned.
  """
  parent_link = {"kind": "drive#fileLink", "id": parent_folder_id}
  folder = gdrive.CreateFile({
      'title': folder_name,
      'mimeType': 'application/vnd.google-apps.folder',
      'parents': [parent_link],
  })
  folder.Upload()
  print(folder)
  # Report the title and id of the folder that was just created
  folder_title, folder_id = folder['title'], folder['id']
  print('title: %s, id: %s' % (folder_title, folder_id))
  return folder_id


# READ JSON
def read_json(json_path):
  """Read a JSON-lines file and return its records as a list.

  Every non-blank line of ``json_path`` is parsed as one JSON document.
  Blank lines (for example a trailing empty line) are skipped instead of
  raising ``json.JSONDecodeError``.  The file is decoded as UTF-8 so the
  result does not depend on the platform's default encoding.

  Returns a list with one parsed object per non-blank line, in file order.
  """
  records = []
  with open(json_path, "r", encoding="utf-8") as json_file:
    for line in json_file:
      if not line.strip():
        continue  # nothing to parse on an empty line
      records.append(json.loads(line))
  return records


# PARSE JSON IN CSV
def json2csv(csv_path, json_path):
  """Convert the JSON-lines file ``json_path`` into the CSV file ``csv_path``.

  The records are loaded with ``read_json`` and written through a pandas
  DataFrame, so the CSV also carries the DataFrame index column.
  """
  records = read_json(json_path)
  pd.DataFrame(records).to_csv(csv_path)


def drop_unnamed(df):
  """Return ``df`` without the columns whose label starts with 'unnamed'.

  The comparison is case-insensitive, so the 'Unnamed: 0' columns produced
  by reading back a CSV that was written with its index are removed.

  Labels are converted with ``str`` before the check, so non-string labels
  (e.g. integers) are kept instead of raising ``AttributeError``.  Columns
  are selected with a boolean mask, which keeps duplicated labels exactly
  once each (selecting by a list of labels would repeat them).
  """
  keep = [not str(label).lower().startswith('unnamed') for label in df.columns]
  return df.loc[:, keep]


# JOIN USER FROM YELP WITH USER FROM DATASET AND PRINT LOST USERS
def filter_user_from_dataset(df, users):
  """Join the ranking ``df`` with the dataset ``users`` on 'user_id'.

  Only 'position', 'user_id', 'date' and 'location' are taken from ``df``;
  every other column comes from ``users``.  The number of users lost in the
  join is printed, positions are renumbered from 1 following the merged
  order, and 'unnamed' columns are dropped from the result.
  """
  # Keep just these columns: the remaining ones are taken from the dataset
  ranking_columns = ['position', 'user_id', 'date', 'location']
  df = df[ranking_columns]
  df_merged = df.merge(users, on='user_id')
  total_review = df['position'].max()
  kept = len(df_merged.index)
  print('Utenti persi: totali {} - utenti nel dataset {} = {}'.format(
      total_review, kept, total_review - kept))
  df_merged['position'] = df_merged.index + 1
  return drop_unnamed(df_merged)


def compute_groups_percents(df_groups, N_of_groups):
  """Return the percentage of rows of ``df_groups`` in each group.

  Group ids ``0 .. N_of_groups - 1`` are looked up in the 'group_id'
  column.  The resulting list (one percentage per group id, in order) is
  printed and returned.
  """
  total = len(df_groups.index)
  percents = [
      (len(df_groups[df_groups['group_id'] == group_id].index) / total) * 100
      for group_id in range(N_of_groups)
  ]
  print(percents)
  return percents


# ADD REVIEW INFO IN RANKING DATAFRAME
def integrate_review_info(df, reviews, business_id):
  """Attach the review data of ``business_id`` to the ranking ``df``.

  The reviews of the business are merged with ``df`` on 'user_id'.  For
  users with several reviews only the one sorting last by 'date' is kept.
  Rows are then ordered by 'position', renumbered from 1, and reduced to a
  fixed set of columns.  The number of ranking rows lost is printed.
  """
  of_this_business = reviews['business_id'] == business_id
  df_merged = reviews[of_this_business].merge(df, on='user_id')
  # 'date_y' is the date from the Yelp website (not updated in the dataset),
  # so it is discarded and the reviews' 'date_x' becomes plain 'date'
  df_merged = df_merged.drop(columns=['date_y']).rename(columns={'date_x': 'date'})
  # Keep one review per user
  one_per_user = df_merged.sort_values('date').drop_duplicates('user_id', keep='last')
  df_merged = one_per_user.sort_values(by=['position']).reset_index(drop=True)
  print('Review perse: ', len(df.index) - len(df_merged.index))
  df_merged['position'] = df_merged.index + 1

  output_columns = ['position', 'user_id', 'review_id', 'date', 'name', 'location',
                    'text', 'fans', 'average_stars', 'review_count', 'business_id']
  return drop_unnamed(df_merged)[output_columns]

import ast
def reorder_user_data(df):
  """Flatten the nested 'user', 'comment' and 'feedback' cells into columns.

  Each of those cells holds a Python literal stored as a string; it is
  parsed with ``ast.literal_eval`` and the following columns are filled:
  'review_count', 'friend_count', 'location', 'photo_count' (from 'user'),
  'text' (from 'comment') and 'useful_votes', 'funny_votes', 'cool_votes'
  (from the 'counts' entry of 'feedback').

  The new columns are added to ``df`` in place; the returned frame is ``df``
  without its 'unnamed' columns.
  """
  # np.nan is the spelling available in every NumPy release; the np.NaN
  # alias used previously was removed in NumPy 2.0.
  df['review_count'] = np.nan
  df['friend_count'] = np.nan
  # 'location' receives strings: start it as an object column so the
  # assignments below do not write text into a float column.
  df['location'] = np.nan
  df['location'] = df['location'].astype(object)
  df['photo_count'] = np.nan
  df['useful_votes'] = np.nan
  df['funny_votes'] = np.nan
  df['cool_votes'] = np.nan
  df['text'] = ""
  for i, user in df.iterrows():
    print(i)
    user_data = ast.literal_eval(user['user'])
    df.loc[i, 'review_count'] = user_data["reviewCount"]
    df.loc[i, 'friend_count'] = user_data["friendCount"]
    df.loc[i, 'location'] = user_data['displayLocation']
    df.loc[i, 'photo_count'] = user_data['photoCount']

    comment_data = ast.literal_eval(user['comment'])
    df.loc[i, 'text'] = comment_data["text"]

    # 'counts' is already a dict once 'feedback' has been parsed
    counts_data = ast.literal_eval(user['feedback'])['counts']
    df.loc[i, 'useful_votes'] = counts_data['useful']
    df.loc[i, 'funny_votes'] = counts_data['funny']
    df.loc[i, 'cool_votes'] = counts_data['cool']
  print('REORDER-----------------------------------------')
  print(df)
  df = drop_unnamed(df)
  return df



def generate_dummies(df, text_attribute_list):
  """Append one-hot indicator columns for each attribute to ``df``.

  For every column named in ``text_attribute_list`` the indicator columns
  from ``pd.get_dummies`` are joined to ``df`` on the index, and 'unnamed'
  columns are dropped after each join.

  Returns the extended frame and the list of indicator column names, in
  the order they were generated.
  """
  dummy_columns = []
  for attribute in text_attribute_list:
    indicators = pd.get_dummies(df[attribute])
    dummy_columns.extend(indicators.columns)
    joined = df.merge(indicators, how="left", left_index=True, right_index=True)
    # remove every 'unnamed' column before handling the next attribute
    df = drop_unnamed(joined)
  return df, dummy_columns


def create_vectors_yelp(df_users, business_id, list_of_attributes, method, destination):
  """Build the per-user vectors of ``business_id`` and upload them to Drive.

  The profile columns of ``df_users`` are enriched with the demographics
  returned by ``get_demographics``; the result is saved as
  'user_vectors_<business_id>.csv', uploaded to the Drive folder
  ``destination`` and returned.  ``list_of_attributes`` and ``method`` are
  not used here.
  """
  profile_columns = ['user_id', 'review_count', 'friend_count', 'location', 'photo_count']
  # DEMOGRAPHICS
  vectors = get_demographics(df_users[profile_columns], business_id)
  # SENTIMENT (currently disabled)
  #vectors = get_sentiment(vectors, business_id)

  print(vectors.columns)
  # SAVE THE VECTORS AND UPLOAD THEM TO DRIVE
  vectors_csv = 'user_vectors_' + business_id + '.csv'
  vectors.to_csv(vectors_csv)
  upload_file(vectors_csv, destination)
  return vectors


def get_sentiment(vectors, id):
  """Left-join the rows of 'sentiment_<id>.csv' onto ``vectors``.

  The join key is 'user_id'; 'unnamed' columns are dropped from the result.
  """
  sentiment_csv = 'sentiment_' + id + '.csv'
  with_sentiment = vectors.merge(pd.read_csv(sentiment_csv), on='user_id', how='left')
  return drop_unnamed(with_sentiment)


def get_demographics(vectors, id):
  """Left-join the demographics of business ``id`` onto ``vectors``.

  The Drive file id is read from the 'gfolder_clarifai' column of the
  module-level ``gtree`` table, and the file is downloaded as
  'demographics_<id>.csv'.  When a user appears several times, the row with
  the highest 'age' is kept.  The join key is 'user_id'; 'unnamed' columns
  are dropped from the result.
  """
  business_rows = gtree['business_id'] == id
  demographics_file_id = gtree.loc[business_rows, 'gfolder_clarifai'].tolist()[0]
  local_csv = 'demographics_' + id + '.csv'
  drive.CreateFile({'id': demographics_file_id}).GetContentFile(local_csv)

  df_demographics = (
      pd.read_csv(local_csv)
      .sort_values(['user_id', 'age'], ascending=[True, False])
      .drop_duplicates('user_id', keep='first')
      .reset_index(drop=True)
  )
  with_demographics = vectors.merge(df_demographics, on='user_id', how='left')
  return drop_unnamed(with_demographics)



def _print_rankings(df_rel_ranking, df_date_ranking, df_rand_ranking):
  """Print the three rankings with all of their columns visible."""
  print("Ranking by Yelp filter:")
  pd.set_option('display.max_columns', None)
  print(df_rel_ranking)
  print('\n')
  print("Ranking by Date:")
  print(df_date_ranking)
  print('\n')
  print("Ranking Random:")
  print(df_rand_ranking)
  print('\n')


def pipeline1(business_id, alluser):
  """Download the three rankings of ``business_id`` and flatten their data.

  The Drive folder id is read from the 'gfolder_rankings' column of the
  module-level ``gtree`` table.  The first three files listed in that
  folder are downloaded and read as CSV; they are taken, in listing order,
  as the date, random and relevance rankings.
  NOTE(review): this relies on the order in which Drive lists the files --
  confirm that the order is stable for these folders.

  Each ranking is passed through ``reorder_user_data`` and printed before
  and after.  ``alluser`` is not used here.

  Returns ``(df_rel_ranking, df_date_ranking, df_rand_ranking)``.
  """
  ### READ THE RANKING FROM GTREE
  # Filter on the business_id argument.  The previous code compared against
  # the name `id`, which is not a parameter of this function (it resolved to
  # the builtin or to whatever global happened to be called `id`).
  local_ranking_folder = gtree.loc[gtree['business_id'] == business_id,
                                   'gfolder_rankings'].tolist()[0]
  print(local_ranking_folder)
  local_ranking_files = drive.ListFile(
      {'q': "'" + local_ranking_folder + "' in parents and trashed=false"}).GetList()

  downloaded = []
  for slot in range(3):
    ranking_file = local_ranking_files[slot]
    drive.CreateFile({'id': ranking_file['id']}).GetContentFile(ranking_file['title'])
    print(ranking_file['id'])
    downloaded.append(pd.read_csv(ranking_file['title']))
  df_date_ranking, df_rand_ranking, df_rel_ranking = downloaded

  _print_rankings(df_rel_ranking, df_date_ranking, df_rand_ranking)

  df_rel_ranking = reorder_user_data(df_rel_ranking)
  df_date_ranking = reorder_user_data(df_date_ranking)
  df_rand_ranking = reorder_user_data(df_rand_ranking)

  print("\n++++++++++++++++ RANKING ++++++++++++++++++\n")

  _print_rankings(df_rel_ranking, df_date_ranking, df_rand_ranking)

  return df_rel_ranking, df_date_ranking, df_rand_ranking


def pipeline2(df_ranking_by_relevance, id, N_of_groups,
              local_list_of_attributes, method, alluser, destination, assumption):
  """Create the user groups of business ``id`` and compute their sizes.

  When ``alluser`` is true the user vectors are (re)built first: that call
  writes 'user_vectors_<id>.csv', the file ``create_groups_custom`` reads.

  Only ``method == 'custom'`` is implemented.  Any other value raises
  ``ValueError`` up front; previously it failed later with an unbound
  ``df_groups`` after the vectors had already been uploaded.

  Returns ``(df_groups, percents, destination)`` where ``destination`` is
  the folder id returned by ``create_groups_custom``.
  """
  if method != 'custom':
    raise ValueError("unsupported grouping method %r: only 'custom' is implemented"
                     % (method,))

  print("\n++++++++++++++++ GROUPS CREATION ++++++++++++++++++\n")

  if alluser:
    # Called for its side effect (local CSV + upload); the frame is not needed
    create_vectors_yelp(df_ranking_by_relevance, id, local_list_of_attributes,
                        method, destination)

  df_groups, destination = create_groups_custom(N_of_groups, local_list_of_attributes,
                                                id, destination, assumption)

  percents = compute_groups_percents(df_groups, N_of_groups)

  print(df_groups)
  print(percents)
  return df_groups, percents, destination


def _finish_exposure_section(tag, business_id, method, group_exposures, user_exposures,
                             percents, list_of_attributes, alluser, destination,
                             array_group_id):
  """Run the ranking-vs-random tests and draw the plots of one exposure type.

  ``tag`` is 'demgr' or 'dispimp'.  ``group_exposures`` and
  ``user_exposures`` hold the (yelp, date, random) results in that order.
  """
  yelp_users, date_users, rand_users = user_exposures
  for ranking_tag, observed in (("yelp_", yelp_users), ("date_", date_users)):
    filePath = stat_significance_inter_rankings(observed, rand_users, method,
                                                tag + "_" + ranking_tag, business_id,
                                                array_group_id, destination)
    upload_file(filePath, destination)

  # Population standard deviation only for all-user runs that involve none
  # of the demographic attributes; sample standard deviation otherwise
  if alluser and not set(list_of_attributes).intersection(set(['age', 'gender', 'ethnicity'])):
    spread = st.pstdev
  else:
    spread = st.stdev
  errors = [spread(frame['exposure']) for frame in user_exposures]

  yelp_groups, date_groups, random_groups = group_exposures
  get_plots(yelp_groups, date_groups, random_groups, percents,
            'plot_' + tag + '_' + method + '_' + business_id, list_of_attributes,
            method, business_id, alluser, errors, destination)
  get_scatter_plots(yelp_users, date_users, rand_users,
                    'scatter_' + tag + '_' + method + '_' + business_id, list_of_attributes,
                    method, business_id, alluser, destination)


def pipeline3(business_id, method, df_ranking_by_relevance, df_ranking_by_date,
              df_ranking_by_random, percents, list_of_attributes,
              alluser, destination, df_groups):
  """Compute, test and plot the exposures of the three rankings.

  An empty 'result.csv' is uploaded to ``destination`` first.  Then, for
  the demographic-parity exposure and afterwards for the disparate-impact
  exposure, the Yelp, date and random rankings are evaluated, the Yelp and
  date rankings are compared with the random one per group, and the bar
  and scatter plots are produced.  Nothing is returned.
  """
  # Start from an empty result table in the destination folder
  pd.DataFrame(columns=['Exposure_method', 'Context', 'Means', 'P-value']).to_csv('result.csv')
  upload_file('result.csv', destination)

  rankings = (
      ("------------Ranking by Yelp filter:------------\n", df_ranking_by_relevance, "yelp_"),
      ("------------Ranking by Date:------------\n", df_ranking_by_date, "date_"),
      ("------------Ranking Random:------------\n", df_ranking_by_random, "rand_"),
  )

  print("\n++++++++++++++++ EXPOSURE CALCULATION ++++++++++++++++++\n")

  print("----------------DEMOGRAPHIC PARITY EXP------------------\n")
  group_exposures, user_exposures = [], []
  for banner, df_ranking, prefix in rankings:
    print(banner)
    per_group, per_user = print_demographic_parity_exposure(
        business_id, method, df_ranking, prefix, list_of_attributes, destination)
    group_exposures.append(per_group)
    user_exposures.append(per_user)

  array_group_id = np.arange(0, df_groups['group_id'].max() + 1)
  _finish_exposure_section('demgr', business_id, method, group_exposures, user_exposures,
                           percents, list_of_attributes, alluser, destination,
                           array_group_id)

  print("----------------DISPARATE IMPACT EXP------------------\n")
  group_exposures, user_exposures = [], []
  for banner, df_ranking, prefix in rankings:
    print(banner)
    per_group, per_user = print_disparate_impact_exposure(
        business_id, method, df_ranking, prefix, list_of_attributes, alluser, destination)
    group_exposures.append(per_group)
    user_exposures.append(per_user)

  _finish_exposure_section('dispimp', business_id, method, group_exposures, user_exposures,
                           percents, list_of_attributes, alluser, destination,
                           array_group_id)


def _label_groups(df_yes_info, ordered_subsets):
  """Return a user_id/group_id frame labelling each subset 1, 2, 3, ...

  Users of ``df_yes_info`` that belong to no subset keep a NaN group id.
  When a user id appears in several subsets, the last subset wins.
  """
  labelled = df_yes_info[['user_id']].reset_index(drop=True)
  labelled['group_id'] = np.nan
  for group_id, subset in enumerate(ordered_subsets, start=1):
    labelled.loc[labelled['user_id'].isin(subset['user_id']), 'group_id'] = group_id
  return labelled


def create_groups_custom(N_of_groups, list_of_attributes, id, destination, assumption):
  """Split the users of business ``id`` into groups by the given attributes.

  The users are read from 'user_vectors_<id>.csv'.  Supported values of
  ``list_of_attributes`` and the group ids they produce:

  * ``['gender']``: 1 feminine, 2 masculine.  With a non-empty
    ``assumption`` the grouping is delegated to ``do_assumptions``, which
    also returns the folder id used as the new ``destination``.
  * ``['review_sentiment']``: 1 positive, 2 negative, 3 neutral (prefix
    match on the 'review_sentiment' value).
  * ``['ethnicity']``: 1 white, 2 black or african american, 3 the other
    listed ethnicities.
  * ``['gender', 'ethnicity']``: 1/2 white feminine/masculine, 3/4 black
    feminine/masculine, 5/6 other feminine/masculine.

  Users whose first attribute is missing are appended afterwards and, like
  every other user left without a label, end up in group 0 (all remaining
  NaN values are filled with 0).  The result is saved as
  'groups_custom_<id>.csv' and uploaded to ``destination``.

  ``N_of_groups`` is not used here.

  Returns ``(new_df_users, destination)``.
  Raises ``ValueError`` for an unsupported ``list_of_attributes``
  (previously this failed with an unbound ``new_df_users``).
  """
  supported = (['gender'], ['review_sentiment'], ['ethnicity'], ['gender', 'ethnicity'])
  if list_of_attributes not in supported:
    raise ValueError("unsupported list_of_attributes %r; expected one of %r"
                     % (list_of_attributes, supported))

  df_vectors = pd.read_csv("user_vectors_" + id + ".csv")

  # One-hot columns for the textual attributes.
  # TODO: their names are meant to feed a description of the groups, which
  # is not implemented yet, so they are not used further.
  df_vectors, _ = generate_dummies(df_vectors, ['gender', 'ethnicity'])

  # SET ASIDE THE USERS WITH NO INFO, GROUP THE REMAINING ONES
  has_info = df_vectors[list_of_attributes[0]].notnull()
  df_no_info = df_vectors[~has_info]
  df_yes_info = df_vectors[has_info]
  print('All users =', len(df_vectors))
  print('Users with info =',len(df_yes_info.index))
  print('User without info =',len(df_no_info.index))

  other_ethnicities = ['hispanic, latino, or spanish origin', 'asian',
                       'middle eastern or north african',
                       'native hawaiian or pacific islander']

  if list_of_attributes == ['gender']:
    # GROUPS BY GENDER
    if assumption=='':
      gender = df_yes_info['gender']
      new_df_users = _label_groups(df_yes_info, [
          df_yes_info[gender == 'feminine'],
          df_yes_info[gender == 'masculine'],
      ])
    else:
      new_df_users, destination = do_assumptions(df_yes_info, df_no_info,
                                                 destination, assumption)
      # The users without info were already placed by do_assumptions
      df_no_info = df_no_info.iloc[0:0]

  elif list_of_attributes == ['review_sentiment']:
    # GROUPS BY REVIEW SENTIMENT
    df_yes_info = df_yes_info[['user_id', 'review_sentiment']]
    print(df_yes_info)
    sentiment = df_yes_info['review_sentiment']
    new_df_users = _label_groups(df_yes_info, [
        df_yes_info[sentiment.str.startswith(prefix)]
        for prefix in ('positive', 'negative', 'neutral')
    ])

  elif list_of_attributes == ['ethnicity']:
    # GROUPS BY ETHNICITY
    ethnicity = df_yes_info['ethnicity']
    new_df_users = _label_groups(df_yes_info, [
        df_yes_info[ethnicity == 'white'],
        df_yes_info[ethnicity == 'black or african american'],
        df_yes_info[ethnicity.isin(other_ethnicities)],
    ])

  else:
    # GROUPS BY ETHNICITY AND GENDER
    gender = df_yes_info['gender']
    ethnicity = df_yes_info['ethnicity']
    ethnicity_masks = [ethnicity == 'white',
                       ethnicity == 'black or african american',
                       ethnicity.isin(other_ethnicities)]
    new_df_users = _label_groups(df_yes_info, [
        df_yes_info[ethnicity_mask & (gender == gender_value)]
        for ethnicity_mask in ethnicity_masks
        for gender_value in ('feminine', 'masculine')
    ])

  if not df_no_info.empty:
    # ADD THE EXCLUDED USERS: filling NaN with 0 puts them in group 0
    new_df_users = pd.merge(new_df_users,df_no_info,how='outer')
    new_df_users = drop_unnamed(new_df_users)
    new_df_users = new_df_users.fillna(0)

  new_df_users.to_csv('groups_custom_' + id + '.csv')
  upload_file('groups_custom_' + id + '.csv',destination)
  return new_df_users, destination


def do_assumptions(df_yes, df_no_info, destination, assumption):
  """Impute the gender of the users in ``df_no_info`` and build two groups.

  ``assumption`` selects how missing 'gender' values are filled:

  * ``'all_men'``: every missing value becomes 'masculine'.
  * ``'all_women'``: every user of ``df_no_info`` becomes 'feminine'.
  * ``'50'``: the first missing users become 'masculine' until men reach
    ``total // 2``; the rest become 'feminine'.
  * ``'equal'``: users alternate masculine, feminine, masculine, ...
  * ``'proportioned'``: as ``'50'``, but the target number of men keeps the
    masculine share observed in ``df_yes``.

  A folder named after the assumption is created under ``destination``.

  Changes with respect to the previous implementation:
  - ``df_no_info`` is copied, so the caller's frame is no longer modified
    and the imputation does not rely on chained in-place ``fillna``;
  - for ``'50'``/``'proportioned'`` the number of men stops at the target
    (the old ``<=`` loop assigned one man too many) and running out of
    unknown users no longer raises ``IndexError``;
  - an unknown ``assumption`` raises ``ValueError`` instead of failing on an
    unbound name.

  Returns ``(new_df_users, id_new_folder)``; group 0 is feminine, group 1
  is masculine.
  """
  folder_names = {
      'all_men': 'All_men',
      'all_women': 'All_women',
      '50': '50and50',
      'equal': 'Equally_distributed',
      'proportioned': 'Maintaining_proportion',
  }
  if assumption not in folder_names:
    raise ValueError("unknown assumption %r; expected one of %r"
                     % (assumption, sorted(folder_names)))

  total = len(df_yes.index) + len(df_no_info.index)
  already_men = int((df_yes['gender'] == 'masculine').sum())

  df_no = df_no_info.copy()
  # An all-missing column may be float: make sure it can hold the labels
  df_no['gender'] = df_no['gender'].astype(object)

  if assumption == 'all_men':
    df_no['gender'] = df_no['gender'].fillna('masculine')
  elif assumption == 'all_women':
    df_no['gender'] = 'feminine'
  elif assumption == 'equal':
    df_no['gender'] = ['masculine' if position % 2 == 0 else 'feminine'
                       for position in range(len(df_no.index))]
  else:
    if assumption == '50':
      men_size = total // 2
    else:
      # male_size : partial = new_male_size : total
      partial = len(df_yes.index)
      men_size = (total * already_men) // partial
    men_in_no_info = int((df_no['gender'] == 'masculine').sum())
    men_needed = max(0, men_size - already_men - men_in_no_info)
    unknown_positions = np.flatnonzero(df_no['gender'].isnull().values)
    gender_position = df_no.columns.get_loc('gender')
    df_no.iloc[unknown_positions[:men_needed], gender_position] = 'masculine'
    df_no['gender'] = df_no['gender'].fillna('feminine')

  id_new_folder = create_folder_in_drive(drive, folder_names[assumption], destination)

  df = pd.concat([df_yes, df_no])
  femmine = df[df['gender'] == 'feminine']
  maschi = df[df['gender'] == 'masculine']

  new_df_users = df[['user_id']].reset_index(drop=True)
  # np.nan instead of np.NaN: the upper-case alias was removed in NumPy 2.0
  new_df_users['group_id'] = np.nan
  new_df_users.loc[new_df_users['user_id'].isin(femmine['user_id']), 'group_id'] = 0
  new_df_users.loc[new_df_users['user_id'].isin(maschi['user_id']), 'group_id'] = 1
  return new_df_users, id_new_folder


def print_disparate_impact_exposure(business_id, method,  df_ranking,
                                    filename, list_of_attributes, alluser, destination):
  """Compute, save and upload the disparate-impact exposures of a ranking.

  Groups are read from 'groups_<method>_<business_id>.csv'.  For each group
  id from 0 up to the largest one, ``disparate_impact_exposure`` gives the
  group mean and fills the per-user table.  Both tables and the
  inter-group significance report are written locally and uploaded to
  ``destination``.  ``filename`` is the ranking prefix used in the file
  names; ``list_of_attributes`` is not used here.

  Returns ``(exposures, user_exposures)``.
  """
  df_groups = pd.read_csv('groups_' + method + '_' + business_id + '.csv')
  exposures = pd.DataFrame(columns=['group_id', 'exposure'])
  user_exposures = pd.DataFrame(columns=['user_id', 'group_id', 'exposure'])
  user_exposures['user_id'] = df_ranking['user_id']

  group_index = 0
  while group_index <= df_groups['group_id'].max():
    members = df_groups[df_groups['group_id'] == group_index]
    group_mean, user_exposures = disparate_impact_exposure(
        members, df_ranking, user_exposures, business_id, group_index, alluser)
    exposures.loc[group_index, 'group_id'] = group_index
    exposures.loc[group_index, 'exposure'] = group_mean
    group_index += 1

  file_suffix = method + '_' + filename + business_id + '.csv'

  user_csv = 'user_exp_dispimp_' + file_suffix
  user_exposures.to_csv(user_csv)
  upload_file(user_csv, destination)

  stats_path = stat_significance_inter_groups(df_groups, 'dispimp_' + filename, business_id,
                                              method, user_exposures, destination)

  group_csv = 'exp_dispimp_' + file_suffix
  exposures.to_csv(group_csv)
  upload_file(group_csv, destination)
  upload_file(stats_path, destination)
  print(exposures)
  print('\n')
  return exposures, user_exposures


def print_demographic_parity_exposure(business_id, method, df_ranking, filename,
                                      list_of_attributes, destination):
  """Compute, save and upload the demographic-parity exposures of a ranking.

  Groups are read from 'groups_<method>_<business_id>.csv'.  For each group
  id from 0 up to the largest one, ``demographic_parity_exposure`` gives
  the group value and fills the per-user table.  Both tables and the
  inter-group significance report are written locally and uploaded to
  ``destination``.  ``filename`` is the ranking prefix used in the file
  names; ``list_of_attributes`` is not used here.

  Returns ``(exposures, user_exposures)``.
  """
  df_groups = pd.read_csv('groups_' + method + '_' + business_id + '.csv')
  exposures = pd.DataFrame(columns=['group_id', 'exposure'])
  user_exposures = pd.DataFrame(columns=['user_id', 'group_id', 'exposure'])
  user_exposures['user_id'] = df_ranking['user_id']

  group_index = 0
  while group_index <= df_groups['group_id'].max():
    members = df_groups[df_groups['group_id'] == group_index]
    group_value, user_exposures = demographic_parity_exposure(
        members, df_ranking, user_exposures, group_index)
    exposures.loc[group_index, 'group_id'] = group_index
    exposures.loc[group_index, 'exposure'] = group_value
    group_index += 1

  file_suffix = method + '_' + filename + business_id + '.csv'

  user_csv = 'user_exp_demgr_' + file_suffix
  user_exposures.to_csv(user_csv)
  upload_file(user_csv, destination)

  stats_path = stat_significance_inter_groups(df_groups, 'demgr_' + filename, business_id,
                                              method, user_exposures, destination)

  group_csv = 'exp_demgr_' + file_suffix
  exposures.to_csv(group_csv)
  upload_file(group_csv, destination)
  upload_file(stats_path, destination)
  print(exposures)
  print('\n')
  return exposures, user_exposures


def stat_significance_inter_groups(df_groups, filename, business_id, method, df_user_exposures, destination):
  """Compare the exposures of every pair of groups with a two-sample KS test.

  The test is repeated three times: on all rows of ``df_user_exposures``,
  then without the first and last row, then without the first and last two
  rows.  Each result is appended to the report
  'stat_groups_<method>_<filename><business_id>.txt' (recreated on every
  call).  Pairs with a p-value of at most 0.05 are also recorded through
  ``update_result_table``, using the two parts of ``filename`` as exposure
  method and context.

  Returns the path of the report.
  """
  array_group_id = np.arange(0, df_groups['group_id'].max() + 1)
  couples = [(first, second)
             for first, second in itertools.combinations(array_group_id, 2)
             if first != second]
  print("Group couples:", couples) # [(0,1), (0,2), (1,2)]

  filePath = 'stat_groups_' + method + '_' + filename + business_id + '.txt'
  if os.path.exists(filePath):
    os.remove(filePath)

  def append_to_report(text):
    with open(filePath, 'a') as output:
      output.write(text)

  for ciclo in range(3):
    append_to_report('Without first and last ' + str(ciclo) + ' rows:\n\n')
    user_exposures = df_user_exposures.iloc[ciclo:(len(df_user_exposures.index) - ciclo)]
    print('Size:', len(user_exposures.index))
    for id1, id2 in couples:
      group1 = user_exposures[user_exposures['group_id'] == id1]['exposure']
      group2 = user_exposures[user_exposures['group_id'] == id2]['exposure']
      result = stats.ks_2samp(group1.values, group2.values)
      percent1 = 100 * (len(group1.index) / len(user_exposures.index))
      percent2 = 100 * (len(group2.index) / len(user_exposures.index))
      append_to_report('Percent GROUP ' + str(id1) + ': ' + str(percent1) +
                       '\nPercent GROUP ' + str(id2) + ': ' + str(percent2) +
                       '\nResult: ' + str(result) + '\n\n')
      if result[1] <= 0.05:
        update_result_table(filename[:-6], filename[-5:-1], (int(id1), int(id2)),
                            result[1], destination)
    append_to_report('----------------------------------------\n\n')
  return filePath

def stat_significance_inter_rankings(df_user_exposures, df_random_user_exposures, method,
                                     filename, business_id, group_ids, destination):
  """Compare each group's exposures in a ranking with the random ranking.

  For every id in ``group_ids`` a two-sample KS test is run between the
  group's exposures in ``df_user_exposures`` and in
  ``df_random_user_exposures``.  The test is repeated three times: on all
  rows, then without the first and last row, then without the first and
  last two rows.  Results are appended to the report
  'stat_rankings_<method>_<filename><business_id>.txt' (recreated on every
  call).  Groups with a p-value of at most 0.05 are also recorded through
  ``update_result_table``.

  Returns the path of the report.
  """
  filePath = 'stat_rankings_' + method + '_' + filename + business_id + '.txt'
  if os.path.exists(filePath):
    os.remove(filePath)

  def append_to_report(text):
    with open(filePath, 'a') as output:
      output.write(text)

  for ciclo in range(3):
    append_to_report('Without first and last ' + str(ciclo) + ' rows:\n\n')
    user_exposures = df_user_exposures.iloc[ciclo:(len(df_user_exposures.index) - ciclo)]
    random_user_exposures = df_random_user_exposures.iloc[
        ciclo:(len(df_random_user_exposures.index) - ciclo)]
    for group_id in group_ids:
      group1 = user_exposures[user_exposures['group_id'] == group_id]['exposure']
      group2 = random_user_exposures[random_user_exposures['group_id'] == group_id]['exposure']
      result = stats.ks_2samp(group1.values, group2.values)
      percent1 = 100 * (len(group1.index) / len(user_exposures.index))
      # Computed as before although the report only shows percent1
      percent2 = 100 * (len(group2.index) / len(random_user_exposures.index))
      print('+++++++++TEST STAT SIGNIFICANCE INTER RANKINGS+++++++++')
      print('RESULT:')
      print(result)
      append_to_report('Percent GROUP ' + str(group_id) + ': ' + str(percent1) +
                       '\nResult: ' + str(result) + '\n\n')
      if result[1] <= 0.05:
        update_result_table(filename[:-6], group_id, (filename[-5:-1], 'random'),
                            result[1], destination)
    append_to_report('----------------------------------------\n\n')
  return filePath


def update_result_table(exp_method, context, means, p_value, destination):
  """Append one row to the result.csv stored in a Google Drive folder.

  Downloads result.csv from the folder `destination`, appends a row built
  from the arguments, writes the table back to a local result.csv and
  uploads it with `upload_file`. Uses the module-level `drive` client.

  Args:
    exp_method: value for the 'Exposure_method' column.
    context: value for the 'Context' column.
    means: pair; stored in the 'Means' column as "<means[0]> - <means[1]>".
    p_value: value for the 'P-value' column.
    destination: Drive folder id that holds result.csv.

  Raises:
    FileNotFoundError: if the folder contains no file titled 'result.csv'.
  """
  file_list = drive.ListFile({'q': "'" + destination + "' in parents and trashed=false"}).GetList()
  id_result_file = None
  for file in file_list:
    if file['title'] == 'result.csv':
      # No break: when several files share the title, the last one listed is used.
      id_result_file = file['id']
  if id_result_file is None:
    raise FileNotFoundError("result.csv not found in Drive folder '" + destination + "'")
  download = drive.CreateFile({'id': id_result_file})  # handle on the remote result.csv
  download.GetContentFile('result.csv')
  df = pd.read_csv('result.csv')
  new_row = {'Exposure_method': exp_method, 'Context': context,
             'Means': str(means[0]) + ' - ' + str(means[1]), 'P-value': p_value}
  # DataFrame.append was removed in pandas 2.0; concat with a one-row frame is the replacement.
  df = pd.concat([df, pd.DataFrame([new_row])], ignore_index=True)
  # NOTE(review): drop_unnamed is applied twice, as before -- presumably one pass
  # does not remove every stray index column; confirm against drop_unnamed.
  df = drop_unnamed(df)
  df = drop_unnamed(df)
  df.to_csv('result.csv')
  upload_file('result.csv', destination)  # upload back into the same folder


def disparate_impact_exposure(df_group, ranking, user_exposures, business_id, group_index, alluser):
  """Compute the vote-weighted exposure of every user in a group.

  For each row of `df_group` the user's row in `ranking` is looked up and the
  exposure is `exp(position) * log2(useful_votes + funny_votes + cool_votes + 2)`.
  The value and `group_index` are written into the matching rows of
  `user_exposures` (modified in place).

  Args:
    df_group: DataFrame with a 'user_id' column; the users of one group.
    ranking: DataFrame with 'user_id', 'position', 'useful_votes',
      'funny_votes' and 'cool_votes' columns.
    user_exposures: DataFrame with a 'user_id' column; receives the
      'exposure' and 'group_id' values.
    business_id: not used by this function.
    group_index: value stored in 'group_id' for the users of this group.
    alluser: must be truthy; the vote counts are only read in that case.

  Returns:
    Tuple `(mean, user_exposures)`, where `mean` is the mean exposure of the group.

  Raises:
    NotImplementedError: if `alluser` is falsy. Previously this surfaced as an
      UnboundLocalError on the vote variables.
    IndexError: if a user of `df_group` has no row in `ranking`.
    statistics.StatisticsError: if `df_group` is empty.
  """
  if not alluser:
    raise NotImplementedError(
        'disparate_impact_exposure reads vote counts only when alluser is True')

  base = 2  # with base 10 the values are too low
  all_exposures = []
  for _, user in df_group.iterrows():
    user_id = user['user_id']

    # Filter the ranking once per user instead of once per column.
    ranked = ranking[ranking['user_id'] == user_id]
    useful = ranked['useful_votes'].values[0]
    funny = ranked['funny_votes'].values[0]
    cool = ranked['cool_votes'].values[0]

    # The +2 offset makes the weight log2(2) = 1 for a user with zero votes.
    counts = math.log(useful + funny + cool + 2, base)

    position = ranked['position'].values[0]
    current_exp = exp(position) * counts

    all_exposures.append(current_exp)
    target = user_exposures['user_id'] == user_id
    user_exposures.loc[target, 'exposure'] = current_exp
    user_exposures.loc[target, 'group_id'] = group_index
  print('all_exposures size:', len(all_exposures))
  mean = st.mean(all_exposures)
  return mean, user_exposures


def demographic_parity_exposure(df_group, ranking, user_exposures, group_index):
    """Compute the position-only exposure of every user in a group.

    Each user of `df_group` is looked up in `ranking`; the exposure is
    `exp(position)`. The value and `group_index` are stored in the matching
    rows of `user_exposures`, which is modified in place.

    Returns a tuple `(mean, user_exposures)` where `mean` is the mean
    exposure over the users of the group.
    """
    exposures_seen = []
    for _, member in df_group.iterrows():
        uid = member['user_id']
        rank_rows = ranking[ranking['user_id'] == uid]
        exposure_value = exp(rank_rows['position'].values[0])
        exposures_seen.append(exposure_value)
        # Same row selection serves both column updates.
        target_rows = user_exposures['user_id'] == uid
        user_exposures.loc[target_rows, 'exposure'] = exposure_value
        user_exposures.loc[target_rows, 'group_id'] = group_index
    print('all_exposures size:', len(exposures_seen))
    return st.mean(exposures_seen), user_exposures


def exp(position):
    """Return the exposure of a ranking position.

    The sentinel string 'no match' yields 0; any other position yields
    1 / ln(1 + position).
    """
    return 0 if position == 'no match' else 1 / np.log(1 + position)
  


def get_plots(yelp_exposures, date_exposures, random_exposures, percents, title,
              list_of_attributes, method, business_id, alluser, errors, destination):
  """Draw a grouped bar chart of per-group exposure and upload it as a PNG.

  One bar per group and ranking is drawn (yelp in red, date in green, random
  in blue), saved as `<title>.png` and uploaded with `upload_file`. The
  current pyplot figure is always closed afterwards, even if drawing, saving
  or uploading fails, so a failed call cannot leak bars into the next plot.

  Args:
    yelp_exposures, date_exposures, random_exposures: DataFrames with
      'group_id' and 'exposure' columns.
    percents: iterable of numbers paired in order with the x ticks and shown
      in the tick labels as "<group>:<percent>%" with one decimal.
    title: base name of the PNG file.
    list_of_attributes, method, business_id, alluser: not used by this function.
    errors: indexable with three entries, used as `yerr` for the yelp, date
      and random bars respectively (error bars are supplied by the caller).
    destination: forwarded to `upload_file`.
  """
  width = 0.20
  y_min = 0.0
  y_max = 0.5

  # Side-by-side bars: yelp shifted left, date centred, random shifted right.
  x1 = (yelp_exposures['group_id'] - width).tolist()
  x2 = date_exposures['group_id'].tolist()
  x3 = (random_exposures['group_id'] + width).tolist()

  y1 = yelp_exposures['exposure'].tolist()
  y2 = date_exposures['exposure'].tolist()
  y3 = random_exposures['exposure'].tolist()

  print(y1)

  try:
    plt.bar(x1, y1, width=width, align='center', color='red', label='yelp', yerr=errors[0], capsize=5)
    plt.bar(x2, y2, width=width, align='center', color='green', label='date', yerr=errors[1], capsize=5)
    plt.bar(x3, y3, width=width, align='center', color='blue', label='random', yerr=errors[2], capsize=5)
    plt.legend(loc="upper center")
    plt.xlabel('Group id')
    plt.ylabel('Exposure')
    # One tick per integer group id between the smallest and largest id of the date ranking.
    tick_positions = np.arange(min(x2), max(x2) + 1, 1.0)
    tick_labels = [str(int(group)) + ":" + "{:.1f}".format(percent) + "%"
                   for group, percent in zip(tick_positions, percents)]
    plt.xticks(tick_positions, tick_labels)
    plt.yticks(np.arange(y_min, y_max, 0.05))
    axes = plt.gca()
    axes.set_ylim([y_min, y_max])
    axes.yaxis.grid()

    plt.savefig(title + '.png')
    upload_file(title + '.png', destination)
  finally:
    plt.close()


def get_scatter_plots(yelp_user_exposures, date_user_exposures, random_user_exposures, title,
              list_of_attributes, method, business_id, alluser, destination):
  """Draw a scatter plot of per-user exposure by group and upload it as a PNG.

  One point per row is drawn for each ranking (yelp in red, date in green,
  random in blue), saved as `<title>.png` and uploaded with `upload_file`.
  The current pyplot figure is always closed afterwards, even if drawing,
  saving or uploading fails, so a failed call cannot leak points into the
  next plot.

  Args:
    yelp_user_exposures, date_user_exposures, random_user_exposures:
      DataFrames with 'group_id' and 'exposure' columns.
    title: base name of the PNG file.
    list_of_attributes, method, business_id, alluser: not used by this function.
    destination: forwarded to `upload_file`.
  """
  width = 0.20
  y_min = 0.0
  y_max = 1.5

  # Offset the three rankings horizontally so their points do not overlap.
  x1 = (yelp_user_exposures['group_id'] - width).tolist()
  x2 = date_user_exposures['group_id'].tolist()
  x3 = (random_user_exposures['group_id'] + width).tolist()

  y1 = yelp_user_exposures['exposure'].tolist()
  y2 = date_user_exposures['exposure'].tolist()
  y3 = random_user_exposures['exposure'].tolist()

  try:
    plt.scatter(x1, y1, s=10, color='red', label='yelp')
    plt.scatter(x2, y2, s=10, color='green', label='date')
    plt.scatter(x3, y3, s=10, color='blue', label='random')
    plt.legend(loc="upper center")
    plt.xlabel('Group id')
    plt.ylabel('Exposure')
    # One tick per integer group id between the smallest and largest id of the date ranking.
    plt.xticks(np.arange(min(x2), max(x2) + 1, 1.0))
    plt.yticks(np.arange(y_min, y_max, 0.1))
    axes = plt.gca()
    axes.set_ylim([y_min, y_max])
    axes.yaxis.grid()

    plt.savefig(title + '.png')
    upload_file(title + '.png', destination)
  finally:
    plt.close()
