In [1]:
import numpy as np
import pandas as pd
import math
from scipy import stats
from matplotlib import pyplot as plt
from itertools import combinations

# Pré-processamento do dataset

In [2]:
def preProcessamento(dataset, limite=2000):
    """Clean the raw movie table and return it as a NumPy array.

    Exact duplicate rows are removed, every row whose ``gross`` or
    ``language`` value is missing is discarded, and only the first
    ``limite`` remaining rows are kept.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Raw table; must contain the ``gross`` and ``language`` columns.
    limite : int or None, optional
        Maximum number of rows returned (default 2000, the value that was
        previously hard-coded). ``None`` keeps every surviving row.

    Returns
    -------
    numpy.ndarray
        2-D array with one row per surviving record and the columns in the
        same order as in ``dataset``.
    """
    # One vectorised dropna replaces dropping rows one by one inside
    # iterrows(): that approach copied the frame for every discarded row,
    # dropped by index *label* (removing unrelated rows when labels repeat)
    # and made math.isnan raise on non-numeric ``gross`` values.
    limpo = dataset.drop_duplicates().dropna(subset=['gross', 'language'])
    return np.array(limpo)[:limite]

# Funções de consultas

In [3]:
def Q1(dataset):
    """Return the column-11 value of the record with the largest column-8 value.

    ``dataset`` is a 2-D array indexed as ``dataset[row, column]``. When the
    maximum occurs more than once, the first such row wins (``np.argmax``).

    NOTE(review): columns 8 and 11 are presumably ``gross`` and
    ``movie_title`` of movie_metadata.csv -- confirm against the CSV header.
    """
    valores = dataset[:, 8]
    linha_maxima = np.argmax(valores)
    return dataset[linha_maxima, 11]

In [4]:
def Q2(dataset):
    """Answer Q1 separately for every distinct value found in column 19.

    The records are grouped by their column-19 value (the language, going by
    the original variable names) and ``Q1`` is evaluated on each group.

    Returns
    -------
    list
        One ``Q1`` answer per distinct column-19 value, in the sorted order
        produced by ``np.unique``.
    """
    output = []
    for idioma in np.unique(dataset[:, 19]):
        # Collect the rows of this group, then rebuild a 2-D array for Q1.
        grupo = [registro for registro in dataset if registro[19] == idioma]
        output.append(Q1(np.array(grupo)))
    return output

In [5]:
def Q3(dataset, k=3):
    """Return the ``k`` most frequent values of column 20, most frequent first.

    Column 20 holds the country, going by the original variable names.
    Ties are broken by the sorted order of the values (the order produced by
    ``np.unique``), which is what the previous repeated-argmax selection
    yielded.

    Parameters
    ----------
    dataset : numpy.ndarray
        2-D array indexed as ``dataset[row, column]``.
    k : int, optional
        Number of values to return (default 3, the previously fixed size).

    Returns
    -------
    list
        At most ``k`` values. Fewer are returned when the data contains fewer
        than ``k`` distinct values; previously that case raised ``ValueError``
        from ``np.argmax`` on an empty array.
    """
    countries, contagens = np.unique(dataset[:, 20], return_counts=True)
    # A stable sort on the negated counts gives descending frequency while
    # keeping equal counts in np.unique's sorted order.
    ordem = np.argsort(-contagens, kind='stable')[:k]
    return [countries[i] for i in ordem]

# Funções de score

In [6]:
def scoreFunctionQ1(registro, saida):
    """Score candidate output ``saida`` for Q1 with respect to one record.

    Returns the record's column-8 value when its column-11 value equals
    ``saida``; otherwise returns 0.
    """
    return registro[8] if registro[11] == saida else 0

In [7]:
def scoreFunctionQ3(registro, saida, dados=None):
    """Score candidate output ``saida`` for Q3 with respect to one record.

    The score is the number of records in ``dados`` whose column-20 value
    equals the record's own column-20 value, provided that value equals
    ``saida``; otherwise the score is 0.

    Parameters
    ----------
    registro : sequence
        A single record; only ``registro[20]`` is read.
    saida : object
        Candidate output being scored.
    dados : iterable of records, optional
        Records to count over. When omitted, the notebook-level ``dataset``
        variable is used, which is what this function always read before the
        parameter existed.

    Returns
    -------
    int
        The count described above, or 0 when ``registro[20] != saida``. A
        value that never occurs in ``dados`` scores 0 (it used to raise
        ``KeyError``).
    """
    if dados is None:
        # Backward-compatible default: fall back to the global table.
        dados = dataset
    pais = registro[20]
    # Decide first whether anything has to be counted at all; the histogram
    # used to be rebuilt on every call even when the answer was 0.
    if not (pais == saida):
        return 0
    # Float entries (missing values read as NaN) are never counted, matching
    # the previous behaviour.
    return sum(1 for reg in dados if type(reg[20]) != float and reg[20] == pais)

# Funções de sensibilidade

In [8]:
def sensQ1(dataset):
    """Return the largest Q1 score over every (candidate output, record) pair.

    Candidate outputs are the column-11 values of ``dataset``; each one is
    scored against every record with ``scoreFunctionQ1`` and the maximum of
    all those scores is returned. Raises ``ValueError`` on an empty dataset
    (``max`` of an empty sequence).
    """
    candidatos = dataset[:, 11]
    return max(
        scoreFunctionQ1(registro, candidato)
        for candidato in candidatos
        for registro in dataset
    )

In [9]:
def sensQ2(dataset):
    """Return the sensitivity used for Q2.

    Q2 applies Q1 inside each language group, and its sensitivity was
    computed by a body identical to ``sensQ1``. It now delegates to
    ``sensQ1`` so the two cannot drift apart.

    Returns the maximum Q1 score over every (candidate output, record) pair
    of ``dataset``.
    """
    return sensQ1(dataset)

In [10]:
def sensQ3(dataset):
    """Return the largest Q3 score over every (record, candidate output) pair.

    Candidate outputs are the distinct column-20 values of ``dataset``; each
    record is scored against each of them with ``scoreFunctionQ3`` and the
    maximum of all those scores is returned. Raises ``ValueError`` on an
    empty dataset (``max`` of an empty sequence).
    """
    candidatos = np.unique(dataset[:, 20])
    return max(
        scoreFunctionQ3(registro, candidato)
        for registro in dataset
        for candidato in candidatos
    )

# Funções de randomização das respostas

In [11]:
def Q1Rand(budget, sensibilidade, dataset):
    """Return a randomised answer to Q1.

    For every record, each candidate output (the column-11 value of every
    row) gets the weight ``exp(budget * score / (2 * sensibilidade))`` with
    ``score = scoreFunctionQ1(record, candidate)``. The weights are
    normalised per record, the per-record probabilities are summed over all
    records and renormalised, and one row index is drawn from the resulting
    distribution.

    Parameters
    ----------
    budget : float
        Privacy budget used in the exponent.
    sensibilidade : float
        Score sensitivity used in the exponent; must be non-zero.
    dataset : numpy.ndarray
        2-D array of records.

    Returns
    -------
    object
        Column-11 value of the sampled row.

    Raises
    ------
    ZeroDivisionError
        If ``sensibilidade`` is zero.
    """
    if sensibilidade == 0:
        raise ZeroDivisionError('sensibilidade must be non-zero')
    saidas = dataset[:, 11]
    # pontuacoes[r, o]: score of candidate o as seen by record r. Each score
    # is computed exactly once; the per-record normaliser used to be rebuilt
    # for every (candidate, record) pair, which made the routine cubic.
    pontuacoes = np.array(
        [[scoreFunctionQ1(registro, out) for out in saidas] for registro in dataset],
        dtype=float,
    )
    pesos = np.exp(budget * pontuacoes / (2 * sensibilidade))
    # Normalise per record (row), then aggregate per candidate (column).
    probabilidades = pesos / pesos.sum(axis=1, keepdims=True)
    pk = probabilidades.sum(axis=0) / probabilidades.sum()
    xk = np.arange(len(dataset))
    dist = stats.rv_discrete(name='dist', values=(xk, pk))
    reg = dist.rvs()
    return dataset[reg, 11]

In [12]:
def Q2Rand(budget, sensibilidade, dataset):
    """Return a randomised answer to Q2.

    The records are grouped by their column-19 value and ``Q1Rand`` is run
    on each group with the same ``budget`` and ``sensibilidade``.

    Returns
    -------
    list
        One randomised answer per distinct column-19 value, in the sorted
        order produced by ``np.unique``.
    """
    output = []
    for idioma in np.unique(dataset[:, 19]):
        # Collect the rows of this group, then rebuild a 2-D array for Q1Rand.
        grupo = [registro for registro in dataset if registro[19] == idioma]
        output.append(Q1Rand(budget, sensibilidade, np.array(grupo)))
    return output

In [13]:
def Q3Rand(budget, sensibilidade, dataset):
    """Return a randomised answer to Q3: up to three distinct column-20 values.

    Candidate outputs are the distinct column-20 values. For every record,
    each candidate gets the weight
    ``exp((budget / 3) * score / (2 * sensibilidade))`` with
    ``score = scoreFunctionQ3(record, candidate)``. Weights are normalised
    per record, summed over all records and renormalised; candidates are
    then drawn from that distribution, rejecting repeats, until the answer
    is complete.

    NOTE(review): the budget is divided by 3 before use, presumably to split
    it across the three draws -- confirm.

    Parameters
    ----------
    budget : float
        Total privacy budget for the query.
    sensibilidade : float
        Score sensitivity used in the exponent; must be non-zero.
    dataset : numpy.ndarray
        2-D array of records.

    Returns
    -------
    list
        Three distinct candidates, or fewer when the data holds fewer than
        three distinct values (the rejection loop used to spin forever in
        that case). Empty when there are no candidates.

    Raises
    ------
    ZeroDivisionError
        If ``sensibilidade`` is zero.
    """
    if sensibilidade == 0:
        raise ZeroDivisionError('sensibilidade must be non-zero')
    budget = budget / 3
    saidas = np.unique(dataset[:, 20])
    if len(saidas) == 0:
        return []
    # pontuacoes[r, o]: score of candidate o as seen by record r, computed
    # once instead of once per (candidate, record, candidate) triple.
    pontuacoes = np.array(
        [[scoreFunctionQ3(registro, out) for out in saidas] for registro in dataset],
        dtype=float,
    )
    pesos = np.exp(budget * pontuacoes / (2 * sensibilidade))
    # Normalise per record (row), then aggregate per candidate (column).
    probabilidades = pesos / pesos.sum(axis=1, keepdims=True)
    pk = probabilidades.sum(axis=0) / probabilidades.sum()
    xk = np.arange(len(saidas))
    dist = stats.rv_discrete(name='dist', values=(xk, pk))
    output = []
    # Never ask for more distinct values than exist.
    alvo = min(3, len(saidas))
    while len(output) < alvo:
        reg = dist.rvs()
        if saidas[reg] not in output:
            output.append(saidas[reg])
    return output

# Obter as sensibilidades das funções

In [14]:
def adquirirSensibilidades(funcSens, dataset):
    """Apply every sensitivity function in ``funcSens`` to ``dataset``.

    Returns a list with one result per function, in the order given.
    """
    return [func(dataset) for func in funcSens]

# Gerar CSV

In [15]:
def posProcessamento(budgets, respostas, sensibilidades, arquivo='result.csv'):
    """Write the randomised answers and sensitivities to a CSV file.

    One row is written per budget: the budget itself, then the answers
    obtained with that budget (``respostas[i]``), then every sensitivity.

    Parameters
    ----------
    budgets : sequence
        Privacy budgets, one per output row.
    respostas : sequence of sequences
        ``respostas[i]`` holds the three answers obtained with ``budgets[i]``.
    sensibilidades : sequence
        The three sensitivities, repeated on every row.
    arquivo : str or path-like, optional
        Destination file (default ``'result.csv'``, the previously fixed name).

    Returns
    -------
    None
    """
    # 'sens_q1' used to be written as 'sens _q1' (stray space), which was
    # inconsistent with the other two sensitivity headers.
    colunas = ['budget', 'result_q1', 'result_q2', 'result_q3',
               'sens_q1', 'sens_q2', 'sens_q3']
    linhas = []
    for i, budget in enumerate(budgets):
        linhas.append([budget, *respostas[i], *sensibilidades])
    datasetFinal = pd.DataFrame(linhas, columns=colunas)
    datasetFinal.to_csv(arquivo, index=False)

# Execução

In [16]:
# Read the raw CSV and clean it in one step; `dataset` ends up holding the
# NumPy array returned by preProcessamento (scoreFunctionQ3 reads this global).
dataset = preProcessamento(pd.read_csv('movie_metadata.csv'))

In [17]:
# Privacy budgets to evaluate.
budgets = [0.1, 1, 10]

# Parallel lists: position i holds the exact query, its sensitivity function
# and its randomised counterpart for query Q(i+1).
consultas = [Q1, Q2, Q3]
funcSens = [sensQ1, sensQ2, sensQ3]
funcRand = [Q1Rand, Q2Rand, Q3Rand]

# One sensitivity per query, computed once on the cleaned dataset.
sensibilidades = adquirirSensibilidades(funcSens, dataset)

In [None]:
# respostas[b][q]: randomised answer to query q under budgets[b].
respostas = []
for budget in budgets:
    respostas.append([
        randomizar(budget, sensibilidades[k], dataset)
        for k, randomizar in enumerate(funcRand)
    ])
# Persist budgets, answers and sensitivities to the CSV file.
posProcessamento(budgets, respostas, sensibilidades)

In [None]:
# Print, for every budget and query, the sensitivity, the exact answer and
# the randomised answer obtained in the previous cell.
for i, budget in enumerate(budgets):
    for j, consulta in enumerate(consultas):
        print('Budget:', budget)
        print('Consulta: Q'+str(j+1))
        print('Sensibilidade:', sensibilidades[j])
        print('Resposta Original:', consulta(dataset))
        print('Resposta Randomizada:', respostas[i][j],'\n')