# Notebook for Downloading and Recording the Hatebase.org Database

In [None]:
#imports
import requests
import json 
import csv
import pandas as pd
import numpy as np

In [None]:
# Credentials sent to the authentication endpoint.
# The key is read from the HATEBASE_API_KEY environment variable so the secret
# never has to be saved inside the notebook. When the variable is unset the
# value falls back to an empty string, which can be replaced by hand for a
# one-off run.
import os

key = {
    "api_key": os.environ.get("HATEBASE_API_KEY", ""),
}

## Authentication Handshake 

The handshake sends your API key to the API and returns a session token, which the later requests in this notebook include.
Base URL: https://api.hatebase.org/4-4/authenticate

"4-4" represents the version number, and "authenticate" represents the endpoint.

In [None]:
# Exchange the API key for a session token.
auth_response = requests.post("https://api.hatebase.org/4-4/authenticate", data=key)
auth_payload = auth_response.json()  # decode the body once and reuse it

try:
    token = auth_payload['result']['token']
except (KeyError, TypeError) as err:
    # No token came back: report the status code and whatever the API placed
    # under 'errors' instead of failing with a bare KeyError.
    errors = auth_payload.get('errors') if isinstance(auth_payload, dict) else auth_payload
    raise RuntimeError(
        "Hatebase authentication failed (status {}): {}".format(
            auth_response.status_code, errors
        )
    ) from err

# Request parameters shared by the later calls; the token authorises them.
parameters = {
    "token": token
}

print("Status Code: ", auth_response.status_code, "Token: ", parameters["token"]) #200 means all good

### Parameters and Initializing

In [None]:
# Begin with the first page of results.
page_count = 1

# Query options added to the request parameters: output format, language
# code, and the page to fetch.
parameters.update({
    "format": "json",
    "language": "ENG",
    "page": page_count,
})

# Empty dictionaries that hold results until they can be written to files.
unambiguous = dict()
ambiguous = dict()
total = dict()

In [None]:
# Request a single page just to learn how many pages the vocabulary spans.
response = requests.post(
    "https://api.hatebase.org/4-4/get_vocabulary",
    data=parameters,
)

# Upper bound used later to limit the full download loop.
pages_number = response.json()['number_of_pages']

## Request Loop for Total Database

In [None]:
# Download every page of the vocabulary and collect the terms in `total`.
# The loop variable is the page number itself, so the requested page cannot
# drift out of step with the iteration, and starting the range at 1 makes the
# cell safe to re-run after other loops have changed `page_count`.
# int() tolerates the page count arriving as a numeric string.
for page_count in range(1, int(pages_number) + 1):
    parameters["page"] = page_count  # ask for this page
    response = requests.post("https://api.hatebase.org/4-4/get_vocabulary", data=parameters)
    page_payload = response.json()  # decode the body once per page
    print("Status Code: ", response.status_code, "Page: ", page_payload['page']) #200 means all good

    response_result = page_payload['result']  # only the results, not the version number, etc

    for entry in response_result:  # each result on the current page
        total[entry['term']] = entry  # keyed by term; a repeated term overwrites the earlier entry

#### Record Results

Record the results in a JSON file.

In [None]:
# Output locations for the full download: the raw JSON dump, its CSV
# conversion, and the transposed CSV.
t_file_json, t_file_csv, t_file_T_csv = (
    "total_results.json",
    "total_results.csv",
    "total_results_T.csv",
)

# Serialise the collected terms to text, then write the JSON file.
with open(t_file_json, 'w') as f:
    f.write(json.dumps(total))

In [None]:
# Load the JSON dump into a DataFrame and save it again as CSV.
(
    pd.read_json(t_file_json)
    .to_csv(path_or_buf=t_file_csv)
)

In [None]:
# Transpose the CSV so that each term becomes a row. The file is read without
# a header and written without header or index, so the original labels travel
# through the transpose as ordinary cells.
(
    pd.read_csv(t_file_csv, header=None)
    .transpose()
    .to_csv(t_file_T_csv, header=False, index=False)
)

In [None]:
# Load the transposed table.
temp_t = pd.read_csv(t_file_T_csv)

# Columns that are kept; everything else (date spotted, etc.) is dropped.
keep_col = [
    'term',
    'vocabulary_id',
    'hateful_meaning',
    'is_unambiguous',
    'is_unambiguous_in',
    'average_offensiveness',
    'plural_of',
    'variant_of',
    'is_about_nationality',
    'is_about_ethnicity',
    'is_about_religion',
    'is_about_gender',
    'is_about_sexual_orientation',
    'is_about_disability',
    'is_about_class',
    'number_of_sightings',
]

# Select the relevant columns and write the final CSV for the full API.
new_temp_t = temp_t.loc[:, keep_col]
new_temp_t.to_csv("total_results_T_trim.csv", index=False)

## Total Ambiguous Split

Split total_results_T_trim.csv into total_unambiguous_results.csv and total_ambiguous_results.csv.

In [None]:
# Reload the trimmed table produced for the full API.
total_results = pd.read_csv('total_results_T_trim.csv')

# Terms that should be forced to count as unambiguous.
force_arr = [
    'niger',
    'nigger',
    'nigers',
    'niggers',
]

# Row labels of the forced terms that are present in the table.
force_index_arr = total_results.loc[total_results['term'].isin(force_arr)].index.tolist()
force_index_arr

In [None]:
# Mark every forced term as unambiguous in a single label-based assignment.
total_results.loc[force_index_arr, 'is_unambiguous'] = True

In [None]:
# Partition the table on the is_unambiguous flag. The comparisons against
# True/False are explicit, so rows whose flag is neither value land in neither
# subset.
unambiguous_results = total_results.loc[total_results['is_unambiguous'].eq(True)]
ambiguous_results = total_results.loc[total_results['is_unambiguous'].eq(False)]

# Report the sizes of the full table and of both subsets.
print(
    "Total Shape: ", total_results.shape,
    "Unambiguous Shape: ", unambiguous_results.shape,
    "Ambiguous Shape: ", ambiguous_results.shape,
)

In [None]:
# Write each subset to its own CSV file, without the index column.
unambiguous_results.to_csv(path_or_buf="total_unambiguous_results.csv", index=False)
ambiguous_results.to_csv(path_or_buf="total_ambiguous_results.csv", index=False)

## Trim NoSwearing Data
Remove words that are already present in the Hatebase results.

In [None]:
# Load the NoSwearing data and the trimmed Hatebase table.
noswearing_DB = pd.read_csv('noswearing_original_data.csv')
hatebase_total_DB = pd.read_csv('total_results_T_trim.csv')

# Only the 'term' column of each table is needed for the comparison.
noswearing_list = noswearing_DB.loc[:, 'term']
hatebase_total_list = hatebase_total_DB.loc[:, 'term']

# Number of NoSwearing terms before trimming.
len(noswearing_list)

In [None]:
# Keep only the NoSwearing terms that do not already appear in the Hatebase
# table; going through sets also collapses repeated terms.
noswearing_trim_list = list(
    set(noswearing_list).difference(set(hatebase_total_list))
)

# Number of terms left after trimming.
len(noswearing_trim_list)

In [None]:
# Keep only the rows whose term survived the trim.
noswearing_final = noswearing_DB.loc[noswearing_DB['term'].isin(noswearing_trim_list)]
noswearing_final.shape

In [None]:
# Write the final NoSwearing list (to work as amplifiers), without the index column.
noswearing_final.to_csv(path_or_buf='noswearing_trim_data.csv', index=False)