In [2]:
### Example code to show how data can be retrieved from the AUSSDA repository
### You will learn how to navigate to the repository and download a datafile, and how to use different 
### datafiles.


# This installs the packages so Jupyter Notebook can execute the code. 
# Version info: python 3.9.7
# If you run this in a mybinder.org environment, you do not need to execute this chunk of code. 
import sys
!conda install --yes --prefix {sys.prefix} requests 
!conda install --yes --prefix {sys.prefix} jsonschema 
!conda install --yes --prefix {sys.prefix} pandas 
!conda install --yes --prefix {sys.prefix} numpy 
!conda install --yes --prefix {sys.prefix} matplotlib

!{sys.executable} -m pip install pyDataverse

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.

Collecting package metadata (current_repodata.json): ...working... done
Solving environment: ...working... done

# All requested packages already installed.



In [7]:
## Start by installing and loading the packages you need. 
# The package pyDataverse handles everything you need to connect and retrieve files from a Dataverse 
# installation.
# Check out the documentation: 
# https://pydataverse.readthedocs.io/en/latest/user/basic-usage.html#download-and-save-a-dataset-to-disk

from pyDataverse.api import NativeApi, DataAccessApi
import io
import pandas as pd

In [8]:
## Decide which data repository you want to use, and which dataset you want to download. 
## This example is based on the AUSSDA Dataverse. AUSSDA - The Austrian Social Science Data Archive is 
## a rich source for data for research purposes. Some data can also be used for other purposes under the 
## specified license conditions.
## You will learn how to download the Social Survey Austria 2016 from the Dataverse.

# Data citation: Bacher, Johann; Beham-Rabanser, Martina; Grausgruber, Alfred; Haller, Max; Höllinger, Franz; 
# Muckenhuber, Johanna; Prandner, Dimitri; Verwiebe, Roland, 2018, "Social Survey Austria 2016", 
# https://doi.org/10.11587/EHJHFJ, AUSSDA, V3
# The data files under the above citation are licensed under a Creative Commons Attribution 4.0 International 
# License.
# see source for more: https://doi.org/10.11587/EHJHFJ

# Step 1: Define and connect to repository API
repository_url = 'https://data.aussda.at/' 
native_api = NativeApi(repository_url)

# Step 2: Define and download dataset
DOI = "doi:10.11587/EHJHFJ"
dataset = native_api.get_dataset(DOI)

In [24]:
## To download the data file automatically, the following code proceeeds in several steps.
## First, you create a list of available files. Then, you download the first datafile.
## In Dataverse, there are several files saved under each DOI: datafiles (usually .tab files), and 
## documentation as pdf-files. The following code saves a list of all these files. 
## 
## The following code first goes through all files in the file list (i.e. all files in the dataset), 
## and identifies all files that are tab-delimited files. These are the datafiles you want to download. 
## Then, it picks the first of these tab-delimited files which is usually the original datafile by default. 
## Then you download this file.  


# Create a list of files that are available in the dataset
files_list = dataset.json()['data']['latestVersion']['files']
print(files_list)

# Find the first tab-file
ident = []

for file in files_list:
    filename = file["dataFile"]["filename"]
    file_id = file["dataFile"]["id"]
    if filename.endswith('.tab'):
        ident.append(file_id)

# Save the ID of the first tab file that occurs in the list
datafile_id = ident[0] 
print(datafile_id)

# Step 3: Connect to API for data access at the repository
data_access_api = DataAccessApi(repository_url)

# Step 4: Download data file using its id
response = data_access_api.get_datafile(datafile_id)
with open("data.dta", "xb") as f:
    f.write(response.content)

[{'description': 'Codebook', 'label': '10007_co_de_v1_0.pdf', 'restricted': False, 'directoryLabel': 'documentation', 'version': 2, 'datasetVersionId': 2227, 'categories': ['Codebook', 'Documentation'], 'dataFile': {'id': 181, 'persistentId': '', 'pidURL': '', 'filename': '10007_co_de_v1_0.pdf', 'contentType': 'application/pdf', 'filesize': 8871474, 'description': 'Codebook', 'storageIdentifier': 'file://1624de0b568-41d22a2eaf98', 'rootDataFileId': -1, 'md5': 'bdd1bb23ff05b38df06ffa93175639bf', 'checksum': {'type': 'MD5', 'value': 'bdd1bb23ff05b38df06ffa93175639bf'}, 'creationDate': '2018-03-22'}}, {'description': 'Core data file - STATA format - 373 Variables, 2021 Observations', 'label': '10007_da_de_v1_2-1.tab', 'restricted': False, 'directoryLabel': 'data', 'version': 2, 'datasetVersionId': 2227, 'categories': ['Data', 'STATA'], 'dataFile': {'id': 188, 'persistentId': '', 'pidURL': '', 'filename': '10007_da_de_v1_2-1.tab', 'contentType': 'text/tab-separated-values', 'filesize': 219

In [9]:
## Working with the datafile Pt. 1
## Now that you have downloaded the datafile, you can work with it: It needs to be transformed into a format 
## that can be used by the Pandas package.

# Transform the response into a Pandas data frame
data = io.StringIO(str(response.content,'utf-8'))

# The file is a tab-delimited file in the repository, i.e. the seperator between columns (variables) is a tab.
# The first line consists of the variable names.
data = pd.read_csv(data, sep="\t", index_col=0)

In [10]:
## Working with the datafile Pt. 2
## Looking at data: Check if the download worked, and  what the data looks like.

# Show sample data by displaying the first rows
data.head()

# The rows are unique observations. In your case survey data each row is an invidual respondent, because you 
# use survey data.
# The columns are variables, which are mostly questions in survey data. The variable names in the header show 
# which question is stored in which column, and the values show the answer each respondent gave to each 
# question.  

Unnamed: 0_level_0,doi,NR_2016,ISSP_Version,Gewicht_DES,Gewicht_POS,Gewicht_GES,SEX_2016,af1,af5,bf14,...,Matura,city,Matura_vater,Matura_mutter,Nationalitaet,Geburtsland,Partnerschaft,Matura_partner,Messgang,Wohnverhaeltnis
version,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1.2 (2018-03-26),doi:10.11587/EHJHFJ,1172,2,1.28212,0.80073,1.15048,1,1,-1,1,...,1,1,1,1,1,1,2,1,0,1
1.2 (2018-03-26),doi:10.11587/EHJHFJ,1857,2,0.38161,0.80682,0.20884,2,1,10,1,...,2,1,2,2,1,1,3,-99,4,0
1.2 (2018-03-26),doi:10.11587/EHJHFJ,1946,2,0.4164,0.95263,0.373,1,1,15,1,...,2,2,2,2,0,0,3,-99,5,0
1.2 (2018-03-26),doi:10.11587/EHJHFJ,1102,2,1.28212,0.92258,1.30632,2,4,1,1,...,1,1,1,1,1,1,1,1,5,2
1.2 (2018-03-26),doi:10.11587/EHJHFJ,1352,2,0.4164,0.86337,0.43237,2,1,-1,1,...,1,1,0,1,1,1,1,0,4,1


In [8]:
### Where to go from here? Pt. 2. 

# Another option is to use a different dataset that is published under an open access license (CC BY) that 
# does not require a login or API key.

# Data citation: Kittel, Bernhard; Kritzinger, Sylvia; Boomgaarden, Hajo; Prainsack, Barbara; Eberl, 
# Jakob-Moritz; Kalleitner, Fabian; Lebernegg, Noëlle S.; Partheymüller, Julia; Plescia, Carolina; 
# Schiestl, David W.; Schlogl, Lukas, 2020, "Austrian Corona Panel Project (OA edition)", 
# https://doi.org/10.11587/P5YJ0O, AUSSDA, V2
# DOI = "doi:10.11587/P5YJ0O"

# If you want to use the Corona Panel Project instead of the Social Survey, change the DOI in the code above 
# to "10.11587/P5YJ0O" in Step 2. Look at the codebook or the questionnaire again to determine which variable 
# contains information on which question. 


In [9]:
## Manual Process: Specify the id of the file you want to download, and proceed with Step 1 and Step 2

# If the automatic download described above does not work, you can specify the name of the file you want to 
# retrieve manually, and download it using the function get_dataframe_by_name. In order to find the name, 
# check the object files_list that you created after Step 2, or look at the landing page for the Social 
# Survey Austria in the repository at  https://doi.org/10.11587/EHJHFJ.
# You can also download the files manually when using the Corona Panel Project in case the automatic 
# download does not work.

# # Check the id numbers for each file.
# for file in files_list:
#     filename = file["dataFile"]["filename"]
#     file_id = file["dataFile"]["id"]
#     print("File name {} has id {}".format(filename, file_id))
# # Specify which file you want to retrieve.
# datafile_id = 188 
# # Step 1: Connect to API for data access at the repository
# data_access_api = DataAccessApi(repository_url)
# # Step 2: Retrieve data file
# response = data_access_api.get_datafile(datafile_id)