# Importing the libraries

In [1]:
import pandas as pd
import numpy as np
import requests #for API
pd.set_option('display.max_columns', None)

# Importing the data
First of all we will import both data from the rollercoaster database API and the accidents CSV.

## Accidents CSV

In [2]:
accident1 = pd.read_csv('datasource/Saferparks-dataset-2017-v3.csv', sep=',', encoding='cp1252')

In [3]:
accident2 = pd.read_csv('datasource/Saferparks-dataset-legacy-v2.csv', sep=',', encoding='cp1252')

In [4]:
er = pd.read_csv('datasource/Saferparks-dataset-neiss.csv', sep=',', encoding='cp1252')

## Rollercoasters database API

In [83]:
response = requests.get('https://coasters-api.herokuapp.com/')

In [84]:
type(response)

requests.models.Response

In [85]:
response

<Response [200]>

If we read the json, we retrieve a list where each element refers to a specific rollercoaster.

In [86]:
#results retorna uma lista onde cada termo contém um dicionário representando as features de uma montanha-russa
results = response.json()
results

[{'length': 950,
  'height': 138,
  'speed': 206,
  'inversions': 0,
  'gForce': None,
  'country': 'United States',
  'year': 2005,
  'type': ['Roller Coaster', 'Steel', 'Sit Down', 'Extreme'],
  '_id': '5e8ef56a60fa824d1e2db3bf',
  'name': 'Kingda Ka',
  'park': 'Six Flags Great Adventure',
  'model': 'Accelerator Coaster',
  'createdAt': '2020-04-09T10:14:02.717Z',
  'updatedAt': '2020-04-09T10:14:02.717Z',
  '__v': 0},
 {'length': 853,
  'height': 128,
  'speed': 193,
  'inversions': 0,
  'gForce': None,
  'country': 'United States',
  'year': 2003,
  'type': ['Roller Coaster', 'Steel', 'Sit Down', 'Extreme'],
  '_id': '5e8ef56a60fa824d1e2db3c0',
  'name': 'Top Thrill Dragster',
  'park': 'Cedar Point',
  'model': 'Accelerator Coaster',
  'createdAt': '2020-04-09T10:14:02.717Z',
  'updatedAt': '2020-04-09T10:14:02.717Z',
  '__v': 0},
 {'length': 2000,
  'height': 51,
  'speed': 240,
  'inversions': 0,
  'gForce': 4.8,
  'country': 'United Arab Emirates',
  'year': 2010,
  'type': [

 This dictionary has the rollercoaster features. For example, the rollercoaster number 0, has the following features:

In [87]:
results[0].keys()

dict_keys(['length', 'height', 'speed', 'inversions', 'gForce', 'country', 'year', 'type', '_id', 'name', 'park', 'model', 'createdAt', 'updatedAt', '__v'])

And the following values for those features:

In [88]:
results[0].values()

dict_values([950, 138, 206, 0, None, 'United States', 2005, ['Roller Coaster', 'Steel', 'Sit Down', 'Extreme'], '5e8ef56a60fa824d1e2db3bf', 'Kingda Ka', 'Six Flags Great Adventure', 'Accelerator Coaster', '2020-04-09T10:14:02.717Z', '2020-04-09T10:14:02.717Z', 0])

Let's create a list to feed our dataframe, iterating over the API:

In [89]:
list_values = []

for i in range(1,len(results)):
    list_values.append(list(results[i].values()))

Creating the dataframe with the keys as column headers and the list as our data:

In [90]:
roller_api = pd.DataFrame(columns = list(results[0].keys()), data = list_values)
roller_api.head()

Unnamed: 0,length,height,speed,inversions,gForce,country,year,type,_id,name,park,model,createdAt,updatedAt,__v
0,853.0,128,193.0,0,,United States,2003.0,"[Roller Coaster, Steel, Sit Down, Extreme]",5e8ef56a60fa824d1e2db3c0,Top Thrill Dragster,Cedar Point,Accelerator Coaster,2020-04-09T10:14:02.717Z,2020-04-09T10:14:02.717Z,0.0
1,2000.0,51,240.0,0,4.8,United Arab Emirates,2010.0,"[Roller Coaster, Steel, Sit Down, Extreme]",5e8ef56a60fa824d1e2db3be,Formula Rossa-,Ferrari World Abu Dhabi,Accelerator Coaster,2020-04-09T10:14:02.716Z,2020-04-09T10:14:02.716Z,0.0
2,1243.0,49,179.0,1,,Japan,2001.0,"[Roller Coaster, Steel, Sit Down, Extreme]",5e8ef56a60fa824d1e2db3c1,Do-Dodonpa,Fuji-Q Highland,Thrust Air Coaster,2020-04-09T10:14:02.717Z,2020-04-09T10:14:02.717Z,0.0
3,880.0,111,179.0,0,,Spain,2017.0,"[Roller Coaster, Steel, Sit Down, Extreme]",5e8ef56a60fa824d1e2db3c2,Red Force,Ferrari Land,Accelerator Coaster,2020-04-09T10:14:02.717Z,2020-04-09T10:14:02.717Z,0.0
4,2478.0,97,152.0,0,,Japan,2000.0,"[Roller Coaster, Steel, Sit Down, Extreme]",5e8ef56a60fa824d1e2db3c3,Steel Dragon 2000,Nagashima Spa Land,Hyper Coaster,2020-04-09T10:14:02.717Z,2020-04-09T10:14:02.717Z,0.0


# Data cleaning
Let's check the health of our data before venturing ourselves into wild statistics:

## Rollercoasters database API

In [91]:
roller_api.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 49 entries, 0 to 48
Data columns (total 15 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   length      48 non-null     float64
 1   height      49 non-null     int64  
 2   speed       49 non-null     float64
 3   inversions  49 non-null     int64  
 4   gForce      20 non-null     float64
 5   country     49 non-null     object 
 6   year        48 non-null     float64
 7   type        49 non-null     object 
 8   _id         49 non-null     object 
 9   name        49 non-null     object 
 10  park        49 non-null     object 
 11  model       49 non-null     object 
 12  createdAt   34 non-null     object 
 13  updatedAt   34 non-null     object 
 14  __v         34 non-null     float64
dtypes: float64(5), int64(2), object(8)
memory usage: 5.9+ KB


Year: shouldn't be float, instead it should be int64.

Type: it's a list, we should unpack it into different columns.

CreatedAt, UpdatedAt and v are not useful columns.

### Nested data inside column 'type'

In [92]:
roller_api['type'][0:10]

0    [Roller Coaster, Steel, Sit Down, Extreme]
1    [Roller Coaster, Steel, Sit Down, Extreme]
2    [Roller Coaster, Steel, Sit Down, Extreme]
3    [Roller Coaster, Steel, Sit Down, Extreme]
4    [Roller Coaster, Steel, Sit Down, Extreme]
5    [Roller Coaster, Steel, Sit Down, Extreme]
6    [Roller Coaster, Steel, Sit Down, Extreme]
7    [Roller Coaster, Steel, Sit Down, Extreme]
8    [Roller Coaster, Steel, Sit Down, Extreme]
9    [Roller Coaster, Steel, Sit Down, Extreme]
Name: type, dtype: object

In [None]:
roller_api['type']

In [75]:
roller_api[['Struc', 'Material', 'Seat', 'Intensity']] = roller_api['type'].str.split(', ', expand=True)

ValueError: Columns must be same length as key

In [None]:
for i in roller_api['type']:
    row['Structure'] = i[0]
    row['Material'] = i[1]
    row['Seat'] = i[0]

In [62]:
roller_api['type'][1]

['Roller Coaster', 'Steel', 'Sit Down', 'Extreme']

### Columns with no interest
Drop them all.

In [None]:
roller_api.drop(columns = 'createdAt', 'updatedAt', '__v')