In [21]:
import pandas as pd
import numpy as np

pd.set_option('display.max_columns', 200)

from sklearn.experimental import enable_iterative_imputer  
from sklearn.impute import IterativeImputer, SimpleImputer
from sklearn.compose import ColumnTransformer

import warnings
warnings.filterwarnings('ignore')

In [9]:
df = pd.read_csv('notebooks/coaster_cleaned.csv')

In [13]:
df.head()

Unnamed: 0,Coaster_Name,Location,Status,Manufacturer,Year_Introduced,Latitude,Longitude,Type_Main,Opening_Date,Speed_mph,Height_ft,Inversions,Gforce
0,Switchback Railway,Coney Island,Removed,LaMarcus Adna Thompson,1884,40.574,-73.978,Wood,1884-06-16,6.0,,0,2.9
1,Flip Flap Railway,Sea Lion Park,Removed,Lina Beecher,1895,40.578,-73.979,Wood,1895-01-01,,,1,12.0
2,Switchback Railway (Euclid Beach Park),"Cleveland, Ohio, United States",Closed,,1896,41.58,-81.57,Other,,,,0,
3,Loop the Loop (Coney Island),Other,Removed,Edwin Prescott,1901,40.5745,-73.978,Steel,1901-01-01,,,1,
4,Loop the Loop (Young's Pier),Other,Removed,Edwin Prescott,1901,39.3538,-74.4342,Steel,1901-01-01,,,1,


In [14]:
df = df.drop('Status', axis=1)

In [16]:
df.columns

Index(['Coaster_Name', 'Location', 'Manufacturer', 'Year_Introduced',
       'Latitude', 'Longitude', 'Type_Main', 'Opening_Date', 'Speed_mph',
       'Height_ft', 'Inversions', 'Gforce'],
      dtype='object')

In [17]:
num_cols = df.select_dtypes(include=['number']).columns
cat_cols = df.select_dtypes(include=['object', 'category']).columns

In [18]:
num_imp = IterativeImputer(random_state=42)
cat_imp = SimpleImputer(strategy='most_frequent')

In [19]:
preprocessor = ColumnTransformer([
    ('num', num_imp, num_cols),
    ('cat', cat_imp, cat_cols)
])

In [22]:
df_imputed = preprocessor.fit_transform(df)

In [23]:
df_imputed = pd.DataFrame(df_imputed, columns=num_cols.tolist() + cat_cols.tolist())

In [24]:
df = df_imputed

In [26]:
df.head()

Unnamed: 0,Year_Introduced,Latitude,Longitude,Speed_mph,Height_ft,Inversions,Gforce,Coaster_Name,Location,Manufacturer,Type_Main,Opening_Date
0,1884.0,40.574,-73.978,6.0,-39.805827,0.0,2.9,Switchback Railway,Coney Island,LaMarcus Adna Thompson,Wood,1884-06-16
1,1895.0,40.578,-73.979,153.513794,362.111776,1.0,12.0,Flip Flap Railway,Sea Lion Park,Lina Beecher,Wood,1895-01-01
2,1896.0,41.58,-81.57,36.227781,44.265966,0.0,4.122628,Switchback Railway (Euclid Beach Park),"Cleveland, Ohio, United States",Vekoma,Other,1999-01-01
3,1901.0,40.5745,-73.978,38.249314,50.755251,1.0,4.254852,Loop the Loop (Coney Island),Other,Edwin Prescott,Steel,1901-01-01
4,1901.0,39.3538,-74.4342,38.312525,50.913806,1.0,4.250597,Loop the Loop (Young's Pier),Other,Edwin Prescott,Steel,1901-01-01


In [29]:
df.isna().sum()

Year_Introduced    0
Latitude           0
Longitude          0
Speed_mph          0
Height_ft          0
Inversions         0
Gforce             0
Coaster_Name       0
Location           0
Manufacturer       0
Type_Main          0
Opening_Date       0
dtype: int64

In [30]:
df.dtypes

Year_Introduced    object
Latitude           object
Longitude          object
Speed_mph          object
Height_ft          object
Inversions         object
Gforce             object
Coaster_Name       object
Location           object
Manufacturer       object
Type_Main          object
Opening_Date       object
dtype: object

In [31]:
df['Year_Introduced'] = pd.to_numeric(df['Year_Introduced']).astype(int)
df['Latitude'] = pd.to_numeric(df['Latitude'])
df['Longitude'] = pd.to_numeric(df['Longitude'])
df['Speed_mph'] = pd.to_numeric(df['Speed_mph']).astype(int)
df['Height_ft'] = pd.to_numeric(df['Height_ft']).astype(int)
df['Inversions'] = pd.to_numeric(df['Inversions']).astype(int)
df['Gforce'] = pd.to_numeric(df['Gforce'])
df['Opening_Date'] = pd.to_datetime(df['Opening_Date'])

In [32]:
df.dtypes

Year_Introduced             int64
Latitude                  float64
Longitude                 float64
Speed_mph                   int64
Height_ft                   int64
Inversions                  int64
Gforce                    float64
Coaster_Name               object
Location                   object
Manufacturer               object
Type_Main                  object
Opening_Date       datetime64[ns]
dtype: object

In [33]:
df.columns

Index(['Year_Introduced', 'Latitude', 'Longitude', 'Speed_mph', 'Height_ft',
       'Inversions', 'Gforce', 'Coaster_Name', 'Location', 'Manufacturer',
       'Type_Main', 'Opening_Date'],
      dtype='object')

In [35]:
df = df[['Coaster_Name', 'Location', 'Manufacturer',
       'Type_Main', 'Year_Introduced', 'Latitude', 'Longitude', 'Speed_mph', 'Height_ft',
       'Inversions', 'Gforce', 'Opening_Date']]

In [36]:
df

Unnamed: 0,Coaster_Name,Location,Manufacturer,Type_Main,Year_Introduced,Latitude,Longitude,Speed_mph,Height_ft,Inversions,Gforce,Opening_Date
0,Switchback Railway,Coney Island,LaMarcus Adna Thompson,Wood,1884,40.574000,-73.978000,6,-39,0,2.900000,1884-06-16
1,Flip Flap Railway,Sea Lion Park,Lina Beecher,Wood,1895,40.578000,-73.979000,153,362,1,12.000000,1895-01-01
2,Switchback Railway (Euclid Beach Park),"Cleveland, Ohio, United States",Vekoma,Other,1896,41.580000,-81.570000,36,44,0,4.122628,1999-01-01
3,Loop the Loop (Coney Island),Other,Edwin Prescott,Steel,1901,40.574500,-73.978000,38,50,1,4.254852,1901-01-01
4,Loop the Loop (Young's Pier),Other,Edwin Prescott,Steel,1901,39.353800,-74.434200,38,50,1,4.250597,1901-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...
985,Ice Breaker (roller coaster),SeaWorld Orlando,Premier Rides,Steel,2022,28.408800,-81.463300,52,112,0,3.181643,2022-02-01
986,Leviathan (Sea World),Sea World,Martin & Vleminckx,Wood,2022,-27.957400,153.426300,49,105,0,3.119106,2022-01-01
987,Pantheon (roller coaster),Busch Gardens Williamsburg,Intamin,Steel,2022,37.233900,-76.642600,73,169,2,4.151077,2022-01-01
988,Tumbili,Kings Dominion,S&S – Sansei Technologies,Steel,2022,37.749929,-29.022480,34,64,0,2.699946,1999-01-01


In [37]:
df.to_csv('data/featured_coaster_data.csv', index=False)