In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('../data/featured_coaster_data.csv')

In [3]:
df

Unnamed: 0,Coaster_Name,Location,Manufacturer,Type_Main,Year_Introduced,Latitude,Longitude,Speed_mph,Height_ft,Inversions,Gforce,Opening_Date
0,Switchback Railway,Coney Island,LaMarcus Adna Thompson,Wood,1884,40.574000,-73.978000,6,-39,0,2.900000,1884-06-16
1,Flip Flap Railway,Sea Lion Park,Lina Beecher,Wood,1895,40.578000,-73.979000,153,362,1,12.000000,1895-01-01
2,Switchback Railway (Euclid Beach Park),"Cleveland, Ohio, United States",Vekoma,Other,1896,41.580000,-81.570000,36,44,0,4.122628,1999-01-01
3,Loop the Loop (Coney Island),Other,Edwin Prescott,Steel,1901,40.574500,-73.978000,38,50,1,4.254852,1901-01-01
4,Loop the Loop (Young's Pier),Other,Edwin Prescott,Steel,1901,39.353800,-74.434200,38,50,1,4.250597,1901-01-01
...,...,...,...,...,...,...,...,...,...,...,...,...
985,Ice Breaker (roller coaster),SeaWorld Orlando,Premier Rides,Steel,2022,28.408800,-81.463300,52,112,0,3.181643,2022-02-01
986,Leviathan (Sea World),Sea World,Martin & Vleminckx,Wood,2022,-27.957400,153.426300,49,105,0,3.119106,2022-01-01
987,Pantheon (roller coaster),Busch Gardens Williamsburg,Intamin,Steel,2022,37.233900,-76.642600,73,169,2,4.151077,2022-01-01
988,Tumbili,Kings Dominion,S&S – Sansei Technologies,Steel,2022,37.749929,-29.022480,34,64,0,2.699946,1999-01-01


In [4]:
df.isna().sum().sum()

np.int64(0)

In [5]:
num_features = ['Speed_mph', 'Height_ft', 'Inversions', 'Gforce', 'Year_Introduced']
cat_features = ['Location', 'Manufacturer', 'Type_Main']
target = 'Popularity'

In [6]:
df[target] = (df['Speed_mph'] * 0.4 + df['Height_ft'] * 0.3 + df['Inversions'] * 0.2 + df['Gforce'] * 0.1)

In [7]:
X = df[num_features + cat_features]
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [8]:
preprocessor = ColumnTransformer([
    ('num', StandardScaler(), num_features),
    ('cat', OneHotEncoder(drop='first', handle_unknown='ignore'), cat_features)
])

In [9]:
model = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LinearRegression())
])

In [10]:
model.fit(X_train, y_train)

In [11]:
y_pred = model.predict(X_test)

In [12]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [13]:
print(f'Mean Squared Error: {mse:.2f}')
print(f'R2 Score: {r2:.2f}')

Mean Squared Error: 0.00
R2 Score: 1.00
