## Imports and Reading Data

In [130]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
# Apply matplotlib's built-in 'ggplot' style sheet to all figures drawn below.
plt.style.use('ggplot')

In [131]:
# Load the coaster dataset; the path is relative to the current working directory.
df = pd.read_csv('./coaster_db.csv')
# (rows, columns) of the raw frame.
df.shape


In [None]:
# Preview the first 8 rows of the raw data.
df.head(8)

In [133]:
# Let pandas show up to 200 columns so wide frames are not truncated in the output.
pd.set_option('display.max_columns', 200)

In [None]:
# First five rows (head() defaults to 5).
df.head()

In [None]:
# List every column name in the frame.
df.columns

In [None]:
# Inspect the dtype pandas assigned to each column.
df.dtypes

In [137]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

## Data Preparation

In [None]:
# Column names again, for reference when choosing the subset to keep.
df.columns

In [164]:
# df.drop(['Opening date'],axis=1)

In [75]:
# Narrow the frame to the columns used in this analysis; every other raw
# column is left out. .copy() gives an independent frame, so later column
# assignments do not touch (or warn about) the originally loaded data.
df = df.loc[
    :,
    [
        "coaster_name",
        "Location",
        "Status",
        "Manufacturer",
        "year_introduced",
        "latitude",
        "longitude",
        "Type_Main",
        "opening_date_clean",
        "speed_mph",
        "height_value",
        "height_unit",
        "height_ft",
        "Inversions_clean",
        "Gforce_clean",
    ],
].copy()

In [76]:
# (rows, columns) after the column selection.
df.shape

(1087, 15)

In [None]:
# Dtypes of the retained columns.
df.dtypes

In [141]:
# Parse the cleaned opening-date column into pandas datetimes.
df = df.assign(opening_date_clean=pd.to_datetime(df["opening_date_clean"]))

## Rename our Columns

In [142]:
# Give the two lower-case columns the same capitalised naming as the others.
df = df.rename(
    columns={
        "coaster_name": "Coaster_Name",
        "year_introduced": "Year_Introduced",
    }
)

In [None]:
# Confirm the renamed columns.
df.columns

In [None]:
# Count missing values in each column.
df.isna().sum()

In [None]:
# Rows that repeat an earlier row in every column.
df.loc[df.duplicated()]

In [None]:
# duplicated() keeps the first occurrence by default, so only the repeated
# (second and later) rows for each coaster name are shown here.
df.loc[df.duplicated(subset=['Coaster_Name'])]

In [None]:
# Show every row recorded under one specific coaster name.
df.query('Coaster_Name == "Crystal Beach Cyclone"')

In [148]:
# Keep only the first row for each (name, location, opening date) combination.
df = df.drop_duplicates(subset=["Coaster_Name", "Location", "opening_date_clean"])
df

In [None]:
# reset_index returns a new frame rather than modifying df, so assign it back;
# otherwise df keeps the gapped index left behind by removing duplicate rows.
df = df.reset_index(drop=True)
df

In [None]:
# (rows, columns) at this point in the cleaning.
df.shape

## Feature Understanding

In [None]:
# Number of rows per introduction year; value_counts() sorts by count, largest first.
df['Year_Introduced'].value_counts()

In [152]:
# Bar chart of the ten years with the most coaster introductions.
# Wrapping the chain in parentheses allows one method call per line.
ax = (
    df["Year_Introduced"]
    .value_counts()
    .head(10)
    .plot.bar(title="Top 10 Years Coasters Introduced")
)

ax.set_ylabel("Count")
ax.set_xlabel("Year Introduced")

In [153]:
# Histogram of coaster speeds using 20 bins.
ax = df['speed_mph'].plot(kind="hist", bins=20, title="Coaster Speed")
ax.set_xlabel("Speed mph")

## Feature Relationships

In [None]:

# Scatter plot of speed against height via the pandas plotting API.
df.plot.scatter(x="speed_mph", y="height_ft", title="Speed VS Height")
plt.show()

In [None]:
# The same scatter drawn with seaborn, colouring each point by introduction year.
sns.scatterplot(data=df, x="speed_mph", y="height_ft", hue="Year_Introduced")

In [156]:
# `vars` restricts both the rows and the columns of the grid to these four
# features. Passing only `x_vars` leaves the rows at seaborn's default of
# every numeric column (latitude, longitude, ...), which yields a larger,
# non-square grid than intended.
sns.pairplot(
    df,
    vars=["Year_Introduced", "speed_mph", "height_ft", "Gforce_clean"],
    hue="Type_Main",
)

In [157]:
# Pairwise correlation of the four numeric features, computed only on rows
# where all four values are present (dropna removes any row with a gap).
df_corr = (
    df.loc[:, ["Year_Introduced", "speed_mph", "height_ft", "Gforce_clean"]]
    .dropna()
    .corr()
)
df_corr

In [158]:

# Heatmap of the correlation matrix; annot=True prints each coefficient in its cell.
sns.heatmap(df_corr,annot=True)

## Asking a Question About the Data

**Q:** Which locations have the fastest roller coasters on average, counting only locations with at least 10 coasters?

In [159]:
# Quick look at the cleaned frame before aggregating.
df.head()

In [None]:
# Number of rows per location, largest first.
df['Location'].value_counts()

In [None]:
# Average speed per location, slowest first:
#   1. drop rows whose Location is "Other",
#   2. group by Location and compute the mean and count of speed_mph,
#   3. keep locations with at least 10 speed values, so a mean is not based
#      on only a handful of coasters,
#   4. sort ascending by mean speed.
(
    df.query('Location != "Other"')
    .groupby("Location")["speed_mph"]
    .agg(["mean", "count"])
    .query("count>=10")
    .sort_values("mean")
)

In [None]:
# Horizontal bars of both the mean speed and the coaster count per location,
# for locations (other than "Other") with at least 10 speed values.
(
    df.query('Location != "Other"')
    .groupby("Location")["speed_mph"]
    .agg(["mean", "count"])
    .query("count>=10")
    .sort_values("mean")
    .plot.barh()
)


In [163]:
# Horizontal bars of the mean speed only, per location (other than "Other")
# with at least 10 speed values, ordered by that mean.
ax = (
    df.query('Location != "Other"')
    .groupby("Location")["speed_mph"]
    .agg(["mean", "count"])
    .query("count>=10")
    .sort_values("mean")["mean"]
    .plot.barh(title="Avg speed by Location")
)

ax.set_xlabel("Average Coaster Speed")