In [139]:
# Import the dependencies
import pandas as pd
import numpy as np
import datetime

# Plotting
import matplotlib.pyplot as plt
import seaborn as sns

# ORM Stuff
from sqlalchemy import create_engine, inspect, text, func


In [140]:
# Load the raw Olympic swimming results, report (rows, columns), and preview
# the first five rows.
df = pd.read_csv(filepath_or_buffer="Olympic_Swimming_Results_1912to2020.csv")
print(df.shape)
df.head(5)

(4359, 10)


Unnamed: 0,Location,Year,Distance (in meters),Stroke,Relay?,Gender,Team,Athlete,Results,Rank
0,Tokyo,2020,100m,Backstroke,0,Men,ROC,Evgeny Rylov,51.98,1
1,Tokyo,2020,100m,Backstroke,0,Men,ROC,Kliment Kolesnikov,52.0,2
2,Tokyo,2020,100m,Backstroke,0,Men,USA,Ryan Murphy,52.19,3
3,Tokyo,2020,100m,Backstroke,0,Men,ITA,Thomas Ceccon,52.3,4
4,Tokyo,2020,100m,Backstroke,0,Men,CHN,Jiayu Xu,52.51,4


In [141]:
# Print each column's dtype and non-null count to spot missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4359 entries, 0 to 4358
Data columns (total 10 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Location              4359 non-null   object
 1   Year                  4359 non-null   int64 
 2   Distance (in meters)  4359 non-null   object
 3   Stroke                4359 non-null   object
 4   Relay?                4359 non-null   int64 
 5   Gender                4359 non-null   object
 6   Team                  4359 non-null   object
 7   Athlete               4345 non-null   object
 8   Results               4331 non-null   object
 9   Rank                  4359 non-null   int64 
dtypes: int64(3), object(7)
memory usage: 340.7+ KB


In [142]:
## Drop the unwanted 'Relay?' column, then remove rows with any missing value.
# errors="ignore" keeps this cell re-runnable: `df` is reassigned here, so on a
# second execution the column is already gone and a plain drop would raise KeyError.
df = (
    df
    .drop(columns=["Relay?"], errors="ignore")
    .dropna()  # discards rows with a missing field (Athlete / Results have nulls)
)
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4317 entries, 0 to 4358
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Location              4317 non-null   object
 1   Year                  4317 non-null   int64 
 2   Distance (in meters)  4317 non-null   object
 3   Stroke                4317 non-null   object
 4   Gender                4317 non-null   object
 5   Team                  4317 non-null   object
 6   Athlete               4317 non-null   object
 7   Results               4317 non-null   object
 8   Rank                  4317 non-null   int64 
dtypes: int64(2), object(7)
memory usage: 337.3+ KB


In [143]:
# Create the SQLAlchemy engine for the SQLite database file; the three-slash URL
# form is a path relative to the current working directory.
engine = create_engine("sqlite:///Olympic_Swimming_Results.sqlite")

In [144]:
# Random sample for SPEED (a smaller frame for the first write to the database).
# random_state pins the draw so Restart & Run All reproduces the same rows, and
# min() avoids a ValueError if fewer than 4000 rows are left after cleaning.
df2 = df.sample(n=min(4000, len(df)), random_state=42)
df2.head()

Unnamed: 0,Location,Year,Distance (in meters),Stroke,Gender,Team,Athlete,Results,Rank
3587,Rome,1960,400m,Freestyle,Men,GBR,Ian Macintosh Black,00:04:21.800000,3
3563,Rome,1960,200m,Breaststroke,Men,GER,Egon Henninger,00:02:40.100000,4
1614,Atlanta,1996,200m,Butterfly,Men,GBR,James Hickman,00:01:58.470000,4
3235,City,1968,200m,Breaststroke,Women,URS,Alla Grebennikova,00:02:47.100000,4
4048,Angeles,1932,200m,Breaststroke,Women,RSA,Jenny Maakal,Did not start,0


In [145]:
# Write the sample to SQLite.
# if_exists="replace" makes this cell idempotent: with "append", every re-run
# stacked another copy of the sample onto the existing table.
# chunksize bounds each multi-row INSERT: method="multi" without it sends all rows
# in one statement (rows x 9 bound parameters), which can exceed SQLite's variable
# limit on many builds. 100 rows x 9 columns = 900, under the 999 default of older
# SQLite versions.
df2.to_sql(
    name="Olympic_Swimming_Results",
    con=engine,
    index=False,
    if_exists="replace",
    method="multi",
    chunksize=100,
)


4000

In [146]:
# Bind an inspector to the engine so the database schema can be examined
inspector = inspect(engine)

# Every table currently present in the database
tables = inspector.get_table_names()

# For each table, print its name, a divider, then one "name type" line per column
for table in tables:
    print(table, "--------", sep="\n")
    for column in inspector.get_columns(table):
        print(column["name"], column["type"])

    print()

Olympic_Swimming_Results
--------
Location TEXT
Year BIGINT
Distance (in meters) TEXT
Stroke TEXT
Gender TEXT
Team TEXT
Athlete TEXT
Results TEXT
Rank BIGINT



In [147]:
 # Write to SQL
# Replace (rather than append to) the table so it ends up holding exactly the
# cleaned rows of `df`. With "append", these rows were stacked on top of the sample
# already written from df2 (itself drawn from df), leaving those rows duplicated,
# and every re-run added yet another full copy.
# chunksize bounds each multi-row INSERT: 100 rows x 9 columns = 900 bound
# parameters, under the 999 default variable limit of older SQLite versions.
df.to_sql(
    name="Olympic_Swimming_Results",
    con=engine,
    index=False,
    if_exists="replace",
    method="multi",
    chunksize=100,
)

4317

In [148]:
# Close the engine's pooled connections now that the database work is finished.
engine.dispose()