In [2]:
# Dependencies
import pandas as pd
import numpy as np
from datetime import datetime
from sqlalchemy import create_engine

### Extract CSVs into DataFrames

In [3]:
# read csv file into dataframe
csv_file = "../Resources/weather_data_nyc_centralpark_2016.csv"
df_weather_data = pd.read_csv(csv_file)
df_weather_data.head()

Unnamed: 0,date,maximum temperature,minimum temperature,average temperature,precipitation,snow fall,snow depth
0,1-1-2016,42,34,38.0,0.0,0.0,0
1,2-1-2016,40,32,36.0,0.0,0.0,0
2,3-1-2016,45,35,40.0,0.0,0.0,0
3,4-1-2016,36,14,25.0,0.0,0.0,0
4,5-1-2016,29,11,20.0,0.0,0.0,0


### Transform weather DataFrame

In [4]:
# replace string 'T's in data with null values
df_weather_data["precipitation"] = df_weather_data["precipitation"].replace('T', np.nan, regex=True)
df_weather_data["snow fall"] = df_weather_data["snow fall"].replace('T', np.nan, regex=True)
df_weather_data["snow depth"] = df_weather_data["snow depth"].replace('T', np.nan, regex=True)

# define function to convert dates since dates in raw data are in 2 different formats
def try_parsing_date(text):
    for fmt in ('%m/%d/%Y', '%d-%m-%Y'):
        try:
            return datetime.strptime(text, fmt).date()
        except ValueError:
            pass
    raise ValueError('no valid date format found')

# convert dates
i = 0
for date in df_weather_data["date"]:
    df_weather_data["date"][i] = try_parsing_date(date)
    i += 1

# rename column headers
df_weather_data = df_weather_data.rename(columns={"maximum temperature": "max_temp",
                                                  "minimum temperature": "min_temp",
                                                  "average temperature": "avg_temp",
                                                  "snow fall": "snow_fall",
                                                  "snow depth": "snow_depth"})

# convert object types to floats
df_weather_data["precipitation"] = df_weather_data.precipitation.astype(float)
df_weather_data["snow_fall"] = df_weather_data.snow_fall.astype(float)
df_weather_data["snow_depth"] = df_weather_data.snow_depth.astype(float)

df_weather_data.head()

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Unnamed: 0,date,max_temp,min_temp,avg_temp,precipitation,snow_fall,snow_depth
0,2016-01-01,42,34,38.0,0.0,0.0,0.0
1,2016-01-02,40,32,36.0,0.0,0.0,0.0
2,2016-01-03,45,35,40.0,0.0,0.0,0.0
3,2016-01-04,36,14,25.0,0.0,0.0,0.0
4,2016-01-05,29,11,20.0,0.0,0.0,0.0


In [5]:
# write df to csv
df_weather_data.to_csv("../Resources/Cleaned_Weather_Data.csv", index=False)

In [6]:
# set index
df_weather_data.set_index("date", inplace=True)

### Create database connection

In [9]:
# create database connection
connection_string = "root:<insert password>@localhost/weathersales_db"
engine = create_engine(f'mysql://{connection_string}')

In [10]:
# confirm tables
engine.table_names()

['weather']

### Load DataFrame into SQL Database

In [12]:
# load df into sql database
df_weather_data.to_sql(name='weather', con=engine, if_exists='append', index=True)