In [35]:
import pandas as pd
import sqlite3
import os

In [36]:
# Open a connection to the SQLite file used to hold the raw 2001 season table.
# NOTE(review): none of the cells visible here call db.close() — confirm the
# connection is closed elsewhere or add a close once it is no longer needed.
db = sqlite3.connect("Monarchs2001.db")

In [37]:
# Read the raw season summary export into a DataFrame.
df = pd.read_csv("2001 Season Summary.csv")

# Copy the DataFrame into the SQLite database, overwriting any table of the
# same name left behind by an earlier run.
# NOTE(review): this stores every CSV column, including Tagger and Reporter,
# which a later cell removes from the DataFrame only — confirm the database
# copy is meant to keep them.
df.to_sql(name="monarchs2001", con=db, if_exists="replace")

# Display the current working directory (where relative paths above resolve).
os.getcwd()

'C:\\Users\\Arink\\Desktop\\github\\CodeLou\\Monarchs\\Data'

In [38]:
# List every object recorded in the database catalogue, sorted by name,
# to confirm the table (and its automatically created index) exist.
schema = pd.read_sql(
    sql='SELECT * FROM sqlite_master ORDER by name',
    con=db,
)
schema

Unnamed: 0,type,name,tbl_name,rootpage,sql
0,index,ix_monarchs2001_index,monarchs2001,3,"CREATE INDEX ""ix_monarchs2001_index""ON ""monarc..."
1,table,monarchs2001,monarchs2001,2,"CREATE TABLE ""monarchs2001"" (\n""index"" INTEGER..."


## Checking DataType

In [39]:
# Ask SQLite for the column names and declared types of the stored table.
data_type = pd.read_sql(
    sql='SELECT * FROM PRAGMA_TABLE_INFO("monarchs2001")',
    con=db,
)
data_type

Unnamed: 0,cid,name,type,notnull,dflt_value,pk
0,0,index,INTEGER,0,,0
1,1,Tag No.,TEXT,0,,0
2,2,Tagger,TEXT,0,,0
3,3,"Tag Location, Date",TEXT,0,,0
4,4,"Report Date, Location",TEXT,0,,0
5,5,Reporter,TEXT,0,,0
6,6,Miles,REAL,0,,0


### Drop tagger and reporter identification

In [40]:
# Remove the two person-identifying columns from the working DataFrame.
# The drop is in place, so re-running this cell without reloading `df`
# raises a KeyError because the columns are already gone.
df.drop(columns=["Tagger", "Reporter"], inplace=True)
df

Unnamed: 0,Tag No.,"Tag Location, Date","Report Date, Location",Miles
0,AAD929,"Salisbury, MA 09-12-01","02-11-02 El Rosario, MICH, MX",2344.0
1,AAD844,"Salisbury, MA 09-12-01","03-08-02 Sierra Chincua, MICH, MX",2343.0
2,AIO261,"Champlain, NY 09-24-01","03-13-02 El Rosario, MICH, MX",2329.0
3,AAU820,"Plattsburgh, NY 08-29-01","02-27-02 El Rosario, MICH, MX",2315.0
4,ABG581,"Saranac Lake, NY 09-16-01","03-13-02 El Rosario, MICH, MX",2303.0
...,...,...,...,...
1957,ADE225,,"10-06-01 Lakeville, MN",
1958,ABW153,,"03-08-02 Sierra Chincua, MICH, MX",
1959,AJA875,,"02-26-02 El Rosario, MICH, MX",
1960,ADR234,,"02-26-02 El Rosario, MICH, MX",


### Rename columns for clean-up

In [41]:
# Give the two combined location/date columns short, code-friendly names.
# NOTE: the first key has TWO spaces between "Tag" and "Location". The cell
# output shows the column was renamed, so this presumably matches the CSV
# header exactly — do not normalise the spacing.
df.rename(
    {
        'Tag  Location, Date': 'tag_date',
        'Report Date, Location': 'report_date',
    },
    axis="columns",
    inplace=True,
)
df


Unnamed: 0,Tag No.,tag_date,report_date,Miles
0,AAD929,"Salisbury, MA 09-12-01","02-11-02 El Rosario, MICH, MX",2344.0
1,AAD844,"Salisbury, MA 09-12-01","03-08-02 Sierra Chincua, MICH, MX",2343.0
2,AIO261,"Champlain, NY 09-24-01","03-13-02 El Rosario, MICH, MX",2329.0
3,AAU820,"Plattsburgh, NY 08-29-01","02-27-02 El Rosario, MICH, MX",2315.0
4,ABG581,"Saranac Lake, NY 09-16-01","03-13-02 El Rosario, MICH, MX",2303.0
...,...,...,...,...
1957,ADE225,,"10-06-01 Lakeville, MN",
1958,ABW153,,"03-08-02 Sierra Chincua, MICH, MX",
1959,AJA875,,"02-26-02 El Rosario, MICH, MX",
1960,ADR234,,"02-26-02 El Rosario, MICH, MX",


### Split & rename combined columns in Tagged category

In [42]:
# tag_date values look like "City, ST MM-DD-YY".
# First cut at the comma: city on the left, "ST MM-DD-YY" on the right.
df2 = df['tag_date'].str.split(', ', expand=True)
df2.columns = ['tag_city', 'state_date']

# Then cut the right-hand part at the space to separate state from date.
df3 = df2['state_date'].str.split(' ', expand=True)
df3.columns = ['Tag State', 'Tag Date']
df3


Unnamed: 0,Tag State,Tag Date
0,MA,09-12-01
1,MA,09-12-01
2,NY,09-24-01
3,NY,08-29-01
4,NY,09-16-01
...,...,...
1957,,
1958,,
1959,,
1960,,


### Split & Rename combined columns in Reported category

In [43]:
# report_date values look like "MM-DD-YY City, State[, Country]".
# Cut at each comma: "MM-DD-YY City" / state / country.
df4 = df['report_date'].str.split(', ', expand=True)

# Pull only the date out of the first part.
# The pattern is a raw string: '\d' inside a normal string literal is an
# invalid escape sequence (DeprecationWarning, SyntaxWarning on Python 3.12+).
df5 = df4[0].str.extract(r'(\d\d-\d\d-\d\d)', expand=True)

# The city is deliberately left fused with the date in 'city_date'; that
# column is dropped later, so the city is not separated out here.
df4.columns = ['city_date', 'Report State', 'Report Country']

# Expand the 'MICH' abbreviation to the full state name.
df4.loc[df4['Report State'] == 'MICH', 'Report State'] = 'Michoacán'
df5.columns = ['Report Date']

### Create new table with unneeded columns dropped

In [44]:
# Put the original frame and all the split-out pieces side by side, then
# discard the intermediate helper columns that are no longer needed.
df7 = pd.concat([df, df2, df3, df4, df5], axis="columns").drop(
    columns=['state_date', 'tag_date', 'city_date', 'tag_city']
)

# Keep only the final columns, in presentation order.
df8 = df7.loc[
    :,
    ['Tag No.', 'Tag State', 'Tag Date', 'Report State', 'Report Country', 'Report Date', 'Miles'],
]

# Work on an independent copy so later assignments do not touch df7/df8.
new_table = df8.copy()

### Rename values in Report Country column

In [45]:
# Spell out the 'MX' country code.
new_table.loc[new_table['Report Country'].eq('MX'), 'Report Country'] = 'MEXICO'

# Rows whose report string had no third comma-separated part have no country.
# NOTE(review): every missing country is labelled 'USA' here, including any
# row whose report location was itself missing — confirm that is intended.
new_table.loc[new_table['Report Country'].isna(), 'Report Country'] = 'USA'
new_table

Unnamed: 0,Tag No.,Tag State,Tag Date,Report State,Report Country,Report Date,Miles
0,AAD929,MA,09-12-01,Michoacán,MEXICO,02-11-02,2344.0
1,AAD844,MA,09-12-01,Michoacán,MEXICO,03-08-02,2343.0
2,AIO261,NY,09-24-01,Michoacán,MEXICO,03-13-02,2329.0
3,AAU820,NY,08-29-01,Michoacán,MEXICO,02-27-02,2315.0
4,ABG581,NY,09-16-01,Michoacán,MEXICO,03-13-02,2303.0
...,...,...,...,...,...,...,...
1957,ADE225,,,MN,USA,10-06-01,
1958,ABW153,,,Michoacán,MEXICO,03-08-02,
1959,AJA875,,,Michoacán,MEXICO,02-26-02,
1960,ADR234,,,Michoacán,MEXICO,02-26-02,


### Final check for unwanted spaces

In [46]:
# Optional diagnostics, currently disabled. Uncomment one line at a time:
# each selects the values of one column that contain a whitespace character
# and counts how often each such value occurs.
# new_table.loc[new_table['Report State'].str.contains(r'\s', na=False), 'Report State'].value_counts()
# new_table.loc[new_table['Tag State'].str.contains(r'\s', na=False), 'Tag State'].value_counts()
# new_table.loc[new_table['Tag No.'].str.contains(r'\s', na=False), 'Tag No.'].value_counts()
# new_table.loc[new_table['Report Country'].str.contains(r'\s', na=False), 'Report Country'].value_counts()

In [47]:
# Both date columns hold two-digit month-day-year strings (e.g. "08-29-01").
# State the format explicitly instead of relying on pandas' format inference,
# which varies between pandas versions and can misread ambiguous values such
# as "02-11-02". Missing values become NaT; a value that does not match the
# format raises instead of being silently guessed.
new_table["Tag Date"] = pd.to_datetime(new_table["Tag Date"], format="%m-%d-%y")
new_table["Report Date"] = pd.to_datetime(new_table["Report Date"], format="%m-%d-%y")
new_table.dtypes

Tag No.                   object
Tag State                 object
Tag Date          datetime64[ns]
Report State              object
Report Country            object
Report Date       datetime64[ns]
Miles                    float64
dtype: object

In [48]:
# Release the intermediate frames; only `df` and `new_table` are kept.
del df2, df3  # pieces split out of the tag column
del df4, df5  # pieces split out of the report column
del df7, df8  # combined and column-selected frames

### Export cleaned data to a pickle file.

In [49]:
# Persist the cleaned table as a pickle so the parsed datetime dtypes survive
# the round trip. Only load this file back from a trusted location.
new_table.to_pickle(path="Monarchs2001.pkl")