In [2]:
def get_connection_url(db, username=os.getenv('sqlUSER'), host=os.getenv('sqlHOST'), password=os.getenv('sqlPSWD')):
    """
    This function will:
    - take username, pswd, host credentials from imported env module
    - output a formatted connection_url
    """
    return f'mysql+pymysql://{username}:{password}@{host}/{db}'

In [1]:
import pandas as pd 
import env
import os

# Methods of Data Acquisition

### `read_clipboard`: 
- When you have data copied to your clipboard, you can use pandas to read it into a data frame with pd.read_clipboard. This can be useful for quickly transferring data to/from a spreadsheet.

<br>

### `read_excel`: 
- This function can be used to create a data frame based on the contents of an Excel spreadsheet.

<br>

### `read_csv`: 
- Read from a local csv, or from a the cloud (Google Sheets or AWS S3).

<br>

### `read_sql(sql_query, connection_url)`: 
- Read data using a SQL query to a database. You must have the required drivers installed, and a specially formatted url string must be provided.

    >To talk to a mysql database:
    >
    >` python -m pip install pymysql mysql-connector`
    <br>
    >The connection url string:
    >
    >` mysql+pymysql://{USER}:{PASSWORD}@{HOST}/{DATABASE_NAME}`

___
# Source: Clipboard

Navigate to Google Classroom > Classwork > Data

- Scroll down to Classification Lesson - students.csv 
- Double click to Open
- [Cmd][A]
- [Cmd][C]

Or find a table (not image) of data like: <a href = "https://www.testmasters.net/PsatAbout/Scoring-Scale">PSAT Scoring Scale</a>

In [3]:
pd.read_clipboard()

Unnamed: 0,Unnamed: 1,Unnamed: 2,Unnamed: 3,Unnamed: 4,Unnamed: 5,Converted
Total,Score,10th,Grade,11th,Grade,Converted
Total,Score,10th,Grade,11th,Grade,
1520,99+,99+,920,50,34,
1510,99+,99+,910,48,33,
1500,99+,99+,900,47,31,
...,...,...,...,...,...,...
960,58,41,360,1-,1-,
950,56,39,350,1-,1-,
940,54,38,340,1-,1-,
930,52,36,330,1-,1-,


In [6]:
pd.read_clipboard()

Unnamed: 0,unitid,chronname,city,state,level,control
0,100654,Alabama A&M University,Normal,Alabama,4-year,Public
1,100663,University of Alabama at Birmingham,Birmingham,Alabama,4-year,Public
2,100690,Amridge University,Montgomery,Alabama,4-year,Private not-for-profit
3,100706,University of Alabama at Huntsville,Huntsville,Alabama,4-year,Public
4,100724,Alabama State University,Montgomery,Alabama,4-year,Public
5,100751,University of Alabama at Tuscaloosa,Tuscaloosa,Alabama,4-year,Public
6,100760,Central Alabama Community College,Alexander City,Alabama,2-year,Public
7,100830,Auburn University at Montgomery,Montgomery,Alabama,4-year,Public
8,100858,Auburn University,Auburn University,Alabama,4-year,Public
9,100937,Birmingham-Southern College,Birmingham,Alabama,4-year,Private not-for-profit


___
# Source: A Shared Google Sheet
1. Get the shareable link url: https://docs.google.com/spreadsheets/d/BLAHBLAHBLAH/edit#gid=NUMBER

2. Turn that into a CSV export URL: 
    - Replace `/edit` with `/export`; 
    - Add `format=csv` to the beginning of the query string. 
    
        https://docs.google.com/spreadsheets/d/BLAHBLAHBLAH/export?format=csv&gid=NUMBER

3. Pass it to `pd.read_csv`, which can take a URL.

In [7]:
sheet_url = 'https://docs.google.com/spreadsheets/d/1Uhtml8KY19LILuZsrDtlsHHDC9wuDGUSe8LTEwvdI5g/edit#gid=341089357' 

csv_export_url = sheet_url.replace('/edit#gid=', '/export?format=csv&gid=')
csv_export_url

'https://docs.google.com/spreadsheets/d/1Uhtml8KY19LILuZsrDtlsHHDC9wuDGUSe8LTEwvdI5g/export?format=csv&gid=341089357'

In [8]:
df_googlesheet = pd.read_csv(csv_export_url)
df_googlesheet.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


___
# Source: CSV (Hosted or Local)

#### Hosted:

In [9]:
url = "https://gist.githubusercontent.com/ryanorsinger/bec2f59a9cef8ae7428cb70b3541354a/raw/ef64298da52e5d70f4d388f5fd48eccdb02ed3f1/ice_cream.csv"

df = pd.read_csv(url)
df.head()

Unnamed: 0,flavor,pints
0,moolenium crunch,11.05757
1,bubblegum,6.288724
2,chubby hubby,7.660815
3,bubblegum,6.644338
4,neopolitan,13.600125


___
# Source: XLS 

Click to download an <a href="https://osse.dc.gov/sites/default/files/dc/sites/osse/page_content/attachments/LEA%20ESSER%20Expenditure%20Data_03.10.2023.xlsx"> OSSE XLSX file</a>

In [13]:
df_lea = pd.read_excel("LEA.xlsx", sheet_name="LEA Allocations", header=1)
df_lea.head()

Unnamed: 0.1,Unnamed: 0,LEA,ESSER I - CARES (CARES_Act),ESSER II - CRRSA (ESSER_II),ESSER III - ARP (ESSER_III_ARP)
0,,Achievement Preparatory Academy PCS,462632.76,1819829.0,4080082.0
1,,Bridges Public Charter School,125224.21,574678.7,1288438.0
2,,Capital City Public Charter School,400661.82,1749897.0,3923294.0
3,,Capital Village PCS,41122.97,219141.8,495188.0
4,,Cedar Tree Academy Public Charter School,144198.15,727427.3,1644761.0


___
# Source: SQL
Create a dataframe from the `passengers` table in the mySQL database: `titanic_db`.

<div class="alert alert-danger" role="alert">
    <div class="row vertical-align">
        <div class="col-xs-1 text-center">
            <i class="fa fa-exclamation-triangle fa-2x"></i>
        </div>
        <div class="col-xs-11">
                <strong> Remember:</strong>
            Be sure to import <b>.gitignore</b> prior to pushing env.py
</div>

<div class="alert alert-danger" role="alert">
    <div class="row vertical-align">
        <div class="col-xs-1 text-center">
            <i class="fa fa-exclamation-triangle fa-2x"></i>
        </div>
        <div class="col-xs-11">
<strong>Database Credentials</strong>
<br>
It's a bad idea to store your database access credentials (i.e. your username and password) in plaintext in your source code. There are many different ways one could manage secrets like this, but a simple way is to store the values in a python file that is <b>not</b> included along with the rest of your source code. 
<br>
This is what we have done with the env module.            
<br>
<br>
Another option may be to create environment variables that may be called using the os library.
            </div> 


In [None]:
def get_connection_url(db, username=env.username, host=env.host, password=env.password):
    """
    This function will:
    - take username, pswd, host credentials from imported env module
    - output a formatted connection_url to access mySQL db
    """
    return f'mysql+pymysql://{username}:{password}@{host}/{db}'

In [14]:
df = pd.read_sql('select * from passengers', get_connection_url('titanic_db'))
df.head()

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.25,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.925,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.05,S,Third,,Southampton,1


#### We will create a function that we can reference later to acquire the data:

In [15]:
def new_titanic_data(SQL_query):
    """
    This function will:
    - take in a SQL_query
    - create a connect_url to mySQL
    - return a df of the given query from the titanic_db
    """
    url = get_connection_url('titanic_db')
    
    return pd.read_sql(SQL_query, url)

In [17]:
df_t = new_titanic_data('select * from passengers')

In [18]:
df_t

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.2500,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.9250,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1000,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.0500,S,Third,,Southampton,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,886,0,2,male,27.0,0,0,13.0000,S,Second,,Southampton,1
887,887,1,1,female,19.0,0,0,30.0000,S,First,B,Southampton,1
888,888,0,3,female,,1,2,23.4500,S,Third,,Southampton,0
889,889,1,1,male,26.0,0,0,30.0000,C,First,C,Cherbourg,1


#### Store this function in a file named `acquire.py`

---
___
# Caching Your Data
Because data acquisition *can take **time***, it's a common practice to write the data **locally** to a `.csv` file.

1. Do whatever you need to do to produce the dataframe that you need.
    - For example ```df = pd.read_sql('SELECT * FROM passengers', get_connection('titanic_db'))```
    - Or your dataframe could include joins, multiple data sources, etc...
    
<br>

2. Next, use ```df.to_csv("titanic.csv")``` to write that dataframe to the file.
<br>

3. In your data acquisition function:
    - First check to see if the csv file exists 
    - If it does, read from the csv file
    - Otherwise get "fresh" data from mySQL


***
Let's work through the function creation!

In [20]:
import os

In [21]:
os.getcwd()

'/Users/amandagomez/codeup/pagel/pagel-classification-exercises'

In [24]:
#save my dir location

directory = '/Users/amandagomez/codeup/pagel/pagel-classification-exercises/'

In [25]:
#test

os.path.exists(directory + "LEA.xlsx")

True

In [29]:
#still testing
filename = "titanic.csv"

In [30]:
if os.path.exists(directory + filename):
    df = pd.read_csv(filename)

In [31]:
df

Unnamed: 0,test


In [32]:
def get_titanic_data(SQL_query, directory, filename="titanic.csv"):
    """
    This function will:
    - Check local directory for csv file
        - return if exists
    - If csv doesn't exists:
        - create a df of the SQL_query
        - write df to csv
    - Output titanic df
"""
    if os.path.exists(directory + filename):
        df = pd.read_csv(filename) 
        return df
    
    else:
        df = new_titanic_data(SQL_query)
        
        #want to save to csv
        df.to_csv(filename)
        return df
    

In [33]:
SQL_query = "select * from passengers"
directory = '/Users/amandagomez/codeup/pagel/pagel-classification-exercises'

In [35]:
titanic = get_titanic_data(SQL_query, directory)

In [36]:
titanic

Unnamed: 0,passenger_id,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,deck,embark_town,alone
0,0,0,3,male,22.0,1,0,7.2500,S,Third,,Southampton,0
1,1,1,1,female,38.0,1,0,71.2833,C,First,C,Cherbourg,0
2,2,1,3,female,26.0,0,0,7.9250,S,Third,,Southampton,1
3,3,1,1,female,35.0,1,0,53.1000,S,First,C,Southampton,0
4,4,0,3,male,35.0,0,0,8.0500,S,Third,,Southampton,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...
886,886,0,2,male,27.0,0,0,13.0000,S,Second,,Southampton,1
887,887,1,1,female,19.0,0,0,30.0000,S,First,B,Southampton,1
888,888,0,3,female,,1,2,23.4500,S,Third,,Southampton,0
889,889,1,1,male,26.0,0,0,30.0000,C,First,C,Cherbourg,1
