# Converting data from MySQL to PostgreSQL using pandas

# Table of Contents

- [Setup](#Setup)

    - [Setup - Imports](#Setup---Imports)
    - [Setup - Database](#Setup---Database)
    - [Setup - Functions](#Setup---Functions)
    
        - [Setup - Function `column_names_to_lower_case`](#Setup---Function-column_name_to_lower_case)
        
- [Migrate data from MySQL to PostgreSQL](#Migrate-data-from-MySQL-to-PostgreSQL)
        
- [TODO](#TODO)

# Setup

- back to [Table of Contents](#Table-of-Contents)

## Setup - Imports

- back to [Table of Contents](#Table-of-Contents)

In [6]:
# imports
import datetime
import pandas
import psycopg2
import pymysql
import sqlalchemy

print( "packages imported at " + str( datetime.datetime.now() ) )

packages imported at 2016-11-30 17:29:03.870738


## Setup - Database

- back to [Table of Contents](#Table-of-Contents)

In [7]:
# Build one SQLAlchemy engine per database: pandas reads from the MySQL
#     engine and writes to the PostgreSQL engine.
# NOTE(review): what look like real credentials are hard-coded in this
#     cell - confirm whether they should be removed before sharing.

# ==> MySQL

# template settings - placeholders to be replaced for your own server.
mysql_host = "localhost"
mysql_port = "3306"
mysql_database = "homework"
mysql_charset = "utf8"
mysql_username = "<username>"
mysql_password = "<password>"

# site-specific values that override the template settings above.
mysql_username = "jonathanmorgan"
mysql_password = "today123"
mysql_host = "cuspdev.local"

# Assemble the MySQL URL from its pieces and create the engine pandas will use.
sqlalchemy_mysql_db = sqlalchemy.create_engine(
    "".join( [
        "mysql+pymysql://", mysql_username, ":", mysql_password,
        "@", mysql_host, ":", mysql_port,
        "/", mysql_database,
        "?charset=", mysql_charset
    ] )
)

# ==> PostgreSQL

# template settings - placeholders to be replaced for your own server.
pgsql_host = "localhost"
pgsql_port = "5432"
pgsql_database = "homework"
pgsql_encoding = "utf8"
pgsql_username = "<username>"
pgsql_password = "<password>"

# site-specific values that override the template settings above.
pgsql_username = "jonathanmorgan"
pgsql_password = "today123"
pgsql_host = "cuspdev.local"

# Assemble the PostgreSQL URL from its pieces and create the engine pandas will use.
sqlalchemy_pgsql_db = sqlalchemy.create_engine(
    "".join( [
        "postgresql+psycopg2://", pgsql_username, ":", pgsql_password,
        "@", pgsql_host, ":", pgsql_port,
        "/", pgsql_database,
        "?client_encoding=", pgsql_encoding
    ] )
)

print( "database connections created at " + str( datetime.datetime.now() ) )

database connections created at 2016-11-30 17:29:24.581078


## Setup - Functions

- back to [Table of Contents](#Table-of-Contents)

### Setup - Function `column_name_to_lower_case`

- back to [Table of Contents](#Table-of-Contents)

In [8]:
# Postgresql works best when all column and table names are lower case.
#     Here is a function to convert all column names in a pandas DataFrame
#     to lower case.
def column_names_to_lower_case( df_IN ):

    '''
    Accepts a pandas DataFrame.  Converts all column names to lower case.
        The DataFrame passed in is renamed in place, and that same DataFrame
        is returned.  Column labels that have no lower() method (integers,
        for example) are left unchanged.

    Parameters:
    - df_IN - pandas DataFrame whose column names should be lower-cased.

    Returns the updated DataFrame, or None (after printing an error
        message) if None was passed in.
    '''

    # return reference
    df_OUT = None

    # Make sure we have something passed in.
    if ( df_IN is not None ):

        # Create dictionary that maps original column names to that same
        #     name in all lower case.  Skip labels without a lower() method
        #     so that non-string column labels do not raise AttributeError.
        rename_map = {
            original_name : original_name.lower()
            for original_name in df_IN.columns
            if callable( getattr( original_name, "lower", None ) )
        }

        # rename columns in DataFrame - in place, so the caller's DataFrame
        #     is updated as well.
        df_IN.rename( columns = rename_map, inplace = True )

        # place DataFrame in return reference.
        df_OUT = df_IN

    else:

        # nothing passed in.  For shame.
        print( "ERROR - no DataFrame passed in.  Nothing to do." )

    #-- END check to see if DataFrame passed in --#

    return df_OUT

#-- END function column_names_to_lower_case() --#

print( "function column_names_to_lower_case() declared at " + str( datetime.datetime.now() ) )

function column_names_to_lower_case() declared at 2016-11-30 17:29:28.351296


# Migrate data from MySQL to PostgreSQL

- back to [Table of Contents](#Table-of-Contents)

In [10]:
# name of the MySQL database (schema) that holds the tables to migrate
database_name = "homework"

# names of the tables we want to migrate, in the order they will be processed
homework_table_list = [
    "machine_learning",
    "nsf_award",
    "text_analysis",
    "uc_pay_2011",
    "ugrant",
    "vendor"
]

In [12]:
# working variables, reset before the loop starts
table_df = None
table_select_string = ""

# Copy each homework table across: read it from MySQL into a DataFrame,
#     lower-case its column names, then write it out to PostgreSQL.
for table_name in homework_table_list:

    print( "==> starting migration of table " + table_name + " at " + str( datetime.datetime.now() ) )

    # SELECT everything from the database-qualified table name.
    table_select_string = "".join( [ "SELECT * FROM ", database_name, ".", table_name, ";" ] )

    # pull the table from MySQL and lower-case its column names in one step.
    table_df = column_names_to_lower_case(
        pandas.read_sql( table_select_string, con = sqlalchemy_mysql_db )
    )

    print( table_name + " column names: " + str( list( table_df.columns ) ) )

    # store the DataFrame in PostgreSQL as a table of the same name.
    table_df.to_sql( table_name, con = sqlalchemy_pgsql_db )

    print( "<== migration of table " + table_name + " completed at " + str( datetime.datetime.now() ) )

#-- END loop over tables --#

==> starting migration of table machine_learning at 2016-11-30 18:40:05.037373
machine_learning column names: ['application_id', 'cfda_code', 'year', 'activity', 'administering_ic', 'arra_funded', 'org_name', 'org_dept', 'topic_id', 'study_section', 'total_cost', 'ed_inst_type']
<== migration of table machine_learning completed at 2016-11-30 18:40:28.929735
==> starting migration of table nsf_award at 2016-11-30 18:40:28.929809
nsf_award column names: ['awardid', 'firstname', 'lastname', 'startdate', 'enddate', 'awardtitle', 'awardeffectivedate', 'awardexpirationdate', 'name', 'cityname', 'zipcode', 'phonenumber', 'streetaddress', 'countryname', 'statename', 'statecode']
<== migration of table nsf_award completed at 2016-11-30 18:40:40.296275
==> starting migration of table text_analysis at 2016-11-30 18:40:40.296351
text_analysis column names: ['application_id', 'abstract_text']
<== migration of table text_analysis completed at 2016-11-30 18:45:55.263207
==> starting migration of tabl

# TODO

- machine_learning - convert arra_funded from text to int (\x01 ==> 1, \x00 ==> 0).
- primary keys

    - ugrant - PRIMARY KEY (`award_id`,`topic_id`)
    - vendor - PRIMARY KEY (`periodstartdate`,`institutionid`,`paymentamount`,`award_id`)
    - make the "index" column (written by pandas `to_sql`) a unique integer primary key where none exists:
    
        - machine_learning
        - nsf_award
        - text_analysis
        - uc_pay_2011
        