Save diff dataframe to table

In [1]:
import requests
import datetime
import time
import os
import csv
import pandas as pd
# from os import environ
from sqlalchemy import MetaData, create_engine, asc
from sqlalchemy.orm import sessionmaker
from sqlalchemy.exc import SQLAlchemyError

import psycopg2
from psycopg2 import Error

from flask import Flask, jsonify, request
from flask_cors import cross_origin, CORS
from models.models import Base

from models.models import ExpensesRaw, Filenames
# from endpointClasses.resources import Resources
#
from sqlalchemy import Column, Integer, BigInteger, String, Text, DateTime, \
    Float, Boolean, func, ForeignKeyConstraint, Index, ForeignKey
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import relationship, backref
from sqlalchemy.ext.hybrid import hybrid_property, hybrid_method
from datetime import datetime, timedelta, timezone
from geoalchemy2 import Geography, Geometry


In [2]:
run_all_flag = True

In [3]:
### Setup the application
app = Flask(__name__)

# Wrap CORS around the app so that the server does not block machine to machine
# or browser based requests
CORS(app)

<flask_cors.extension.CORS at 0x21cb8a306a0>

In [4]:
# Engine below for Google Cloud PostgreSQL access.
engine = create_engine('postgresql+psycopg2://postgres:5413CrossFit2018@34'
                       '.70.40.80/transgov')

Base.metadata.create_all(engine)

Session = sessionmaker(bind=engine)
session = Session()

In [5]:
col_names = ["Ministry", "Position", "Name", "Type", "Category", "Date", "Amount",
         "Description", "Receipt 1", "Receipt 2", "Receipt 3"]

In [6]:
# download  csv file
# Only do this once per day
print('Downloading .....')
df = pd.read_csv(r'https://expenses.alberta.ca/DownloadData.aspx?type=csv'
                   r'&d=IsVE/OcdpNZJ5rBbvji3qw', names=col_names,
                     low_memory=False, parse_dates=['Date'])

# Saving the dataframe
print('Saving to CSV ...')
filename = 'expenses/' + str( time.strftime('%Y%m%d') ) + ".csv"
df.to_csv(filename)
print('done...')

Downloading .....
Saving to CSV ...
done...


In [7]:
print('Reading from file...')
# filename = "/" + filename
df=pd.read_csv(filename, names=col_names, low_memory=False, skiprows=2)
df_size = len(df.index)
print('Done...', df_size)

Reading from file...
Done... 539390


In [8]:
# compare the current download to the most previous download so that we can identify
# the changes and only process those.
# 1. Check file sizes
# 2. iterate through DF1 and lookup record in DF2
# 3.   if record exists then do nothing
# 4.   is this new record or a changed one? (hard to determine....we may have to manually see if dups become a problem)

# 4.   otherwise add record to database

print('Reading from previous file...')
# get the most recent entry in the table FileNames
prevFile = pd.read_sql(
        session.query(Filenames).statement, session.bind)

previousFile = prevFile._get_value(0,'name')

# open the file as a dataframe
prev_df = pd.read_csv(previousFile, names=col_names, low_memory=False, skiprows=2)
prev_df_size = len(prev_df.index)
print('Done...', prev_df_size)

Reading from previous file...
Done... 536436


In [9]:
# save file name to filenames table in database
filetosave = Filenames(
        name=filename,
        records = df_size,
        created_at=datetime.now()
    )
try:
    session.add(filetosave)
    session.commit()
except SQLAlchemyError as e:
    session.rollback()
    print('Could not save the new filename %s', filetosave)

In [10]:
# compare the two dataframes
# https://hackersandslackers.com/compare-rows-pandas-dataframes/
def dataframe_difference(df1, df2, which=None):
    """Find rows which are different between two DataFrames."""
    comparison_df = df1.merge(
        df2,
        indicator=True,
        how='outer'
    )
    if which is None:
        diff_df = comparison_df[comparison_df['_merge'] != 'both']
    else:
        diff_df = comparison_df[comparison_df['_merge'] == which]
    diff_df.to_csv('expenses/diff.csv')
    return diff_df

In [11]:
df_diff_df = dataframe_difference(df, prev_df)

In [12]:
df_diff_df

Unnamed: 0,Ministry,Position,Name,Type,Category,Date,Amount,Description,Receipt 1,Receipt 2,Receipt 3,_merge
44797,Economic Development and Trade,Board Member,"Danks,Jeannette",Travel,Honorarium,12/08/2017,$319.22,Hotel accommodation for Jeannette Danks attend...,,,,left_only
44798,Economic Development and Trade,NADC Board Member,"Noskey,Ken",Travel,Honorarium,12/08/2017,$319.22,Hotel accommodation for Ken Noskey attending C...,,,,left_only
44804,Economic Development and Trade,Board Member,"Schulz,Jason",Travel,Honorarium,12/08/2017,$319.22,Hotel accommodation for Jason Schulz attending...,,,,left_only
45470,Community and Social Services,Press Secretary,"Toogood,Kathryn",Travel,Ground Transportation,11/22/2017,-$70.48,,http://expenses.alberta.ca/files/2018-04/img_6...,,,left_only
45471,Community and Social Services,Press Secretary,"Toogood,Kathryn",Travel,Ground Transportation,11/30/2017,$140.96,,http://expenses.alberta.ca/files/2018-04/img_6...,,,left_only
...,...,...,...,...,...,...,...,...,...,...,...,...
540270,Infrastructure,Minister,"Panda,Prasad",Travel,Accommodations,10/12/2020,$367.34,Government Government Oct 12th - Coast Edmonto...,,,,right_only
540271,Indigenous Relations,Minister's Chief of Staff,"Bauer,Ted",Travel,Mileage,09/01/2020,$23.10,Backfoot Conf. Protocol sign - Edmonton -Wetas...,,,,right_only
540272,Indigenous Relations,Senior Advisor,"Braun,Riley",Travel,Other Costs,08/16/2020,$15.75,Calgary Stakeholder Meetings - Residence Inn-C...,http://expenses.alberta.ca/files/2020-10/exc_2...,,,right_only
540273,Indigenous Relations,Sr. Executive Advisor CEO AIOC,"Machielse,Matthew",Travel,Accommodations,10/31/2019,$223.42,Calgary Business - Accom. - Delta Hotel,http://expenses.alberta.ca/files/2020-10/exc_6...,,,right_only


In [13]:
df_diff_df._merge.unique()

['left_only', 'right_only']
Categories (2, object): ['left_only', 'right_only']

In [14]:
#Save diff Df
filename = 'expenses/' + 'diff_' + str( time.strftime('%Y%m%d') ) + ".csv"
df_diff_df.to_csv(filename)

In [16]:
# Add left_only to the table expensesraw and update the 'changed' field to true for any that are right_only
start_time = time.time()
length = len(df_diff_df)
ctr = 0

for index, row in df_diff_df.iterrows():
    
    newdate = df_diff_df._get_value(index, 'Date')
    newdate = datetime.strptime(newdate, '%m/%d/%Y')
    newAmount = df_diff_df._get_value(index, 'Amount').replace('$', '').replace(',','')
    newAmount = float(newAmount)

    if len(str(df_diff_df._get_value(index, 'Receipt 1'))) > 5:
        newReceipt1 = df_diff_df._get_value(index, 'Receipt 1')
    else:
        newReceipt1 = ''
    if len(str(df_diff_df._get_value(index, 'Receipt 2'))) > 5:
        newReceipt2 = df_diff_df._get_value(index, 'Receipt 2')
    else:
        newReceipt2 = ''
    if len(str(df_diff_df._get_value(index, 'Receipt 3'))) > 5:
        newReceipt3 = df_diff_df._get_value(index, 'Receipt 3')
    else:
        newReceipt3 = ''  
        
    if df_diff_df._get_value(index, '_merge') == 'left_only':
        changed = False
        
    if df_diff_df._get_value(index, '_merge') == 'right_only':
        changed = True  
        
    expense = ExpensesRaw(
        ministry=df_diff_df._get_value(index, 'Ministry'),
        position=df_diff_df._get_value(index, 'Position'),
        name=df_diff_df._get_value(index, 'Name'),
        type=df_diff_df._get_value(index, 'Type'),
        category=df_diff_df._get_value(index, 'Category'),
        expense_date=newdate,
        amount=newAmount,
        description=df_diff_df._get_value(index, 'Description'),
        receipt1=newReceipt1,
        receipt2=newReceipt2,
        receipt3=newReceipt3,
        changed = changed,
        date_last_found=datetime.now(),
        created_at=datetime.now(),
        updated_at=datetime.now(),
    )
    
    if ctr % 100 == 0:
        perc = "{:.3f}".format(ctr/length)
        elapsed = "{:.2f}".format(time.time() - start_time)
        print('Count: %s of %s percentage %s elapsed %s ' %(ctr, length, perc, elapsed))
    
    ctr = ctr + 1
    
    try:
        session.add(expense)
        session.commit()
    except SQLAlchemyError as e:
        session.rollback()
        print('Could not save the new expense %s',
              expense)


Count: 0 of 2980 percentage 0.000 elapsed 0.00 
Count: 100 of 2980 percentage 0.034 elapsed 19.45 
Count: 200 of 2980 percentage 0.067 elapsed 38.62 
Count: 300 of 2980 percentage 0.101 elapsed 57.85 
Count: 400 of 2980 percentage 0.134 elapsed 76.92 
Count: 500 of 2980 percentage 0.168 elapsed 97.05 
Count: 600 of 2980 percentage 0.201 elapsed 116.22 
Count: 700 of 2980 percentage 0.235 elapsed 135.46 
Count: 800 of 2980 percentage 0.268 elapsed 154.75 
Count: 900 of 2980 percentage 0.302 elapsed 173.91 
Count: 1000 of 2980 percentage 0.336 elapsed 192.93 
Count: 1100 of 2980 percentage 0.369 elapsed 211.80 
Count: 1200 of 2980 percentage 0.403 elapsed 231.07 
Count: 1300 of 2980 percentage 0.436 elapsed 250.98 
Count: 1400 of 2980 percentage 0.470 elapsed 270.61 
Count: 1500 of 2980 percentage 0.503 elapsed 290.64 
Count: 1600 of 2980 percentage 0.537 elapsed 310.58 
Count: 1700 of 2980 percentage 0.570 elapsed 330.11 
Count: 1800 of 2980 percentage 0.604 elapsed 349.73 
Count: 1900 

Following code cleans the original dataframe df

In [17]:
shift_df = df[df.Date.str.startswith('$')]
shift_df

Unnamed: 0,Ministry,Position,Name,Type,Category,Date,Amount,Description,Receipt 1,Receipt 2,Receipt 3
58783,Health,"""Sowa,Bernard, Dr.",Travel,Honorarium,02/21/2020,$27.27,Calgary Hearings - Mileage - 54KM,,,,
58784,Health,"""Sowa,Bernard, Dr.",Travel,Honorarium,02/21/2020,$12.75,Calgary Hearings - Parking - RGH,,,,
58785,Health,"""Sowa,Bernard, Dr.",Travel,Honorarium,02/28/2020,$30.30,Calgary Hearings - Mileage - 60KM,,,,
58787,Health,"""Sowa,Bernard, Dr.",Travel,Honorarium,03/06/2020,$47.47,Calgary Hearings - Mileage - 94KM,,,,
58788,Health,"""Sowa,Bernard, Dr.",Travel,Honorarium,03/06/2020,$12.75,Calgary Hearings - parking - SHC,,,,
66851,Health,"""Juhas,Michal",Travel,Honorarium,02/26/2020,$31.31,Edmonton Hearings - Mileage - 62KM,,,,
66957,Health,"""Juhas,Michal",Travel,Honorarium,03/11/2020,$31.81,Edmonton Hearings - Mileage - 63KM,,,,
77412,Justice and Solicitor General,"""Hak,Elizabeth",Travel,Honorarium,09/05/2019,$318.15,Rauf Hearing in Edmonton - Mileage - Edmonton ...,,,,
77413,Justice and Solicitor General,"""Hak,Elizabeth",Travel,Honorarium,09/05/2019,$20.75,Rauf Hearing in Edmonton - Dinner allowance,,,,
77415,Justice and Solicitor General,"""Hak,Elizabeth",Travel,Honorarium,09/26/2019,$12.75,Parking (no receipt),,,,


In [18]:
len(shift_df)

16

In [19]:
TOTAL_COLS = df.shape[1] -1
for idx, row in shift_df.iterrows():
    new_line = [df.iloc[idx, 0], None]
    for i in df.iloc[idx, 1: TOTAL_COLS]:
        new_line.append(i)
    df.loc[idx] = new_line


In [20]:
# Test
shift_df = df[df.Date.str.startswith('$')]
shift_df

Unnamed: 0,Ministry,Position,Name,Type,Category,Date,Amount,Description,Receipt 1,Receipt 2,Receipt 3


In [21]:
shift_df_2 = df[df.Date.str.startswith('H')]
shift_df_2

Unnamed: 0,Ministry,Position,Name,Type,Category,Date,Amount,Description,Receipt 1,Receipt 2,Receipt 3
58785,Health,,,"Sowa,Bernard, Dr.",Travel,Honorarium,02/28/2020,$14.25,Calgary Hearings - Parking - PLC,http://expenses.alberta.ca/files/2020-04/exc_4...,
58788,Health,,Vice-chair,"Wyrozub,Richard",Travel,Honorarium,02/19/2020,$50.50,Red Deer Hearings - Mileage - 100KM,,
66851,Health,,Public Member,"Butler,Patricia",Travel,Honorarium,02/19/2020,$85.85,Red Deer Hearings - Mileage - 170KM,,
66957,Health,,Board Member,"Kreiner,Monica",Travel,Honorarium,02/24/2020,$253.51,Grande Prairie Hearings - Mileage - 502KM,,
77413,Justice and Solicitor General,,,"Hak,Elizabeth",Travel,Honorarium,09/05/2019,$24.00,Rauf Hearing in Edmonton - Parking,http://expenses.alberta.ca/files/2020-02/exc_1...,
83123,Agriculture and Forestry,,,"Moore,Norine",Travel,Honorarium,12/13/2019,$114.44,Marketing Council Board Mtg - Accommodation 1 ...,http://expenses.alberta.ca/files/2020-02/exc_B...,
447531,Executive Council,,,"McLeod,Patricia",Travel,Honorarium,06/17/2016,$117.60,AOE council strategy meeting - Medicine Hat - ...,http://expenses.alberta.ca/files/2016-10/exc_8...,


In [22]:
len(shift_df_2)

7

In [23]:
TOTAL_COLS = df.shape[1] -1
shift_df_2
for idx, row in shift_df_2.iterrows():
    new_line = []
    for i in df.iloc[idx, : TOTAL_COLS+1]:
        new_line.append(i)
    df.loc[idx] = new_line

In [24]:
# Test
shift_df = df[df.Date.str.startswith('H')]
shift_df

Unnamed: 0,Ministry,Position,Name,Type,Category,Date,Amount,Description,Receipt 1,Receipt 2,Receipt 3


In [25]:
shift_df_3 = df[df.Date.str.startswith('O')]
shift_df_3

Unnamed: 0,Ministry,Position,Name,Type,Category,Date,Amount,Description,Receipt 1,Receipt 2,Receipt 3
77415,Agriculture and Forestry,,Exec Director Wildfire Mgt. Br,"Tapp,North Darren James",Travel,Other Costs,11/20/2019,$7.35,Ottawa Trip -,,


In [26]:
len(shift_df_3)

1

In [27]:
TOTAL_COLS = df.shape[1] -1
shift_df_3
for idx, row in shift_df_3.iterrows():
    new_line = []
    for i in df.iloc[idx, : TOTAL_COLS+1]:
        new_line.append(i)
    df.loc[idx] = new_line

In [28]:
# Test
shift_df = df[df.Date.str.startswith('O')]
shift_df

Unnamed: 0,Ministry,Position,Name,Type,Category,Date,Amount,Description,Receipt 1,Receipt 2,Receipt 3


In [29]:
shift_df_4 = df[df.Date.str.startswith('M')]
shift_df_4

Unnamed: 0,Ministry,Position,Name,Type,Category,Date,Amount,Description,Receipt 1,Receipt 2,Receipt 3
447533,Education,,Deputy Minister of Education,"Clarke,Curtis",Travel,Meals,10/11/2016,-$9.20,CMEC - Toronto - Breakfast (Adjustment),,


In [30]:
TOTAL_COLS = df.shape[1] -1
shift_df_4
for idx, row in shift_df_4.iterrows():
    new_line = []
    for i in df.iloc[idx, : TOTAL_COLS+1]:
        new_line.append(i)
    df.loc[idx] = new_line

In [31]:
# Test
shift_df = df[df.Date.str.startswith('M')]
shift_df

Unnamed: 0,Ministry,Position,Name,Type,Category,Date,Amount,Description,Receipt 1,Receipt 2,Receipt 3


Following code cleans the diff dataframe

In [None]:
shift_df = df_diff_df[df_diff_df.Date.str.startswith('$')]
shift_df

In [None]:
len(shift_df)

In [None]:
TOTAL_COLS = df_diff_df.shape[1] -1
for idx, row in shift_df.iterrows():
    new_line = [df_diff_df.iloc[idx, 0], None]
    for i in df_diff_df.iloc[idx, 1: TOTAL_COLS]:
        new_line.append(i)
    df_diff_df.loc[idx] = new_line

In [None]:
shift_df_2 = df_diff_df[df_diff_df.Date.str.startswith('H')]
shift_df_2

In [None]:
len(shift_df_2)

In [None]:
# Test
shift_df = df_diff_df[df_diff_df.Date.str.startswith('H')]
shift_df

In [None]:
TOTAL_COLS =df_diff_df.shape[1] -1
shift_df_2
for idx, row in shift_df_2.iterrows():
    new_line = []
    for i in df_diff_df.iloc[idx, : TOTAL_COLS+1]:
        new_line.append(i)
    df_diff_df.loc[idx] = new_line

In [None]:
# Test
shift_df = df_diff_df[df_diff_df.Date.str.startswith('H')]
shift_df

In [None]:
shift_df_3 = df_diff_df[df_diff_df.Date.str.startswith('O')]
shift_df_3

In [None]:
len(shift_df_3)

In [None]:
TOTAL_COLS = df_diff_df.shape[1] -1
shift_df_3
for idx, row in shift_df_3.iterrows():
    new_line = []
    for i in df_diff_df.iloc[idx, : TOTAL_COLS+1]:
        new_line.append(i)
    df_diff_df.loc[idx] = new_line

In [None]:
# Test
shift_df = df_diff_df[df_diff_df.Date.str.startswith('O')]
shift_df

In [None]:
shift_df_4 = df_diff_df[df_diff_df.Date.str.startswith('M')]
shift_df_4

In [None]:
TOTAL_COLS = df_diff_df.shape[1] -1
shift_df_4
for idx, row in shift_df_4.iterrows():
    new_line = []
    for i in df_diff_df.iloc[idx, : TOTAL_COLS+1]:
        new_line.append(i)
    df_diff_df.loc[idx] = new_line

In [None]:
# Test
shift_df = df_diff_df[df_diff_df.Date.str.startswith('M')]
shift_df

SAVE DIff dataframe here

The next cells identify around 10000 records that were not saved in the initial run.  Not sure why, probably a DB connection issue

In [None]:
if run_all_flag:
    df.query('Name == "Filevich,Patricia"')

In [None]:
if run_all_flag:
    df.loc[509620:509630]

In [None]:
# new dataframe of missing records
if run_all_flag:
    missing_records_df = df.loc[498839: 509627]

In [None]:
if run_all_flag:
    missing_records_df.head()

In [None]:
if run_all_flag:
    missing_records_df.tail()

In [None]:
if run_all_flag:
    len(missing_records_df)

Duplicates

In [32]:
# identify duplicate values
dups_df= df[df.duplicated(keep=False)] # identifies 850
# dups_df= df[df.duplicated()] # Identifies 431

In [33]:
dups_df

Unnamed: 0,Ministry,Position,Name,Type,Category,Date,Amount,Description,Receipt 1,Receipt 2,Receipt 3
3045,Justice and Solicitor General,Executive Director,"Merryweather,Lorne",Travel,Ground Transportation,12/13/2013,$3.20,ETS,,,
5094,Culture,Executive Director Arts Branch,"Anderson,Jeffrey",Travel,Mileage,10/10/2012,$1.52,Travel in Edmonton to departmental meeting,,,
5096,Culture,Executive Director Arts Branch,"Anderson,Jeffrey",Travel,Mileage,10/10/2012,$1.52,Travel in Edmonton to departmental meeting,,,
5803,Executive Council,"Exec Dir Comp, Job Eval & Ben","Smith,Debra M",Travel,Air Fare,05/02/2013,$15.00,WestJet Seat Selection,,,
5804,Executive Council,"Exec Dir Comp, Job Eval & Ben","Smith,Debra M",Travel,Air Fare,05/02/2013,$15.00,WestJet Seat Selection,,,
...,...,...,...,...,...,...,...,...,...,...,...
537988,Justice and Solicitor General,Assistant Deputy Minister,"Sanderson,Kimberley Anne",Travel,Other Costs,05/11/2020,$12.00,"""",,,
538703,"Culture, Multiculturalism and Status of Women","Executive Director, RAM","Robinson,N Chris",Travel,Other Costs,11/02/2020,$10.00,Parking for meeting,,,
538704,"Culture, Multiculturalism and Status of Women","Executive Director, RAM","Robinson,N Chris",Travel,Other Costs,11/02/2020,$10.00,Parking for meeting,,,
538868,Agriculture and Forestry,Chief of Staff,"Schultz,Tim",Travel,Mileage Allowance,11/12/2020,$151.50,Meetings & Tour in Calgary,,,


In [None]:
# FUTURE only process delta dataframe!!!


# Process the dataframe
# Note that after the very first attempt at this we need to test for whether the record already exist and then update the
# date last found

start_time = time.time()
length = len(df)
ctr = 0

for index, row in df.iterrows():
    
    newdate = df._get_value(index, 'Date')
    newdate = datetime.strptime(newdate, '%m/%d/%Y')
    newAmount = df._get_value(index, 'Amount').replace('$', '').replace(',','')
    newAmount = float(newAmount)

    if len(str(df._get_value(index, 'Receipt 1'))) > 5:
        newReceipt1 = df._get_value(index, 'Receipt 1')
    else:
        newReceipt1 = ''
    if len(str(df._get_value(index, 'Receipt 2'))) > 5:
        newReceipt2 = df._get_value(index, 'Receipt 2')
    else:
        newReceipt2 = ''
    if len(str(df._get_value(index, 'Receipt 3'))) > 5:
        newReceipt3 = df._get_value(index, 'Receipt 3')
    else:
        newReceipt3 = ''  
    
    expense = ExpensesRaw(
        ministry=df._get_value(index, 'Ministry'),
        position=df._get_value(index, 'Position'),
        name=df._get_value(index, 'Name'),
        type=df._get_value(index, 'Type'),
        category=df._get_value(index, 'Category'),
        expense_date=newdate,
        amount=newAmount,
        description=df._get_value(index, 'Description'),
        receipt1=newReceipt1,
        receipt2=newReceipt2,
        receipt3=newReceipt3,
        date_last_found=datetime.now(),
        created_at=datetime.now(),
        updated_at=datetime.now(),
    )
    
#     qry = session.query(ExpensesRaw)
#     qry = qry.filter(
#                      ExpensesRaw.ministry==df._get_value(index, 'Ministry'),
#                      ExpensesRaw.position==df._get_value(index, 'Position'),
#                      ExpensesRaw.name==df._get_value(index, 'Name'),
#                      ExpensesRaw.type==df._get_value(index, 'Type'),
#                      ExpensesRaw.category==df._get_value(index, 'Category'),
#                      ExpensesRaw.expense_date==newdate,
#                      ExpensesRaw.amount==newAmount,
#                      ExpensesRaw.description==df._get_value(index, 'Description'),
#                      ExpensesRaw.receipt1==df._get_value(index, 'Receipt 1'),
#                      ExpensesRaw.receipt2==df._get_value(index, 'Receipt 2'),
#                      ExpensesRaw.receipt3==df._get_value(index, 'Receipt 3')
#                     ).first()
#     print('Query', qry)
    
    if ctr % 100 == 0:
        perc = "{:.3f}".format(ctr/length)
        elapsed = "{:.2f}".format(time.time() - start_time)
        print('Count: %s of %s percentage %s elapsed %s ' %(ctr, length, perc, elapsed))
    
    ctr = ctr + 1
    
    try:
        session.add(expense)
        session.commit()
    except SQLAlchemyError as e:
#         error = str(e.__dict__['orig'])
        session.rollback()
        print('Could not save the new expense %s',
              expense)
#         print(error)
   

In [34]:
# for each row in dups_df find it in the expensesraw table of the db and update the isduplicated flag to true
ctr = 1
for index, row in dups_df.iterrows():
    #find the row in the database
    newdate = df._get_value(index, 'Date')
    newdate = datetime.strptime(newdate, '%m/%d/%Y')
    newAmount = df._get_value(index, 'Amount').replace('$', '').replace(',','')
    newAmount = float(newAmount)

    if len(str(df._get_value(index, 'Receipt 1'))) > 5:
        newReceipt1 = df._get_value(index, 'Receipt 1')
    else:
        newReceipt1 = ''
    if len(str(df._get_value(index, 'Receipt 2'))) > 5:
        newReceipt2 = df._get_value(index, 'Receipt 2')
    else:
        newReceipt2 = ''
    if len(str(df._get_value(index, 'Receipt 3'))) > 5:
        newReceipt3 = df._get_value(index, 'Receipt 3')
    else:
        newReceipt3 = ''
        
    print('Position : ',  df._get_value(index, 'Position'))
    newPosition = df._get_value(index, 'Position')
    
    if df.isnull()._get_value(index, 'Position'):
        print('Position is None')
        newPosition = ''
        
    print('Description : ',  df._get_value(index, 'Description'))
    newDescription = df._get_value(index, 'Description')
    
    if df.isnull()._get_value(index, 'Description'):
        print('Description is None')
        newDescription = ''    
    
    qry = session.query(ExpensesRaw)
    qry = qry.filter(
                     ExpensesRaw.ministry==df._get_value(index, 'Ministry'),
                     ExpensesRaw.position==newPosition,
                     ExpensesRaw.name==df._get_value(index, 'Name'),
                     ExpensesRaw.type==df._get_value(index, 'Type'),
                     ExpensesRaw.category==df._get_value(index, 'Category'),
                     ExpensesRaw.expense_date==newdate,
                     ExpensesRaw.amount==newAmount,
                     ExpensesRaw.description==newDescription,
                     ExpensesRaw.receipt1==newReceipt1,
                     ExpensesRaw.receipt2==newReceipt2,
                     ExpensesRaw.receipt3==newReceipt3,
                    ).all()
    for result in qry:
        result.is_duplicated = True
#         print(result)
    try:
        session.commit()
    except:
        print('Could not update')
    print('Count', ctr)
    ctr = ctr + 1    
    

Position :  Executive Director
Description :  ETS
Count 1
Position :  Executive Director Arts Branch
Description :  Travel in Edmonton to departmental meeting
Count 2
Position :  Executive Director Arts Branch
Description :  Travel in Edmonton to departmental meeting
Count 3
Position :  Exec Dir Comp, Job Eval & Ben
Description :  WestJet Seat Selection
Count 4
Position :  Exec Dir Comp, Job Eval & Ben
Description :  WestJet Seat Selection
Count 5
Position :  Mbr, AISH & IESA Appeal Panels
Description :  Expenses to Attend Appeal - Mileage - 200 Kms
Count 6
Position :  Mbr, AISH & IES Appeal Panels
Description :  Expenses to attend appeals - Breakfast
Count 7
Position :  Ex Dir-OHS Program Delivery
Description :  "
Count 8
Position :  Ex Dir-OHS Program Delivery
Description :  "
Count 9
Position :  Branch Head
Description :  Sharla Rauschning airfare Edm - Clgy - RET
Count 10
Position :  Branch Head
Description :  Sharla Rauschning airfare Edm - Clgy - RET
Count 11
Position :  Board Me

Count 88
Position :  Vice-chair
Description :  Red Deer Hearings - Mileage - 100KM
Count 89
Position :  Assist Chief Medical Examiner
Description :  "
Count 90
Position :  Assist Chief Medical Examiner
Description :  "
Count 91
Position :  ED, Prem Southern AB Office
Description :  Working at Legislature
Count 92
Position :  ED, Prem Southern AB Office
Description :  Working at Legislature
Count 93
Position :  ED, Prem Southern AB Office
Description :  Working at Legislature
Count 94
Position :  Public Member
Description :  CLPNA - Hearing - Aloft
Count 95
Position :  Public Member
Description :  CLPNA - Hearing - Aloft
Count 96
Position :  Special Advisor
Description :  Edmonton - RDAM/Budget - 
Count 97
Position :  Special Advisor
Description :  Edmonton - RDAM/Budget - 
Count 98
Position :  Chief of Staff to Assoc Min
Description :  Meeting with Dept. Officials
Count 99
Position :  Chief of Staff to Assoc Min
Description :  Meeting with Dept. Officials
Count 100
Position :  Chief of

Count 178
Position :  ADM, Delivery Services
Description :  Twin Atria to Legislature
Count 179
Position :  ADM, Delivery Services
Description :  Twin Atria to Legislature
Count 180
Position :  Member
Description :  Calgary Hearings - Parking
Count 181
Position :  Member
Description :  Calgary Hearings - Parking
Count 182
Position :  Regional Dir, Upper/Low Peace
Description :  Interviews - Mileage
Count 183
Position :  Regional Dir, Upper/Low Peace
Description :  Interviews - Mileage
Count 184
Position :  Assistant Deputy Minister
Description :  Meetings in Edmonton - Meetings in Edmonton
Count 185
Position :  Regional Dir, Upper/Low Peace
Description :  Interviews - Mileage
Count 186
Position :  Assistant Deputy Minister
Description :  Meetings in Edmonton - Meetings in Edmonton
Count 187
Position :  Regional Dir, Upper/Low Peace
Description :  Interviews - Mileage
Count 188
Position :  Chair
Description :  MHREVP - Parking
Count 189
Position :  Chair
Description :  MHREVP - Parking


Count 265
Position :  Member, Premier's Council
Description :  Appeals in Barrhead - Mileage
Count 266
Position :  Mbr, AISH & IESA, CYFE,CCL,PDD
Description :  Expenses to attend Appeals - Per Diem
Count 267
Position :  Acting Exec Director for FNR
Description :  Trish Merrithew-Mercredi
Count 268
Position :  Acting Exec Director for FNR
Description :  Trish Merrithew-Mercredi
Count 269
Position :  ED/Commissioner Cul Industries
Description :  Mileage - to/from Govt' offices
Count 270
Position :  Senior Nursing Advisor
Description :  NP Forum - RAH
Count 271
Position :  Regional Ex Dir-South
Description :  Permanency Planning Meeting - Permanency Planning Meeting - Edmonton
Count 272
Position :  Member
Description :  Calgary Hearings - Parking
Count 273
Position :  Member
Description :  Calgary Hearings - Parking
Count 274
Position :  Exec Dir, Cultural Industries
Description :  Travel to and from office to DT meeting
Count 275
Position :  ADM, Shared Services
Description :  Contact C

Count 361
Position :  Assistant Deputy Minister
Description :  Federal-Provincial-Territorial Deputy Ministers Meeting - Breakfast
Count 362
Position :  Assistant Deputy Minister
Description :  Federal-Provincial-Territorial Deputy Ministers Meeting - Breakfast
Count 363
Position :  Exec Dir Metis Relations
Description :  MSGC
Count 364
Position :  Board Member
Description :  Board Hearings - Lethbridge - Mileage within Lethbridge 4 km
Count 365
Position :  Exec Dir-Org Renewal
Description :  ROFP Session - DM and ET - Parking Shaw Conference Centre
Count 366
Position :  Exec Dir-Org Renewal
Description :  ROFP Session - DM and ET - Parking Shaw Conference Centre
Count 367
Position :  Executive Director HR Services
Description :  Alberta Livestock and Meat Agency Executive Meeting
Count 368
Position :  Honoraria
Description :  UCA Advisory Board Meeting - Per Diem - Dinner
Count 369
Position :  Mbr, AISH & IESA Appeal Panels
Description :  Edmonton CARB - ECARB - 102 kms
Count 370
Posi

Count 446
Position :  Mbr, AISH & IESA, CYFE,CCL,PDD
Description :  Expenses to Attend Appeals - Mileage
Count 447
Position :  Mbr, AISH & IESA, CYFE,CCL,PDD
Description :  Expenses to Attend Appeals - Mileage
Count 448
Position :  Exec Dir Metis Relations
Description :  Office to Legislature and return
Count 449
Position :  ED/Commissioner Cul Industries
Description :  Parking - Department meeting
Count 450
Position :  Chief Information Officer
Description :  Trans>Infra
Count 451
Position :  Senior Operating Officer
Description :  Housing Update - taxi
Count 452
Position :  Member
Description :  Appeals in High Prairie - Mileage
Count 453
Position :  Member
Description :  Appeals in High Prairie - Mileage
Count 454
Position :  Member
Description :  Appeals in High Prairie - Mileage
Count 455
Position :  Mbr, AISH & IESA Appeal Panels
Description :  Expenses to Attend Appeals - Taxi
Count 456
Position :  Regional Ex Dir-South
Description :  HS Executive Directors Meeting - HS Executiv

Count 531
Position :  Member, Medicine Hat CAP
Description :  Expenses to attend appeals - Dinner
Count 532
Position :  CEO-PDD NE
Description :  nan
Description is None
Count 533
Position :  CEO-PDD NE
Description :  nan
Description is None
Count 534
Position :  Chief Executive Officer
Description :  Meeting with KTC for partnership development
Count 535
Position :  Chief Executive Officer
Description :  Meeting with KTC for partnership development
Count 536
Position :  Member, Medicine Hat CAP
Description :  Expenses to attend appeals - Lunch
Count 537
Position :  Member, Medicine Hat CAP
Description :  Expenses to attend appeals - Dinner
Count 538
Position :  Member, Medicine Hat CAP
Description :  Expenses to attend appeals - Per Diem
Count 539
Position :  Ex Dir, Human Resources
Description :  Trans EC- Twin
Count 540
Position :  Ex Dir, Human Resources
Description :  Trans EC- Twin
Count 541
Position :  Executive Manager 2
Description :  "
Count 542
Position :  Executive Manager 

Count 621
Position :  Mbr, AISH & IESA Appeal Panels
Description :  Expenses to Attend Appeals - Milage
Count 622
Position :  Member
Description :  Hearings in Edmonton - Parking
Count 623
Position :  Member
Description :  Hearings in Edmonton - Parking
Count 624
Position :  ADM, Creative & Community Dev
Description :  Meetings in Calgary - LRT - meeting in Calgary
Count 625
Position :  Managing Dir AB Mexico Office
Description :  PECOM - Hotel Quinta Real - PECOM
Count 626
Position :  Managing Dir AB Mexico Office
Description :  PECOM - Hotel Quinta Real - PECOM
Count 627
Position :  Board Member
Description :  Trip 1 - Meeting with Minister - CIRB-Edm-Mtg-Taxi
Count 628
Position :  Regional Dir-Calgary
Description :  "
Count 629
Position :  Regional Dir-Calgary
Description :  "
Count 630
Position :  Regional Dir-Calgary
Description :  "
Count 631
Position :  Regional Dir-Calgary
Description :  "
Count 632
Position :  Co-Chair, CFSA North Central
Description :  Board Evaluations/Board

Count 712
Position :  Member, Premier's Council
Description :  Appeals in Lloydminster - Mileage
Count 713
Position :  Mbr, AISH & IESA Appeal Panels
Description :  Appeals in High Prairie - Mileage
Count 714
Position :  General Manager
Description :  National Supervisory Agency Mt - Mileage to/from Airport
Count 715
Position :  Strategic Executive Advisor
Description :  City of Vancouver - Translink Vancouver
Count 716
Position :  Member, Premier's Council
Description :  Appeals in Lloyminister - Mileage
Count 717
Position :  Member, Premier's Council
Description :  Appeals in Lloyminister - Mileage
Count 718
Position :  Member, SCFRC
Description :  Internet
Count 719
Position :  Member, CQA
Description :  Council Meeting - 
Count 720
Position :  Exec. Director, Regional Ops
Description :  All Managers Meeting - On hotel receipt
Count 721
Position :  Exec Dir, Cultural Industries
Description :  Travel to and from office to DT meeting
Count 722
Position :  Board Member
Description :  A

Count 806
Position :  Minister
Description :  Luncheon with Manitoba Minister (RS, TF, RL)
Count 807
Position :  CEO-CFSA Region 1
Description :  DFNA/Directors Meeting - DFNA/Directors Meeting - Edmonton
Count 808
Position :  Mbr, AISH & IESA Appeal Panels
Description :  Appeals in Peace River - Mileage
Count 809
Position :  Member, Labour Relations Board
Description :  Edmonton Hearing GE-07019
Count 810
Position :  Member, Labour Relations Board
Description :  Edmonton Hearing GE-07019
Count 811
Position :  Member, Labour Relations Board
Description :  Edmonton Hearing GE-07019
Count 812
Position :  Member, Labour Relations Board
Description :  Edmonton Hearing GE-07019
Count 813
Position :  Member, Labour Relations Board
Description :  Edmonton Hearing GE-07019
Count 814
Position :  Member, Labour Relations Board
Description :  Edmonton Hearing GE-07019
Count 815
Position :  Member, Labour Relations Board
Description :  Edmonton Hearing GE-07019
Count 816
Position :  Member, Labour

In [None]:
print(df.head())