In [46]:
import os
import sys
import pandas as pd
import pandasql as pdsql


from bkcharts import Donut
from bokeh.io import output_notebook, show

# Do this trick to add the new Beamly functionality
pythonModsRoot = '/Users/carlos.aguilar/Google Drive/PythonDev/Coding/BeamlyPython'
if pythonModsRoot not in sys.path:
    sys.path.append(pythonModsRoot)

import redshiftSqlAlchemy as rsa
import carlosUtils        as cu
import bokehUtils         as bk

from IPython.display import Markdown, display, HTML
def printmd(string):
    display(Markdown(string))


In [47]:
numCampaigns = 20;
cutOffPerc   = 96.0;


In [48]:
# set the table and schema
schemaName = 'adform'
tableName  = 'impressionsExtended'

# set the name of the variables to break-down
breakDownKeyWords = ['campaign_id', 'yyyy_mm_dd', \
'publisher_domain', 'bannerid', 'bannerType', \
'client_id', 'device_type_id', 'placement_id_activity_id', 'tag_id', \
'city_id' ];

# flags to set the outputs
saveFigure    = True;
saveDFToExcel = True;

In [49]:
# connect to the Redshift box
user     = 'carlos_aguilar'
password = 'MdogDI64j6vH90g973'
dbname   = 'adform'
host     = 'adform-ops.c7dxcjhlundm.eu-central-1.redshift.amazonaws.com'

rs = rsa.RedshiftAlchemy(user=user, password=password, 
    database=dbname, host=host)

In [50]:
sqlQuery = '''SELECT count(*) as numRecords 
    from {}.{}'''.format(schemaName, tableName)
df = rs.query2DF(sqlQuery);
printmd('Analysing **' + schemaName + '.' + tableName + '** (' +  str(df.numrecords[0]) + ' records)' )

Analysing **adform.impressionsExtended** (195387010 records)

In [51]:
# This snippet breaks down the variables and writes the counts to a file
printmd('_Break down variables_ from ' +  schemaName +'.' + tableName)
for currentKey in breakDownKeyWords:
    sqlQuery = '''SELECT count(distinct({})) as numRecords 
        from {}.{}'''.format(currentKey, schemaName, tableName)
    df = rs.query2DF(sqlQuery);
    currentText = '**' + currentKey + '**' + ' has got **' + str(df.numrecords[0]) + '** unique values...'
    printmd(currentText)

_Break down variables_ from adform.impressionsExtended

**campaign_id** has got **247** unique values...

**yyyy_mm_dd** has got **66** unique values...

**publisher_domain** has got **276148** unique values...

**bannerid** has got **1454** unique values...

**bannerType** has got **4** unique values...

**client_id** has got **37** unique values...

**device_type_id** has got **5** unique values...

**placement_id_activity_id** has got **1636** unique values...

**tag_id** has got **3091** unique values...

**city_id** has got **220894** unique values...

In [52]:
# This bit takes the 'numCampaigns' most clicked campaigns and writes the results

sqlQuery = '''SELECT
A.campaign_id,
A.clientsname,
A.countryname,
A.devicename,
A.startdate,
A.enddate,
sum(A.numRecords) as totalImpressions,
B.numTotals
from adform.impressionsExtended as A
inner join (
  SELECT  campaign_id,
  sum(numRecords) AS numTotals,
  rank() over (order by numTotals desc) as rnk
  from adform.impressionsExtended
  group by 1
) as B
on A.campaign_id = B.campaign_id
where B.rnk <= {}
group by 1,2,3,4,5,6,B.numTotals
order by numTotals DESC'''.format(numCampaigns)


df        = rs.query2DF(sqlQuery)
groupedDF = df.groupby(df['campaign_id'])
idx       = 0;


# Get a list of the most clicked campaigns
vars2show = ['campaign_id', 'clientsname', 'numtotals', 'startdate', 'enddate']
df2 = df[vars2show].copy()
df2.drop_duplicates(inplace=True)
display(df2)

Unnamed: 0,campaign_id,clientsname,numtotals,startdate,enddate
0,897248,CoverGirl,104971036,2017-05-15 00:00:00.0,2017-06-30 23:59:00.0
358,886811,Sally Hansen,48491511,2017-04-25 00:00:00.0,2017-06-30 23:59:00.0
746,929928,Clairol Consumer,41799226,2017-06-15 00:00:00.0,2017-06-30 23:59:00.0
1480,886100,Sally Hansen,41189281,2017-04-17 00:00:00.0,2017-06-30 23:59:00.0
2177,886095,Sally Hansen,32643950,2017-05-01 00:00:00.0,2017-06-30 23:59:00.0
2856,888329,Sally Hansen,32176714,2017-05-08 00:00:00.0,2017-06-30 23:59:00.0
3269,894699,Rimmel,30504815,2017-05-10 00:00:00.0,2017-06-06 23:59:00.0
3833,912694,OPI Professional,28922186,2017-06-01 00:00:00.0,2017-06-30 23:59:00.0
4545,886846,Sally Hansen,20148263,2017-05-08 00:00:00.0,2017-06-30 23:59:00.0
5266,903883,Rimmel,19277041,2017-05-18 00:00:00.0,2017-06-10 23:59:00.0


In [53]:
for name, group in groupedDF:
    idx += 1;
    groupClientName = group['clientsname'].iloc[0]
    cu.printf('{} - Processing {} id {}...'.format(idx, groupClientName, name))
    print('Current campaign {} id {} got {} impressions'.format(groupClientName, \
        name, group['totalimpressions'].sum()))
    
    # Get the percentage for the break down
    totalCampaignClicks = group['totalimpressions'].sum();
    group['perct'] = 100.0*group['totalimpressions']/totalCampaignClicks;
    group.sort_values(['perct'], ascending=[0], inplace = True)
    
    # cutoff at 96%
    idx95p = group['perct'].cumsum() < cutOffPerc
    df95p  = group.loc[idx95p, :]

    fileName  = groupClientName + ' (id ' + str(name) + ').xlsx'
    

    display(df95p)

    # bokeh
    title  = groupClientName + \
        ' (id ' + str(name) + ') clicks: ' + str(totalCampaignClicks)
    labels = ['devicename','countryname']
    values = 'totalimpressions'
    hoverText ='totalimpressions'
    textFontSize = '10pt'

    donutChart = Donut(df95p, label=labels, values=values,
    text_font_size=textFontSize, hover_text=hoverText,
    title=title)
    output_notebook()
    show(donutChart)

Current campaign Philosophy id 875443 got 14802201 impressions


1 - Processing Philosophy id 875443...
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  if __name__ == '__main__':
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  # Remove the CWD from sys.path while we load stuff.


Unnamed: 0,campaign_id,clientsname,countryname,devicename,startdate,enddate,totalimpressions,numtotals,perct
7859,875443,Philosophy,United States,Mobile,2017-04-06 00:00:00.0,2017-06-30 23:59:00.0,7137526,14802201,48.219356
7968,875443,Philosophy,United States,Desktop and Laptop,2017-04-06 00:00:00.0,2017-06-30 23:59:00.0,4150401,14802201,28.03908


  chart = create_and_build(DonutBuilder, data, **kw)


Current campaign Sally Hansen id 886095 got 32643950 impressions


2 - Processing Sally Hansen id 886095...


Unnamed: 0,campaign_id,clientsname,countryname,devicename,startdate,enddate,totalimpressions,numtotals,perct
2399,886095,Sally Hansen,United States,Desktop and Laptop,2017-05-01 00:00:00.0,2017-06-30 23:59:00.0,15548496,32643950,47.630559


Current campaign Sally Hansen id 886100 got 41189281 impressions


3 - Processing Sally Hansen id 886100...


Unnamed: 0,campaign_id,clientsname,countryname,devicename,startdate,enddate,totalimpressions,numtotals,perct
1480,886100,Sally Hansen,United States,Mobile,2017-04-17 00:00:00.0,2017-06-30 23:59:00.0,21400058,41189281,51.955406


Current campaign Sally Hansen id 886811 got 48491511 impressions


4 - Processing Sally Hansen id 886811...


Unnamed: 0,campaign_id,clientsname,countryname,devicename,startdate,enddate,totalimpressions,numtotals,perct
485,886811,Sally Hansen,United States,Mobile,2017-04-25 00:00:00.0,2017-06-30 23:59:00.0,35971633,48491511,74.181299


Current campaign Sally Hansen id 886846 got 20148263 impressions


5 - Processing Sally Hansen id 886846...


Unnamed: 0,campaign_id,clientsname,countryname,devicename,startdate,enddate,totalimpressions,numtotals,perct
4546,886846,Sally Hansen,United States,Desktop and Laptop,2017-05-08 00:00:00.0,2017-06-30 23:59:00.0,7828394,20148263,38.85394
5034,886846,Sally Hansen,United States,Mobile,2017-05-08 00:00:00.0,2017-06-30 23:59:00.0,4461811,20148263,22.144892
4548,886846,Sally Hansen,United States,Tablet,2017-05-08 00:00:00.0,2017-06-30 23:59:00.0,4121387,20148263,20.455297


Current campaign Sally Hansen id 888329 got 32176714 impressions


6 - Processing Sally Hansen id 888329...


Unnamed: 0,campaign_id,clientsname,countryname,devicename,startdate,enddate,totalimpressions,numtotals,perct
2999,888329,Sally Hansen,United States,Desktop and Laptop,2017-05-08 00:00:00.0,2017-06-30 23:59:00.0,11152329,32176714,34.659627
2856,888329,Sally Hansen,United States,Mobile,2017-05-08 00:00:00.0,2017-06-30 23:59:00.0,8609426,32176714,26.756697
3001,888329,Sally Hansen,United States,Tablet,2017-05-08 00:00:00.0,2017-06-30 23:59:00.0,6913464,32176714,21.485923


Current campaign Rimmel id 894699 got 30504815 impressions


7 - Processing Rimmel id 894699...


Unnamed: 0,campaign_id,clientsname,countryname,devicename,startdate,enddate,totalimpressions,numtotals,perct
3447,894699,Rimmel,United Kingdom,Mobile,2017-05-10 00:00:00.0,2017-06-06 23:59:00.0,20773429,30504815,68.098853
3272,894699,Rimmel,United Kingdom,Tablet,2017-05-10 00:00:00.0,2017-06-06 23:59:00.0,5282055,30504815,17.31548


Current campaign CoverGirl id 897248 got 104971036 impressions


8 - Processing CoverGirl id 897248...


Unnamed: 0,campaign_id,clientsname,countryname,devicename,startdate,enddate,totalimpressions,numtotals,perct
1,897248,CoverGirl,United States,Mobile,2017-05-15 00:00:00.0,2017-06-30 23:59:00.0,87454163,104971036,83.312661


Current campaign Rimmel id 903883 got 19277041 impressions


9 - Processing Rimmel id 903883...


Unnamed: 0,campaign_id,clientsname,countryname,devicename,startdate,enddate,totalimpressions,numtotals,perct
5267,903883,Rimmel,Italy,Mobile,2017-05-18 00:00:00.0,2017-06-10 23:59:00.0,12563261,19277041,65.172144
5670,903883,Rimmel,Italy,Unknown,2017-05-18 00:00:00.0,2017-06-10 23:59:00.0,3145335,19277041,16.316482


Current campaign Clairol Consumer id 904136 got 15178309 impressions


10 - Processing Clairol Consumer id 904136...


Unnamed: 0,campaign_id,clientsname,countryname,devicename,startdate,enddate,totalimpressions,numtotals,perct
7727,904136,Clairol Consumer,United States,Mobile,2017-05-22 00:00:00.0,2017-06-30 23:59:00.0,8380826,15178309,55.215808


Current campaign Calvin Klein id 907580 got 17698973 impressions


11 - Processing Calvin Klein id 907580...


Unnamed: 0,campaign_id,clientsname,countryname,devicename,startdate,enddate,totalimpressions,numtotals,perct
6653,907580,Calvin Klein,Mexico,Mobile,2017-05-19 00:00:00.0,2017-06-25 23:59:00.0,8649610,17698973,48.870689
6807,907580,Calvin Klein,Mexico,Tablet,2017-05-19 00:00:00.0,2017-06-25 23:59:00.0,3768278,17698973,21.290942
6505,907580,Calvin Klein,Mexico,Unknown,2017-05-19 00:00:00.0,2017-06-25 23:59:00.0,3477340,17698973,19.647129


Current campaign OPI Professional id 912694 got 28922186 impressions


12 - Processing OPI Professional id 912694...


Unnamed: 0,campaign_id,clientsname,countryname,devicename,startdate,enddate,totalimpressions,numtotals,perct
4060,912694,OPI Professional,United States,Mobile,2017-06-01 00:00:00.0,2017-06-30 23:59:00.0,23532571,28922186,81.365119
4081,912694,OPI Professional,United States,Tablet,2017-06-01 00:00:00.0,2017-06-30 23:59:00.0,1772973,28922186,6.130149
4302,912694,OPI Professional,United States,Desktop and Laptop,2017-06-01 00:00:00.0,2017-06-30 23:59:00.0,1318340,28922186,4.558231


Current campaign Calvin Klein id 916730 got 12781501 impressions


13 - Processing Calvin Klein id 916730...


Unnamed: 0,campaign_id,clientsname,countryname,devicename,startdate,enddate,totalimpressions,numtotals,perct
9818,916730,Calvin Klein,,Mobile,2017-05-30 00:00:00.0,2017-06-25 23:59:00.0,4732809,12781501,37.028585
9781,916730,Calvin Klein,Argentina,Mobile,2017-05-30 00:00:00.0,2017-06-25 23:59:00.0,2920889,12781501,22.852472
9783,916730,Calvin Klein,,Tablet,2017-05-30 00:00:00.0,2017-06-25 23:59:00.0,2122718,12781501,16.607736
9500,916730,Calvin Klein,Argentina,Tablet,2017-05-30 00:00:00.0,2017-06-25 23:59:00.0,1302196,12781501,10.18813
9508,916730,Calvin Klein,,Unknown,2017-05-30 00:00:00.0,2017-06-25 23:59:00.0,657917,12781501,5.147416


Current campaign Rimmel id 923338 got 17903269 impressions


14 - Processing Rimmel id 923338...


Unnamed: 0,campaign_id,clientsname,countryname,devicename,startdate,enddate,totalimpressions,numtotals,perct
6066,923338,Rimmel,United Kingdom,Mobile,2017-06-07 00:00:00.0,2017-06-30 23:59:00.0,14370824,17903269,80.269274
6269,923338,Rimmel,United Kingdom,Tablet,2017-06-07 00:00:00.0,2017-06-30 23:59:00.0,1828701,17903269,10.214341


Current campaign CoverGirl id 923742 got 13220398 impressions


15 - Processing CoverGirl id 923742...


Unnamed: 0,campaign_id,clientsname,countryname,devicename,startdate,enddate,totalimpressions,numtotals,perct
8574,923742,CoverGirl,Mexico,Mobile,2017-06-01 00:00:00.0,2017-06-25 23:59:00.0,8093553,13220398,61.220192
8661,923742,CoverGirl,Mexico,Unknown,2017-06-01 00:00:00.0,2017-06-25 23:59:00.0,4057108,13220398,30.688244


Current campaign Clairol Consumer id 929928 got 41799226 impressions


16 - Processing Clairol Consumer id 929928...


Unnamed: 0,campaign_id,clientsname,countryname,devicename,startdate,enddate,totalimpressions,numtotals,perct
1000,929928,Clairol Consumer,United States,Mobile,2017-06-15 00:00:00.0,2017-06-30 23:59:00.0,35994793,41799226,86.11354
1256,929928,Clairol Consumer,United States,Tablet,2017-06-15 00:00:00.0,2017-06-30 23:59:00.0,2424911,41799226,5.80133


Current campaign Adidas id 930412 got 13745391 impressions


17 - Processing Adidas id 930412...


Unnamed: 0,campaign_id,clientsname,countryname,devicename,startdate,enddate,totalimpressions,numtotals,perct
8518,930412,Adidas,Germany,Desktop and Laptop,2017-06-08 00:00:00.0,2017-06-30 23:59:00.0,12230729,13745391,88.980583


Current campaign Marc Jacobs id 930443 got 13943022 impressions


18 - Processing Marc Jacobs id 930443...


Unnamed: 0,campaign_id,clientsname,countryname,devicename,startdate,enddate,totalimpressions,numtotals,perct
8337,930443,Marc Jacobs,,Mobile,2017-06-14 00:00:00.0,2017-06-28 23:59:00.0,5512172,13943022,39.533553
8205,930443,Marc Jacobs,Argentina,Mobile,2017-06-14 00:00:00.0,2017-06-28 23:59:00.0,3336414,13943022,23.928916
8209,930443,Marc Jacobs,,Tablet,2017-06-14 00:00:00.0,2017-06-28 23:59:00.0,2064278,13943022,14.805097
8070,930443,Marc Jacobs,Argentina,Tablet,2017-06-14 00:00:00.0,2017-06-28 23:59:00.0,1367302,13943022,9.806353


Current campaign OPI Professional id 930891 got 15421693 impressions


19 - Processing OPI Professional id 930891...


Unnamed: 0,campaign_id,clientsname,countryname,devicename,startdate,enddate,totalimpressions,numtotals,perct
7160,930891,OPI Professional,United Kingdom,Mobile,2017-06-16 00:00:00.0,2017-06-30 23:59:00.0,12702134,15421693,82.365367
7170,930891,OPI Professional,United Kingdom,Tablet,2017-06-16 00:00:00.0,2017-06-30 23:59:00.0,1206343,15421693,7.822377


Current campaign Sally Hansen id 941709 got 13191594 impressions


20 - Processing Sally Hansen id 941709...


Unnamed: 0,campaign_id,clientsname,countryname,devicename,startdate,enddate,totalimpressions,numtotals,perct
9296,941709,Sally Hansen,United States,Mobile,2017-07-05 00:00:00.0,2017-07-31 23:59:00.0,7022252,13191594,53.232778
9273,941709,Sally Hansen,United States,Desktop and Laptop,2017-07-05 00:00:00.0,2017-07-31 23:59:00.0,4616102,13191594,34.992754
9038,941709,Sally Hansen,United States,Tablet,2017-07-05 00:00:00.0,2017-07-31 23:59:00.0,587140,13191594,4.450865


In [54]:


rs.close()