In [1]:
%run __init__.ipynb

Successfully connected to MongoDB


In [2]:
# Import modules
from googleapiclient.discovery import build
from oauth2client.service_account import ServiceAccountCredentials
import pandas as pd
from datetime import datetime, timedelta, date
from dateutil.rrule import rrule, DAILY
from time import sleep
from googleapiclient.discovery import build
from oauth2client.service_account import ServiceAccountCredentials
import time
import json

# Set variables
# OAuth scope requested for the service account: read-only access to Analytics data.
SCOPES = ['https://www.googleapis.com/auth/analytics.readonly']
# Service-account key: read as a JSON string from the naas secret store and parsed into a dict.
# NOTE(review): `naas` is not imported in this cell — presumably provided by `%run __init__.ipynb`; confirm.
KEY_FILE_GOOGLE = json.loads(naas.secret.get('KEY_FILE_GOOGLE'))
# Google Analytics view id passed as 'viewId' in every report request.
VIEW_ID = '214312751'
# Metric requested from the Reporting API.
METRICS = [{'expression': 'ga:users'}]
# Alternative metric, kept for reference:
# metrics = [{'expression': 'ga:sessions'}]
# Dimension the metric is broken down by. The name is misspelled ("DIMENTIONS") but is
# referenced under this exact spelling further down, so it must not be renamed here alone.
DIMENTIONS = [{'name': 'ga:country'}]
# Number of days before "now" at which the reporting window starts.
FIRSTDATA = 30
# Today's date as YYYY-MM-DD, evaluated once when this cell runs; end of the reporting window.
TODAY = datetime.now().strftime("%Y-%m-%d")


In [3]:
def initialize_analyticsreporting():
    """
    Create an authorized Analytics Reporting API V4 client.

    Credentials are built from the module-level KEY_FILE_GOOGLE dict for the
    scopes listed in SCOPES.

    Returns:
    The service object produced by build('analyticsreporting', 'v4', ...).
    """
    service_credentials = ServiceAccountCredentials.from_json_keyfile_dict(KEY_FILE_GOOGLE, SCOPES)
    return build('analyticsreporting', 'v4', credentials=service_credentials)

# Module-level API client; return_ga_data() reads this global when it calls get_report().
analytics = initialize_analyticsreporting()

In [4]:
%%time

def convert_reponse_to_df(response, date=None):
    """Flatten an Analytics Reporting API v4 batchGet response into a DataFrame.

    One output row is produced per data row of every report in the response.
    Dimension values are stored under their dimension header, metric values
    under the metric's 'name'.

    Args:
        response: dict as returned by reports().batchGet(...).execute().
        date: optional value stored in a 'date' column on every row.

    Returns:
        A pandas DataFrame with the rows of all reports. An empty DataFrame is
        returned when the response carries no reports or no rows.

    Raises:
        ValueError: if a metric value is neither an integer nor a float literal.
    """
    records = []
    for report in response.get('reports', []):
        column_header = report.get('columnHeader', {})
        dimension_headers = column_header.get('dimensions', [])
        metric_headers = column_header.get('metricHeader', {}).get('metricHeaderEntries', [])
        rows = report.get('data', {}).get('rows', [])

        for row in rows:
            record = dict(zip(dimension_headers, row.get('dimensions', [])))

            # 'metrics' holds one entry per requested date range.
            for range_values in row.get('metrics', []):
                for metric, value in zip(metric_headers, range_values.get('values', [])):
                    # The API delivers every metric as a string; keep whole
                    # numbers as int and fall back to float for decimals
                    # (e.g. rates such as "12.5").
                    try:
                        record[metric.get('name')] = int(value)
                    except ValueError:
                        record[metric.get('name')] = float(value)

            if date is not None:
                record['date'] = date
            records.append(record)

    # Build the frame once, after ALL reports have been parsed.
    return pd.DataFrame(records)


def get_report(analytics, start_date, end_date, view_id, metrics, dimensions, dimensionFilterClauses=[], segments=[]):
    """Run one batchGet report request and return the executed response.

    Args:
        analytics: service object exposing reports().batchGet(body=...).execute().
        start_date: value sent as 'startDate' of the single date range.
        end_date: value sent as 'endDate' of the single date range.
        view_id: value sent as 'viewId'.
        metrics: value sent as 'metrics'.
        dimensions: value sent as 'dimensions'.
        dimensionFilterClauses: value sent as 'dimensionFilterClauses'.
        segments: value sent as 'segments'.

    Returns:
        Whatever execute() returns for the request.
    """
    report_request = {
        'viewId': view_id,
        'dateRanges': [{'startDate': start_date, 'endDate': end_date}],
        'metrics': metrics,
        'dimensions': dimensions,
        'pageSize': 10000,
        'dimensionFilterClauses': dimensionFilterClauses,
        'segments': segments,
    }
    request = analytics.reports().batchGet(body={'reportRequests': [report_request]})
    return request.execute()


def return_ga_data(view_id, metrics, dimensions, split_dates, group_by=[], dimensionFilterClauses=[], segments=[]):
    """Fetch Google Analytics data for the window [now - FIRSTDATA days, TODAY].

    Uses the module-level `analytics` client together with get_report() and
    convert_reponse_to_df().

    Args:
        view_id: Analytics view id forwarded to get_report().
        metrics: metrics forwarded to get_report().
        dimensions: dimensions forwarded to get_report().
        split_dates: when equal to False, a single request covers the whole
            window; otherwise one request is issued per day and each day's
            rows carry that day in a 'date' column.
        group_by: optional column names; when non-empty (split mode only) the
            combined frame is grouped by them and summed.
        dimensionFilterClauses: forwarded to get_report().
        segments: forwarded to get_report().

    Returns:
        A pandas DataFrame with the requested data.
    """
    start_date = (datetime.now() - timedelta(FIRSTDATA)).strftime("%Y-%m-%d")
    end_date = TODAY

    # Kept as an equality test so callers passing e.g. None keep their current behaviour.
    if split_dates == False:
        return convert_reponse_to_df(get_report(analytics, start_date, end_date, view_id, metrics, dimensions, dimensionFilterClauses, segments))

    first_day = datetime.strptime(start_date, '%Y-%m-%d').date()
    last_day = datetime.strptime(end_date, '%Y-%m-%d').date()

    # Collect the per-day frames and concatenate once: DataFrame.append in a
    # loop copies the whole frame on every iteration and no longer exists in
    # pandas >= 2.0.
    daily_frames = []
    for day in rrule(freq=DAILY, dtstart=first_day, until=last_day):
        day_str = str(day.date())
        daily = convert_reponse_to_df(get_report(analytics, day_str, day_str, view_id, metrics, dimensions, dimensionFilterClauses, segments), day_str)
        if daily is not None:
            daily_frames.append(daily)

    df_total = pd.concat(daily_frames) if daily_frames else pd.DataFrame()

    if len(group_by) != 0:
        df_total = df_total.groupby(group_by).sum()

    return df_total

# Fetch the configured metric per dimension, one API request per day (split_dates=True)
# so that every row carries its 'date'.
df_ga = return_ga_data(VIEW_ID, METRICS, DIMENTIONS, True)

CPU times: user 123 ms, sys: 2.41 ms, total: 125 ms
Wall time: 9.65 s


In [5]:
%%time

def domain_301(df):
    """Build the "301" dataset: daily and cumulative values per entity.

    Args:
        df: DataFrame with the columns 'ga:country', 'date' (YYYY-MM-DD) and
            'ga:users'.

    Returns:
        A DataFrame ordered chronologically with the columns
        ENTITY, DATE (DD/MM/YYYY), VALUE_D (value for the day), VALUE (running
        total per ENTITY), DATE_ORDER (YYYYMMDD) and LAST_UPDATE. In addition
        to the input entities, a 'WORLDWIDE' entity aggregates all of them.
    """
    cols_to_rename = {"ga:country": "ENTITY", "date": "DATE", "ga:users": "VALUE_D"}
    domain = df.copy().rename(index=str, columns=cols_to_rename).fillna("Not defined")
    domain['DATE'] = pd.to_datetime(domain['DATE'], format='%Y-%m-%d').dt.strftime('%d/%m/%Y')

    # Duplicate every row under a synthetic entity that totals all others.
    domain_ww = domain.copy()
    domain_ww['ENTITY'] = 'WORLDWIDE'

    # Concat & groupby
    domain_final = pd.concat([domain, domain_ww], axis=0)
    domain_final = domain_final.groupby(['ENTITY', 'DATE'], as_index=False).agg({'VALUE_D': 'sum'})

    # Sort on the YYYYMMDD key, not on the DD/MM/YYYY display string: the
    # display string sorts by day-of-month first, which scrambles the order
    # (and therefore the running total) as soon as the window spans two months.
    domain_final['DATE_ORDER'] = pd.to_datetime(domain_final['DATE'], format='%d/%m/%Y').dt.strftime('%Y%m%d')
    domain_final = domain_final.sort_values(['DATE_ORDER'], ascending=True, kind='stable').reset_index(drop=True)
    domain_final['VALUE'] = domain_final.groupby('ENTITY')['VALUE_D'].cumsum()

    domain_final['LAST_UPDATE'] = datetime.now().strftime('%d/%m/%Y %H:%M:%S')
    # Keep the column layout consumers already receive.
    return domain_final[['ENTITY', 'DATE', 'VALUE_D', 'VALUE', 'DATE_ORDER', 'LAST_UPDATE']]

# Build the "301" dataset from the GA frame and, when Mongo publishing is enabled, send it.
# NOTE(review): USE_MONGO, DB_APP and naas_drivers are not defined in the cells shown here —
# presumably provided by `%run __init__.ipynb`; confirm.
domain301 = domain_301(df_ga)
if USE_MONGO:
    # NOTE(review): the meaning of the final positional argument (True) is not visible
    # here — confirm against naas_drivers.mongo.send.
    naas_drivers.mongo.send(domain301,'301',DB_APP,True)

Dataframe 301 successfully save in database app-wsr in MongoDB. Time: --- 0.041902780532836914 secnds ---
CPU times: user 16.3 ms, sys: 8.09 ms, total: 24.4 ms
Wall time: 59.4 ms
