In [1]:
import json
import os
import shutil
from google.cloud import bigquery
from google.cloud import storage
from jinja2 import Environment, FileSystemLoader, select_autoescape

from datetime import datetime, timedelta
from pytz import timezone
from tzlocal import get_localzone

In [2]:
# Notebook configuration: fill these in before running any cell below.
# project_id, dataset_id and a table name are joined into
# `project.dataset.table` identifiers for the BigQuery queries.
project_id = ""
dataset_id = ""
# Table whose max(date) is read and to which new history rows are appended.
basic_history_table = ""
# Not referenced by any cell visible in this part of the notebook.
history_agg_table = ""
# Destination of the final rating query; also read back to train the model.
final_rating_table = ""
# GCS bucket that receives top_visited_blogs.json; replace the placeholder.
bucket_name = "{Bucket Name}"

In [3]:
# BigQuery client shared by every query in this notebook.
client = bigquery.Client()

# Jinja2 environment that loads the SQL templates from the local "queries" directory.
env = Environment(loader=FileSystemLoader('queries'))

# Handle to the GCS bucket named by `bucket_name`.
bucket = storage.Client().get_bucket(bucket_name)

# Current time in the Asia/Kolkata timezone.
# NOTE(review): despite its name, `now_utc` is NOT in UTC; confirm which one is intended.
kolkata_tz = timezone('Asia/Kolkata')
now_utc = datetime.now(kolkata_tz)

# Full weekday name of the current day (strftime '%A').
prediction_day = now_utc.strftime('%A')

## Check the latest date of the basic history to update it

In [4]:
# Ask BigQuery for the most recent date already present in the basic history table.
basic_history_date_query = "SELECT max(date) from `{}.{}.{}`".format(
    project_id, dataset_id, basic_history_table
)
basic_history_date_job = client.query(basic_history_date_query)

# The result is a single cell; it is parsed below as a YYYYMMDD string.
latest_rows = basic_history_date_job.to_dataframe().values.tolist()
latest_loaded_day = datetime.strptime(latest_rows[0][0], "%Y%m%d").date()

one_day = timedelta(days=1)

# Load window: from the day after the latest loaded date through yesterday
# (relative to `now_utc`), both formatted as YYYYMMDD.
from_date = (latest_loaded_day + one_day).strftime("%Y%m%d")
to_date = (now_utc.date() - one_day).strftime("%Y%m%d")

## Update the basic history from the dates generated above

In [15]:
# Render the SQL that selects history rows for the [from_date, to_date] window.
basic_history_template = env.get_template("basic_history.jinja2")
basic_history_query = basic_history_template.render(from_date=from_date, to_date=to_date)

# Fully-qualified destination table for the appended rows.
basic_table_id = "{}.{}.{}".format(project_id, dataset_id, basic_history_table)

# Time partitioning with the library's default settings.
day_partition = bigquery.TimePartitioning()

# Append to the destination table, creating it first if it does not exist.
job_config = bigquery.QueryJobConfig(
    destination=basic_table_id,
    time_partitioning=day_partition,
    write_disposition=bigquery.WriteDisposition.WRITE_APPEND,
    create_disposition=bigquery.CreateDisposition.CREATE_IF_NEEDED,
)

# Submit the query and block until BigQuery has finished writing the results.
query_job = client.query(basic_history_query, job_config=job_config)
query_job.result()

print("Query results loaded to the table {}".format(basic_table_id))

Query results loaded to the table quantum-flood-755.daily_serving_blog_reco.blog_basic_history


## Top visited blogs in the last 30 days (T-30)

In [169]:
# Render the SQL that ranks blogs by visits in the basic history table.
top_visited_template = env.get_template("top_visited_blogs.jinja2")
top_visited_query = top_visited_template.render(
    project_id=project_id,
    dataset_id=dataset_id,
    basic_history_table=basic_history_table
)

# Run as a plain read-only query. The shared `job_config` must NOT be passed
# here: it carries a destination table and a write disposition from another
# cell, so reusing it would write this query's rows into that table
# (appending to the history table or truncating the rating table).
query_job = client.query(top_visited_query)
top_blogs = query_job.to_dataframe()

# Payload uploaded to GCS: the page paths in the order returned by the query.
top_visited_blogs = {'recommended_blogs': top_blogs['pagepath'].tolist()}

# Upload the JSON document to the bucket.
blob = bucket.blob("top_visited_blogs.json")
blob.upload_from_string(
    data=json.dumps(top_visited_blogs),
    content_type='application/json',
)

## Prepare ratings from the `timeonpage` values

In [11]:
# Render the SQL that derives ratings from the basic history table.
final_rating_query = env.get_template("final_rating.jinja2").render(
    project_id=project_id,
    dataset_id=dataset_id,
    basic_history_table=basic_history_table,
)

# Fully-qualified destination table for the ratings.
rating_table_id = "{}.{}.{}".format(project_id, dataset_id, final_rating_table)

# Replace the destination table's contents on every run, creating it if needed.
job_config = bigquery.QueryJobConfig(
    destination=rating_table_id,
    write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE,
    create_disposition=bigquery.CreateDisposition.CREATE_IF_NEEDED,
)

# Submit the query and block until the table has been written.
query_job = client.query(final_rating_query, job_config=job_config)
query_job.result()

print("Query results loaded to the table {}".format(rating_table_id))

Query results loaded to the table quantum-flood-755.daily_serving_blog_reco.blog_mf_data


## Fetch the rating from the table

In [4]:
import turicreate as tc
import pandas as pd

In [5]:
# Read the whole rating table, returning clientId with a leading single quote.
# NOTE(review): the quote is stripped again later; presumably it keeps the id
# from being interpreted as a number somewhere along the way. Confirm.
rating_data = (
    """SELECT * except(clientId), CONCAT("'", clientId) as clientId FROM `{}.{}.{}`"""
    .format(project_id, dataset_id, final_rating_table)
)

# Run the query and materialise the result as a pandas DataFrame.
query_job = client.query(rating_data)
main_rating_df = query_job.to_dataframe()

In [6]:
# Preview the ratings pulled from BigQuery; clientId still carries the
# leading "'" added by the query at this point.
main_rating_df

Unnamed: 0,blog,pagepath,rating,clientId
0,national-pension-scheme-and-nps-work,/blogs/miscellaneous/national-pension-scheme-a...,0.282959,'1000085904.1632291810
1,investing-in-nps-for-retirement-saving,/blogs/nps/investing-in-nps-for-retirement-sav...,0.929877,'1000085904.1632291810
2,what-is-the-mclr-and-how-does-it-affect-the-ec...,/blogs/personal-loan/what-is-the-mclr-and-how-...,0.035423,'1000168273.1633060672
3,do-you-use-credit-card-know-the-pros-and-cons,/blogs/credit-card/do-you-use-credit-card-know...,0.833788,'1000238702.1633184250
4,what-is-home-loan,/blogs/home-loan/what-is-home-loan.page,0.639788,'1000300182.1633274296
...,...,...,...,...
71434,everything-you-need-to-know-about-credit-card-...,/blogs/credit-card/everything-you-need-to-know...,0.236486,'986109707.1608819612
71435,everything-you-need-to-know-about-credit-card-...,/blogs/credit-card/everything-you-need-to-know...,0.041424,'987302482.1624551974
71436,everything-you-need-to-know-about-credit-card-...,/blogs/credit-card/everything-you-need-to-know...,0.084726,'989028703.1631952181
71437,everything-you-need-to-know-about-credit-card-...,/blogs/credit-card/everything-you-need-to-know...,0.030706,'989917153.1633102225


In [7]:
# One row per (clientId, blog) pair holding the mean rating, with the leading
# quote removed from clientId.
rating_df = (
    main_rating_df[['clientId', 'blog', 'rating']]
    .groupby(['clientId', 'blog'])
    .agg({'rating': 'mean'})
    .reset_index()
    .assign(clientId=lambda frame: frame['clientId'].map(lambda cid: cid.lstrip("'")))
)

In [8]:
# Convert the pandas ratings into a Turi Create SFrame.
actions = tc.SFrame(rating_df)

# Fit an item-similarity recommender: users are clientIds, items are blogs,
# and `rating` is supplied as the target column.
model = tc.recommender.item_similarity_recommender.create(
    actions,
    user_id='clientId',
    item_id='blog',
    target="rating",
)

In [9]:
# Generate recommendations using the model's default settings (no arguments).
# NOTE(review): the sample output further down shows ranks 1-10 for one
# client, so the default looks like 10 items per user. Confirm in the docs.
results = model.recommend()

In [99]:
# Spot-check a single client: blogs they have rated vs. blogs recommended to them.
clientid = "1000040434.1634360262"

# Blogs this client already has ratings for.
is_client_history = rating_df['clientId'] == clientid
visitied_blogs = rating_df.loc[is_client_history, 'blog'].tolist()

# Blogs the model recommends for this client.
is_client_reco = results['clientId'] == clientid
predicted_blogs = results[is_client_reco]['blog']

visited_text = "\n".join(visitied_blogs)
predicted_text = "\n".join(predicted_blogs)
print("Visited Blogs:\n{}\n".format(visited_text))
print("Predicted Blogs:\n{}".format(predicted_text))

Visited Blogs:
car-loan-tax-benefits-and-how-to-claim-it

Predicted Blogs:
how-to-get-a-car-loan-online-in-easy-steps
5-reasons-why-car-insurance-is-a-must
pre-approved-for-car-loan
things-to-consider-before-applying-for-car-loan
tax-benefits-on-loan-against-property
life-insurance-a-smart-investment-for-uncertain-times
can-get-car-loan-second-hand-car
tax-saving-fixed-deposits
types-of-savings-account
5-reasons-to-review-your-life-insurance-cover


In [115]:
# Collect each client's recommended blog slugs into one list per client.
# Built directly from `results`, because `asd` itself is only created by the
# merge cell further down; the previous self-referential form raised a
# NameError when the notebook was run top to bottom on a fresh kernel.
asd = (
    results.to_dataframe()
    .groupby("clientId")['blog']
    .apply(list)
    .reset_index()
)

In [10]:
# Convert the recommendation results to a pandas DataFrame so they can be
# joined with pandas in the merge cell below.
q = results.to_dataframe()

In [11]:
# Remove the leading single quote that the rating query prepended to clientId.
main_rating_df["clientId"] = [
    client_id.lstrip("'") for client_id in main_rating_df['clientId']
]
main_rating_df.head()

Unnamed: 0,blog,pagepath,rating,clientId
0,national-pension-scheme-and-nps-work,/blogs/miscellaneous/national-pension-scheme-a...,0.282959,1000085904.1632292
1,investing-in-nps-for-retirement-saving,/blogs/nps/investing-in-nps-for-retirement-sav...,0.929877,1000085904.1632292
2,what-is-the-mclr-and-how-does-it-affect-the-ec...,/blogs/personal-loan/what-is-the-mclr-and-how-...,0.035423,1000168273.163306
3,do-you-use-credit-card-know-the-pros-and-cons,/blogs/credit-card/do-you-use-credit-card-know...,0.833788,1000238702.1633184
4,what-is-home-loan,/blogs/home-loan/what-is-home-loan.page,0.639788,1000300182.1633275


In [31]:
# Distinct (blog, pagepath) pairs, re-indexed from 0; used to look up a blog's page path.
blog_tags = (
    main_rating_df[['blog', 'pagepath']]
    .drop_duplicates()
    .reset_index(drop=True)
)

In [40]:
# Attach each recommended blog's page path; the left join keeps every recommendation row.
asd = q.merge(blog_tags, how='left', on='blog')

In [41]:
# One row per client, with that client's recommended page paths gathered into a list.
pagepaths_by_client = asd.groupby("clientId")['pagepath']
asd_1 = pagepaths_by_client.apply(list).reset_index()

In [43]:
# Show the merged recommendation rows for one sample client.
asd.loc[asd['clientId'] == '1000040434.1634360262']

Unnamed: 0,clientId,blog,score,rank,pagepath
0,1000040434.163436,how-to-get-a-car-loan-online-in-easy-steps,0.015504,1,/blogs/car-loan/how-to-get-a-car-loan-online-i...
1,1000040434.163436,5-reasons-why-car-insurance-is-a-must,0.009486,2,/blogs/car-loan/5-reasons-why-car-insurance-is...
2,1000040434.163436,pre-approved-for-car-loan,0.004566,3,/blogs/loan/pre-approved-for-car-loan.page
3,1000040434.163436,things-to-consider-before-applying-for-car-loan,0.004188,4,/blogs/loan/things-to-consider-before-applying...
4,1000040434.163436,tax-benefits-on-loan-against-property,0.002347,5,/blogs/home-loan/tax-benefits-on-loan-against-...
5,1000040434.163436,life-insurance-a-smart-investment-for-uncertai...,0.001654,6,/blogs/life-insurance/life-insurance-a-smart-i...
6,1000040434.163436,can-get-car-loan-second-hand-car,0.001629,7,/blogs/loan/can-get-car-loan-second-hand-car.page
7,1000040434.163436,tax-saving-fixed-deposits,0.001512,8,/blogs/fixed-deposits/tax-saving-fixed-deposit...
8,1000040434.163436,types-of-savings-account,0.00102,9,/blogs/saving-account/types-of-savings-account...
9,1000040434.163436,5-reasons-to-review-your-life-insurance-cover,0.000845,10,/blogs/life-insurance/5-reasons-to-review-your...


In [58]:
from tqdm import tqdm

PARENT_FOLDER = "recommended_blogs"

# Start from an empty output folder so files from an earlier run never linger.
shutil.rmtree(PARENT_FOLDER, ignore_errors=True)
os.makedirs(PARENT_FOLDER, exist_ok=True)

# Write one "<clientId>.json" file per client holding its recommended page paths.
# Iterating over the two columns avoids building a Series per row, and passing
# `total` lets the progress bar show a percentage and ETA instead of a bare count.
for clientid, pagepaths in tqdm(
    zip(asd_1["clientId"], asd_1["pagepath"]),
    total=len(asd_1),
):
    path = os.path.join(PARENT_FOLDER, clientid + ".json")
    reco_blog = {'recommended': pagepaths}
    with open(path, 'w') as f:
        json.dump(reco_blog, f)

66922it [00:16, 3985.07it/s]


In [55]:
# Print the client id and recommended page paths for the first 10 clients.
for record in asd_1.head(10).itertuples(index=False):
    print(record.clientId)
    print(record.pagepath)

1000040434.1634360262
['/blogs/car-loan/how-to-get-a-car-loan-online-in-easy-steps.page', '/blogs/car-loan/5-reasons-why-car-insurance-is-a-must.page', '/blogs/loan/pre-approved-for-car-loan.page', '/blogs/loan/things-to-consider-before-applying-for-car-loan.page', '/blogs/home-loan/tax-benefits-on-loan-against-property.page', '/blogs/life-insurance/life-insurance-a-smart-investment-for-uncertain-times.page', '/blogs/loan/can-get-car-loan-second-hand-car.page', '/blogs/fixed-deposits/tax-saving-fixed-deposits.page', '/blogs/saving-account/types-of-savings-account.page', '/blogs/life-insurance/5-reasons-to-review-your-life-insurance-cover.page']
1000085904.1632291810
['/blogs/investment/is_it_good_idea_to_invest_in_national_pension_scheme.page', '/blogs/nps/nps-tax-benefits.page', '/blogs/nps/nps-returns.page', '/blogs/nps/save-tax-with-nps.page', '/blogs/nps/everything-you-need-to-know-about-nps.page', '/blogs/personal-loan/advantages-of-personal-loan-to-raise-funds.page', '/blogs/inve