# Data Generation for Boosting and Freshness

In this notebook we'll generate sample datasets to test and simulate freshness.
We'll generate two datasets and upload them into two different containers, one per approach.

We'll cover two approaches to handling freshness boosting using these datasets:
1. Adding a freshness value field and ranking based on that value
2. Adding multiple date fields, one per document type, and ranking based on the built-in datetime freshness

### Provide Config file for Azure Services

In [1]:
# For direct upload to Blob Storage, define the config file below in the "notebooks" folder
# myconfig.py
# stg_acct = 'StorageName'
# stg_key = 'BlobStorageKey'
# stg_conn = 'ConnectionString'
# container_freshness = 'freshness'
# container_multidatefield = 'multidatefield'
# cognitive_services_key = 'CognitiveServicesKey'
# cognitive_services_endpoint = 'https://westeurope.api.cognitive.microsoft.com/'

### Import libraries and define functions

In [2]:
# Install faker to generate names and countries
#!pip install Faker

# Install sdk for Azure Cognitive Services Text Analytics
#!pip install azure-ai-textanalytics

# Install Azure Blob Storage Client Library for Python
#!pip install azure-storage-blob

from myconfig import *
from azure.storage.blob import generate_blob_sas, BlobSasPermissions, ContainerSasPermissions , BlobClient
from azure.core.credentials import AzureKeyCredential
from azure.ai.textanalytics import TextAnalyticsClient
from faker import Faker

import pandas as pd 
import json
import random
import time

def convert_date(start, end, format, prop):
    """Return the timestamp a fraction ``prop`` of the way from ``start`` to ``end``.

    Both bounds are parsed with ``format``, interpreted as UTC, interpolated
    linearly on the epoch-seconds axis, and rendered again with ``format``.

    Args:
        start: Lower bound, a string matching ``format``.
        end: Upper bound, a string matching ``format``.
        format: Format string understood by ``time.strptime``/``time.strftime``.
        prop: Interpolation fraction; 0 yields ``start`` and 1 yields ``end``.

    Returns:
        The interpolated timestamp as a string formatted with ``format``.

    Raises:
        ValueError: If ``start`` or ``end`` does not match ``format``.
    """
    # Function-scope import so this definition does not rely on the import cell.
    import calendar

    # timegm/gmtime treat the parsed fields as UTC. The previous
    # mktime/localtime round trip went through the machine's local time zone,
    # so a range spanning a DST change produced results shifted by part of the
    # DST offset, and output differed from machine to machine.
    stime = calendar.timegm(time.strptime(start, format))
    etime = calendar.timegm(time.strptime(end, format))
    ptime = stime + prop * (etime - stime)
    return time.strftime(format, time.gmtime(ptime))

def get_random_date(start, end, prop):
    """Interpolate between two ``YYYY-MM-DDTHH:MM:SSZ`` timestamps.

    Delegates to ``convert_date`` with the fixed timestamp format used
    throughout this notebook. ``prop`` is the interpolation fraction; callers
    in this notebook pass ``random.random()`` to obtain a random date.
    """
    timestamp_format = '%Y-%m-%dT%H:%M:%SZ'
    return convert_date(start, end, timestamp_format, prop)

def extend_date(jsonData):
    """Add one ``published_*`` date field per document frequency.

    Every per-frequency field is first set to a fixed placeholder date. The
    field that corresponds to ``jsonData['docFrequency']`` is then overwritten
    with ``jsonData['published']``. An unrecognised frequency leaves all five
    fields at the placeholder.

    The dictionary is modified in place and also returned.
    """
    placeholder = '1900-01-01T00:00:00Z'
    # Frequency label -> field that receives the real publish date.
    # The order here is the order in which the fields are added to the document.
    field_for_frequency = {
        "Daily": 'published_d',
        "Weekly": 'published_w',
        "Bi-Weekly": 'published_ah',
        "Monthly": 'published_m',
        "Yearly": 'published_y',
    }

    frequency = jsonData['docFrequency']

    for field in field_for_frequency.values():
        jsonData[field] = placeholder

    matching_field = next(
        (field for label, field in field_for_frequency.items() if frequency == label),
        None,
    )
    if matching_field is not None:
        jsonData[matching_field] = jsonData['published']

    return jsonData

def extract_keyphrases(content):
    """Return the key phrases the Text Analytics service finds in ``content``.

    ``content`` is sent as a single English document, using the key and
    endpoint taken from the notebook configuration. Results flagged as errors
    are skipped, so a failed document yields an empty list rather than an
    exception.
    """
    credential = AzureKeyCredential(cognitive_services_key)
    client = TextAnalyticsClient(cognitive_services_endpoint, credential)

    results = client.extract_key_phrases([content], language="en")
    successful = [item for item in results if not item.is_error]

    # Flatten the per-document phrase lists into one list.
    return [phrase for item in successful for phrase in item.key_phrases]

def extract_entities(content):
    """Return the distinct named entities the Text Analytics service finds in ``content``.

    ``content`` is sent as a single English document, using the key and
    endpoint taken from the notebook configuration. Results flagged as errors
    are skipped. Only entities whose category is one of Location, Person,
    Skill, Organization, Event or Product are kept.

    Returns:
        A list of unique entity texts in order of first appearance.
    """
    credential = AzureKeyCredential(cognitive_services_key)
    endpoint = cognitive_services_endpoint

    text_analytics_client = TextAnalyticsClient(endpoint, credential)
    response = text_analytics_client.recognize_entities([content], language="en")

    entityCategories = {"Location", "Person", "Skill", "Organization", "Event", "Product"}

    # Exact membership test. The previous `entity.category in c` check was a
    # substring test against each allowed name, so any category string that
    # happened to be a fragment of an allowed name would also have matched.
    matchingTexts = (
        entity.text
        for doc in response
        if not doc.is_error
        for entity in doc.entities
        if entity.category in entityCategories
    )

    # dict.fromkeys removes duplicates while keeping first-seen order, so the
    # output is the same on every run (set iteration order is not).
    return list(dict.fromkeys(matchingTexts))

### 1. Import News Dataset from Kaggle

In [5]:
# The dataset is downloaded from a Kaggle competition:
# https://www.kaggle.com/c/learn-ai-bbc/data?select=BBC+News+Train.csv
# The CSV is expected to be saved locally as ../data/bbc/BBC_News_Train.csv
# (path relative to this notebook).
data = pd.read_csv("../data/bbc/BBC_News_Train.csv")
data

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business
...,...,...,...
1485,857,double eviction from big brother model caprice...,entertainment
1486,325,dj double act revamp chart show dj duo jk and ...,entertainment
1487,1590,weak dollar hits reuters revenues at media gro...,business
1488,1587,apple ipod family expands market apple has exp...,tech


In [6]:
# List the distinct values of the Category column
data['Category'].unique()

array(['business', 'tech', 'politics', 'sport', 'entertainment'],
      dtype=object)

#### Load Faker and define values

In [7]:
# Create the Faker instance and smoke-test it by printing a generated name and text.
# `fake` is reused by the generation loops to produce author names and countries.
fake = Faker()
print(fake.name(),"-", fake.text())

Sara Salazar - Never every employee each by. Production blue course event from just upon. Quite democratic quality.
Central certain film south. Onto message think account need.


In [8]:
# Smoke-test get_random_date: print a random timestamp between the two bounds
print(get_random_date("2020-04-07T15:24:04Z", "2020-07-28T15:24:04Z", random.random()))

2020-06-25T10:53:25Z


In [9]:
# Define the list of document frequency types.
# These strings must match exactly the values extend_date compares against.
freqList = ['Daily','Weekly','Bi-Weekly','Monthly','Yearly']

In [10]:
# Define the slice of the data to process. Every row in `dataslice` is sent to
# the Text Analytics service and uploaded, so use the small slice for a test run.
#dataslice = data[0:5] # get first 5 items for test
dataslice = data       # assign full dataframe

### 2. Generate Data with Freshness Value

In [None]:
# Generate metadata with freshness value and upload to Blob Storage
for index, row in dataslice.iterrows():
    # Build one JSON document per article. Dict-literal values are evaluated
    # top to bottom, so the random/Faker/service calls run in the listed order.
    jsondata = {
        'docFrequency': random.choice(freqList),
        'published': get_random_date("2020-04-01T00:00:00Z", "2020-07-29T00:00:00Z", random.random()),
        'author': fake.name(),
        'location': fake.country(),
        'articleId': row['ArticleId'],
        'content': row['Text'],
        'category': row['Category'],
        'keyPhrases': extract_keyphrases(row['Text']),
        'entities': extract_entities(row['Text']),
    }
    # Progress indicator: one article id per processed row.
    print(jsondata['articleId'])

    # Upload as "<articleId>.json", replacing any blob with the same name.
    blob_client = BlobClient.from_connection_string(
        conn_str=stg_conn,
        container_name=container_freshness,
        blob_name=f"{jsondata['articleId']}.json",
    )
    blob_client.upload_blob(json.dumps(jsondata), overwrite=True)

### 3. Generate MultiDate Field with Freshness Value

In [None]:
# Generate metadata with Multidate fields and upload to Blob Storage
for index, row in dataslice.iterrows():
    # Build one JSON document per article. Dict-literal values are evaluated
    # top to bottom, so the random/Faker/service calls run in the listed order.
    jsondata = {
        'docFrequency': random.choice(freqList),
        'published': get_random_date("2020-04-01T00:00:00Z", "2020-07-29T00:00:00Z", random.random()),
        'author': fake.name(),
        'location': fake.country(),
        'articleId': row['ArticleId'],
        'content': row['Text'],
        'category': row['Category'],
        'keyPhrases': extract_keyphrases(row['Text']),
        'entities': extract_entities(row['Text']),
    }
    # Add the per-frequency published_* date fields.
    jsondata = extend_date(jsondata)
    # Progress indicator: one article id per processed row.
    print(jsondata['articleId'])

    # Upload as "<articleId>.json", replacing any blob with the same name.
    blob_client = BlobClient.from_connection_string(
        conn_str=stg_conn,
        container_name=container_multidatefield,
        blob_name=f"{jsondata['articleId']}.json",
    )
    blob_client.upload_blob(json.dumps(jsondata), overwrite=True)