# Data Generation for Boosting and Freshness

In [11]:
# To upload directly to Blob Storage, create the config file below in the "notebooks" folder:
# myconfig.py
# stg_acct = 'StorageName'
# stg_key = 'BlobStorageKey'
# stg_conn = 'ConnectionString'
# container_freshness = 'freshness'
# container_multidatefield = 'multidatefield'
# cognitive_services_key = 'CognitiveServicesKey'
# cognitive_services_endpoint = 'https://westeurope.api.cognitive.microsoft.com/'

### Import libraries and define functions

In [12]:
# Install faker to generate names and countries
#!pip install Faker

# Install sdk for Azure Cognitive Services Text Analytics
#!pip install azure-ai-textanalytics

from myconfig import *
from azure.storage.blob import generate_blob_sas, BlobSasPermissions, ContainerSasPermissions , BlobClient
from azure.core.credentials import AzureKeyCredential
from azure.ai.textanalytics import TextAnalyticsClient
from faker import Faker

import pandas as pd 
import json
import random
import time


def convert_date(start, end, format, prop):
    """Return the timestamp lying a fraction ``prop`` of the way from start to end.

    The parsed date fields are interpreted as UTC and the result is rendered
    in UTC, so the output does not depend on the machine's time zone and is
    not skewed by daylight-saving transitions between the two bounds.

    Args:
        start: Lower bound, a string matching ``format``.
        end: Upper bound, a string matching ``format``.
        format: ``time.strptime``/``time.strftime`` pattern used both for
            parsing the bounds and for rendering the result.
        prop: Interpolation fraction; 0 yields ``start`` and 1 yields ``end``.

    Returns:
        The interpolated timestamp as a string rendered with ``format``.

    Raises:
        ValueError: If ``start`` or ``end`` does not match ``format``.
    """
    # Imported locally so the function does not rely on a file-level import.
    import calendar

    # calendar.timegm is the UTC counterpart of time.mktime: it converts the
    # parsed fields to epoch seconds without applying the local UTC offset.
    start_secs = calendar.timegm(time.strptime(start, format))
    end_secs = calendar.timegm(time.strptime(end, format))
    picked_secs = start_secs + prop * (end_secs - start_secs)
    # Render with gmtime (not localtime) to stay in the same UTC frame.
    return time.strftime(format, time.gmtime(picked_secs))

def get_random_date(start, end, prop):
    """Pick a timestamp between ``start`` and ``end`` using ``convert_date``.

    Both bounds and the result use the ``YYYY-MM-DDTHH:MM:SSZ`` layout.
    ``prop`` is the position inside the interval; the function itself draws
    no random numbers, so callers pass a random fraction to get a random date.
    """
    timestamp_pattern = '%Y-%m-%dT%H:%M:%SZ'
    return convert_date(start, end, timestamp_pattern, prop)


def extendDate(jsonData):
    """Add one ``published_*`` field per document frequency to ``jsonData``.

    Every frequency-specific field is first set to the placeholder
    ``1900-01-01T00:00:00Z``; the field that corresponds to the document's
    ``docFrequency`` is then overwritten with the value of ``published``.
    An unrecognized frequency leaves all five fields at the placeholder.

    The dictionary is modified in place and the same object is returned.
    """
    frequency = jsonData['docFrequency']
    placeholder = '1900-01-01T00:00:00Z'

    # (frequency label, target field), listed in the order the fields are added.
    field_for_frequency = (
        ("Daily", 'published_d'),
        ("Weekly", 'published_w'),
        ("Bi-Weekly", 'published_ah'),
        ("Monthly", 'published_m'),
        ("Yearly", 'published_y'),
    )

    for _, field in field_for_frequency:
        jsonData[field] = placeholder

    for label, field in field_for_frequency:
        if frequency == label:
            jsonData[field] = jsonData['published']
            break

    return jsonData

def extract_KeyPhrases(content):
    """Return the key phrases the Text Analytics service finds in ``content``.

    A client is built from ``cognitive_services_key`` and
    ``cognitive_services_endpoint`` and ``content`` is submitted as a single
    English document. Results flagged as errors are skipped, so a failed
    request for the document yields an empty list.
    """
    credential = AzureKeyCredential(cognitive_services_key)
    client = TextAnalyticsClient(cognitive_services_endpoint, credential)

    results = client.extract_key_phrases([content], language="en")

    phrases = []
    for item in results:
        if item.is_error:
            continue
        phrases.extend(item.key_phrases)
    return phrases

def extract_Entities(content):
    """Return the distinct entity texts of selected categories found in ``content``.

    A client is built from ``cognitive_services_key`` and
    ``cognitive_services_endpoint`` and ``content`` is submitted as a single
    English document. Results flagged as errors are skipped.

    Only entities whose category is exactly one of Location, Person, Skill,
    Organization, Event or Product are kept. Duplicates are removed and the
    remaining texts are returned in the order they were first seen.
    """
    credential = AzureKeyCredential(cognitive_services_key)
    endpoint = cognitive_services_endpoint

    text_analytics_client = TextAnalyticsClient(endpoint, credential)
    response = text_analytics_client.recognize_entities([content], language="en")

    # Exact membership: a substring test against each category name would
    # also accept an empty or partial category string.
    wanted_categories = {"Location", "Person", "Skill", "Organization", "Event", "Product"}

    matched_texts = [
        entity.text
        for doc in response
        if not doc.is_error
        for entity in doc.entities
        if entity.category in wanted_categories
    ]

    # dict.fromkeys de-duplicates while keeping first-seen order, so the
    # output is reproducible (a set would yield an arbitrary order).
    return list(dict.fromkeys(matched_texts))

### 1. Import News Dataset from Kaggle

In [13]:
# The dataset is downloaded from the Kaggle competition:
# https://www.kaggle.com/c/learn-ai-bbc/data?select=BBC+News+Train.csv
# Load the training CSV into a DataFrame and display it.
data = pd.read_csv("../data/bbc/BBC News Train.csv")
data

Unnamed: 0,ArticleId,Text,Category
0,1833,worldcom ex-boss launches defence lawyers defe...,business
1,154,german business confidence slides german busin...,business
2,1101,bbc poll indicates economic gloom citizens in ...,business
3,1976,lifestyle governs mobile choice faster bett...,tech
4,917,enron bosses in $168m payout eighteen former e...,business
...,...,...,...
1485,857,double eviction from big brother model caprice...,entertainment
1486,325,dj double act revamp chart show dj duo jk and ...,entertainment
1487,1590,weak dollar hits reuters revenues at media gro...,business
1488,1587,apple ipod family expands market apple has exp...,tech


In [14]:
# List the distinct values of the 'Category' column
data['Category'].unique()

array(['business', 'tech', 'politics', 'sport', 'entertainment'],
      dtype=object)

In [15]:
# Create the Faker instance used to generate synthetic author names and
# countries, and print a sample name and text as a quick check.
fake = Faker()
print(fake.name(),"-", fake.text())

Alexander Taylor - Young future himself close. See find create executive organization audience whole.
Both foot look. Keep hour soon officer bag.


In [16]:
# Quick check of get_random_date: print a timestamp at a random position
# between the two bounds.
print(get_random_date("2020-04-07T15:24:04Z", "2020-07-28T15:24:04Z", random.random()))

2020-06-16T16:49:59Z


In [17]:
# Document frequency labels; one is picked at random for each generated
# document. These are the values extendDate recognizes.
freqList = ['Daily','Weekly','Bi-Weekly','Monthly','Yearly']

In [18]:
# Rows to process. For a quick test run, use the commented-out slice
# (first five rows) instead of the full dataset.
#dataslice = data[0:5]
dataslice = data

### 2. Generate Data with Freshness Value

In [19]:
# Generate metadata with a freshness value for every article and upload each
# document to Blob Storage as "<articleId>.json".
for index, row in dataslice.iterrows():
    # Keys are listed in the order they appear in the uploaded JSON; the
    # random and Faker calls are evaluated in that same order.
    jsondata = {
        'docFrequency': random.choice(freqList),
        'published': get_random_date("2020-04-01T00:00:00Z", "2020-07-29T00:00:00Z", random.random()),
        'author': fake.name(),
        'location': fake.country(),
        'articleId': row['ArticleId'],
        'content': row['Text'],
        'category': row['Category'],
        'keyPhrases': extract_KeyPhrases(row['Text']),
        'entities': extract_Entities(row['Text']),
        'freshness': random.uniform(0, 1),
    }
    # Progress indicator: one article id per processed row.
    print(jsondata['articleId'])
    blob_client = BlobClient.from_connection_string(
        conn_str=stg_conn,
        container_name=container_freshness,
        blob_name=str(jsondata['articleId'])+ '.json',
    )
    blob_client.upload_blob(json.dumps(jsondata), overwrite=True)

1833
154
1101
1976
917
1582
651
1797
2034
1866
1683
1153
1028
812
707
1588
342
486
1344
1552
1547
177
1785
1617
405
1561
702
1026
1527
1503
1951
1407
2002
2100
466
687
1009
805
771
1532
2205
2000
953
1394
1522
455
593
590
277
90
904
527
1763
42
1364
1418
643
40
1518
2046
464
180
476
2017
315
96
1079
947
1742
972
210
2144
1766
1971
1303
1638
79
1055
1804
1929
371
445
105
1297
1932
1458
2127
1321
699
1738
639
1756
212
1566
1544
1962
1474
603
1220
891
1259
813
1680
136
684
777
618
1843
2048
334
2093
226
506
974
1408
461
1745
1133
1844
1249
1515
1425
2061
39
1091
865
634
306
1825
764
997
1361
1945
267
227
197
749
380
251
690
1411
1235
932
1487
611
775
1523
303
91
1184
449
1778
1469
1260
2044
19
666
853
844
1919
344
840
1596
1769
1771
378
2079
183
413
1066
1557
1645
406
867
1392
490
537
1205
1964
1330
1711
876
1021
2082
627
1400
61
2179
1213
1299
1264
2077
1119
828
893
1164
1696
396
1862
559
439
601
147
736
1412
1878
261
273
1022
760
50
850
1930
1060
31
761
316
171
1175
1047
1343
1934
82
10

### 3. Generate MultiDate Field with Freshness Value

In [20]:
# Generate metadata with per-frequency date fields for every article and
# upload each document to Blob Storage as "<articleId>.json".
for index, row in dataslice.iterrows():
    # extendDate appends the published_* fields to the dictionary and returns it.
    jsondata = extendDate({
        'docFrequency': random.choice(freqList),
        'published': get_random_date("2020-04-01T00:00:00Z", "2020-07-29T00:00:00Z", random.random()),
        'author': fake.name(),
        'location': fake.country(),
        'articleId': row['ArticleId'],
        'content': row['Text'],
        'category': row['Category'],
        'keyPhrases': extract_KeyPhrases(row['Text']),
        'entities': extract_Entities(row['Text']),
    })
    # Progress indicator: one article id per processed row.
    print(jsondata['articleId'])
    blob_client = BlobClient.from_connection_string(
        conn_str=stg_conn,
        container_name=container_multidatefield,
        blob_name=str(jsondata['articleId'])+ '.json',
    )
    blob_client.upload_blob(json.dumps(jsondata), overwrite=True)

1833
154
1101
1976
917
1582
651
1797
2034
1866
1683
1153
1028
812
707
1588
342
486
1344
1552
1547
177
1785
1617
405
1561
702
1026
1527
1503
1951
1407
2002
2100
466
687
1009
805
771
1532
2205
2000
953
1394
1522
455
593
590
277
90
904
527
1763
42
1364
1418
643
40
1518
2046
464
180
476
2017
315
96
1079
947
1742
972
210
2144
1766
1971
1303
1638
79
1055
1804
1929
371
445
105
1297
1932
1458
2127
1321
699
1738
639
1756
212
1566
1544
1962
1474
603
1220
891
1259
813
1680
136
684
777
618
1843
2048
334
2093
226
506
974
1408
461
1745
1133
1844
1249
1515
1425
2061
39
1091
865
634
306
1825
764
997
1361
1945
267
227
197
749
380
251
690
1411
1235
932
1487
611
775
1523
303
91
1184
449
1778
1469
1260
2044
19
666
853
844
1919
344
840
1596
1769
1771
378
2079
183
413
1066
1557
1645
406
867
1392
490
537
1205
1964
1330
1711
876
1021
2082
627
1400
61
2179
1213
1299
1264
2077
1119
828
893
1164
1696
396
1862
559
439
601
147
736
1412
1878
261
273
1022
760
50
850
1930
1060
31
761
316
171
1175
1047
1343
1934
82
10