In [21]:
import gzip
import json
import os
import random
import urllib
import urllib.parse

import joblib
import numpy as np
import pandas as pd
from azure.identity import DefaultAzureCredential
from azure.keyvault.secrets import SecretClient
from sqlalchemy import create_engine
from transformers import pipeline

In [22]:
## Login to Azure Key Vault

# Endpoint of the Azure Key Vault that secrets are read from
vault_url = 'https://kv-colmex.vault.azure.net/'

# Credential object handed to the Key Vault client; no secret is typed into
# the notebook itself
credential = DefaultAzureCredential()

# Client used by later cells to fetch individual secrets by name
client = SecretClient(credential=credential, vault_url=vault_url)

In [23]:
# Read metadata corresponding to reviews sample
# Only these four columns are kept, so project them at read time instead of
# loading every column of the file and discarding the rest afterwards.
df = pd.read_parquet(
    "../data_files/metadata_industry.parquet",
    columns=['asin', 'brand', 'main_cat', 'title'],
)
# ASINs of the products that appear in the reviews sample
asin_reviews_sample = ['B000UXXUSG', 'B0002Q80TA', 'B0026PI8JC', 'B000RA5BMK', 'B00EZKO8J0', 'B015724OVG', 'B005CSF1JK', 'B00E6LJBUO', 'B00505VMXY', 'B000Q6ZK3K', 'B0091KOI8K', 'B000XHE0Q0', 'B0095CC1PQ', 'B00E6LJAOG', 'B00JTE1Z8E', 'B000W3RSGE', 'B001B5L5SY', 'B00DQG8SR2', 'B000SKZIXG', 'B000IVEM5K', 'B00E6LJ2SA', 'B004QK8FBG', 'B000F7VRIC', 'B00CFRF7UY', 'B015VYC606', 'B0009XB160', 'B000K757W4', 'B0026PEQES', 'B003PDMNCC', 'B002DHLUWK', 'B015724RQI', 'B005AAWLYS', 'B002MFSRIS', 'B013XFGA6G', 'B008S0IV9G', 'B00CTTEKJW', 'B00EZQYBZ0', 'B015IHWAZW', 'B004A7JMSU', 'B01326J80Q', 'B0014X5XEK', '3959828276', 'B012F7PNPM', 'B003DU34P2', 'B0002XHAX0', 'B001C31OZY', 'B000ES4PYU', 'B00ENFVLAQ', 'B009348X7Q', 'B001C3MHTQ', 'B00EZJRS8E', 'B015NN1E0S', 'B01637RMYU', 'B000AP2X0A', 'B001B5J2UW', 'B0144NYGJY', 'B00A6TPHZS', 'B003VNCRNQ', 'B0095C08YM', 'B004Q0PT3I', 'B00005AC56', 'B00111DJQ4', 'B00M9GTHS4', 'B000X86ZAS', 'B004YHXXKO', 'B00002SANG', 'B000BZJ0LY', 'B000HCZ8EO', 'B004A7Y0UK', 'B008MR38ZM', 'B000HLT5HQ', 'B015724V9Q', 'B0026PIBK8', 'B015610ZB6', 'B0017M9ZGA', 'B01617VNBK', 'B000I3C36S', '3772397301', 'B00F8LK4SO', 'B002DHGM50', 'B00B1TGHXS', 'B00B766VZE', 'B00E6OPDUS', 'B00F8K9MZQ', 'B0009OH8IO', 'B00E6ONJJU', 'B000NPEYZ4', 'B000067G1X', 'B001ECGT8A', 'B005AAWZ3K']
# Keep only the metadata rows whose ASIN is in the sample
df = df.loc[df['asin'].isin(asin_reviews_sample), :]

In [24]:
# Drop duplicates
# Keep one row per ASIN (the first occurrence). Rebinding `df` instead of using
# inplace=True avoids mutating the frame object built by the previous cell.
df = df.drop_duplicates(subset=['asin'])

In [25]:
# Load data to db
# Connection settings are fetched from Key Vault via `client`, so no
# credentials are written in the notebook.
server = client.get_secret('db-server').value
database = client.get_secret('database').value
username = client.get_secret('db-username').value
password = client.get_secret('db-password').value
driver = '{ODBC Driver 17 for SQL Server}'
table = 'metadata_test'

# create the connection string
# The raw ODBC string is URL-encoded so it can be passed through the
# `odbc_connect` query parameter of the SQLAlchemy URL.
# `urllib.parse` has to be imported explicitly: a bare `import urllib` does not
# guarantee that the `parse` submodule is loaded.
params = urllib.parse.quote_plus(
    f'DRIVER={driver};SERVER={server};DATABASE={database};UID={username};PWD={password}')

engine = create_engine(f'mssql+pyodbc:///?odbc_connect={params}')

# NOTE(review): if_exists='append' adds rows to the existing table, so
# re-running this cell inserts the same rows again. Confirm that is intended
# before executing it more than once.
df.to_sql(table, con=engine, if_exists='append', index=False)

90