In [1]:
# imports
import os
import sys
import dvc.api
import pandas as pd
import dataframe_image as dfi
from dotenv import load_dotenv
import cohere
from cohere.classify import Example
import warnings
warnings.filterwarnings('ignore')

In [2]:
# make the project's helper scripts importable from this notebook
sys.path.extend(['.', '..'])
sys.path.insert(1, '../scripts/')
import defaults as defs
import dataCleaner as dc
import dataVisualizer as dv

# load your environment (python-dotenv)
load_dotenv()
# the co:here api key is read from the environment -- use your own key here
cohere_api_key = os.environ.get('cohere_api_key')

# helper objects, both constructed with this notebook's title string
cleaner = dc.dataCleaner('news scoring using co:here API notebook')
visualizer = dv.dataVisualizer('news scoring using co:here API notebook')

logger <Logger dataCleaner (DEBUG)> created at path: ../logs/cleaner_root.log
Data cleaner in action
logger <Logger dataVisualizer (DEBUG)> created at path: ../logs/visualizer_root.log
Data visualizer in action


In [3]:
# show up to 30 columns when displaying data frames
pd.options.display.max_columns = 30

# version of the data to fetch
# v1 : local-store
version = 'v1'

# set up the data url for the requested version
news_url = dvc.api.get_url(path=defs.news_local_path, repo=defs.repo, rev=version)

# print news path
print(f'news data path: {news_url}')

news data path: /home/f0x-tr0t/Documents/dvc-store//50/1fbc56d932bcb51d74876281ec8f71


In [4]:
# reading csv files
# columns that should be parsed as dates
DateCols = ['timestamp']
# tokens that are treated as missing values while parsing
# (the duplicated 'undefined' entry was dropped; parsing is unaffected)
missing_values = ["n/a", "na", "undefined", '?', 'NA']

news_data = pd.read_csv(news_url, na_values=missing_values, parse_dates=DateCols, low_memory=False)

# News scoring using co:here API

## Using the classification feature

In [5]:
# setting up the co:here client with the api key read from the environment
# use your own api key here
co = cohere.Client(cohere_api_key)

### Setting up model parameters

In [6]:
# list the columns available in the news data
news_data.columns

Index(['Domain', 'Title', 'Description', 'Body', 'Link', 'timestamp',
       'Analyst_Average_Score', 'Analyst_Rank', 'Reference_Final_Score'],
      dtype='object')

* As we have seen in the demo, it is best to group the scores into classes before sending requests to the API.

In [7]:
# how often each analyst average score occurs
news_data['Analyst_Average_Score'].value_counts()

0.00    7
1.33    1
1.66    1
0.33    1
Name: Analyst_Average_Score, dtype: int64

In [8]:
# how often each analyst rank occurs
news_data['Analyst_Rank'].value_counts()

4    7
2    1
1    1
3    1
Name: Analyst_Rank, dtype: int64

* We can classify the average score into 5 categories (upper bounds are inclusive):
    * 1- lovely       ---    at most 0.33
    * 2- good         ---    above 0.33, up to 0.66
    * 3- neutral      ---    above 0.66, up to 1.22
    * 4- risky        ---    above 1.22, up to 1.55
    * 5- detrimental  ---    greater than 1.55

In [9]:
def classifyBasedOnScore(df: pd.Series) -> "str | None":
    """
    Classify a single news row based on its analyst average score.

    Parameters
    =--------=
    df: pandas series (or any mapping)
        one row of the news data; it must provide 'Analyst_Average_Score'.
        The parameter name is kept for backward compatibility: when used with
        DataFrame.apply(..., axis=1) each call receives a row, not a data frame.

    Returns
    =--------=
    str or None
        'lovely'      when score <= 0.33
        'good'        when 0.33 < score <= 0.66
        'neutral'     when 0.66 < score <= 1.22
        'risky'       when 1.22 < score <= 1.55
        'detrimental' when score > 1.55
        None          when the score is missing (NaN)
    """
    score = df['Analyst_Average_Score']

    # a missing score cannot be bucketed; return None explicitly instead of
    # silently falling off the end of the function
    if pd.isna(score):
        return None

    # inclusive upper bounds, checked in ascending order
    buckets = ((0.33, 'lovely'), (0.66, 'good'), (1.22, 'neutral'), (1.55, 'risky'))
    for upper_bound, label in buckets:
        if score <= upper_bound:
            return label
    return 'detrimental'

In [10]:
# label every row with its score class; apply(axis=1) hands each row to the classifier
news_data['class'] = news_data.apply(classifyBasedOnScore, axis=1)

In [11]:
# compare each analyst average score with the class assigned to it
news_data[['Analyst_Average_Score', 'class']]

Unnamed: 0,Analyst_Average_Score,class
0,0.0,lovely
1,0.0,lovely
2,0.0,lovely
3,0.0,lovely
4,0.0,lovely
5,1.33,risky
6,0.0,lovely
7,1.66,detrimental
8,0.33,lovely
9,0.0,lovely


* Now we can use this class feature for the co:here model

In [69]:
# setting up classify parameters

# model type to use
model_type='large'

# one labelled Example per row: the news body is the text, the score class is the label.
# Iterating over the two columns directly avoids the chained `copy.loc[i][...]` lookups,
# which only worked while the index was exactly 0..n-1.
# NOTE: `copy` is created by the `copy = news_data.copy()` cell further down in this
# notebook, so that cell has to be run before this one.
examples = [Example(body, label) for body, label in zip(copy['Body'], copy['class'])]

In [72]:
# TODO: change these to the news body
# the news sentences (body) to get classification for
# currently two full article bodies: a construction-sector report and a SAPS crime report
classification_inputs=["Construction activity grew steadily by 4% in the second quarter of the first three months of 2021 and recovered quickly and significantly in the same quarter of 2020 due to the low base effect of Covid. Famous economist Dr. Roerofubota said yesterday. In the announcement of the Afrimat Construction Index (ACI), and on behalf of Afrimat, he said ACI reached 110.3 points in the second half of 2021 and reached a 55% rebound compared to the same quarter last year. .. The construction industry continues to improve, he said in a telephone interview. Quarterly improvements were driven by a significant increase in employment in the construction sector and sales of building materials. Since the second quarter of 2020, the construction industry has created approximately 156,000 jobs. Bota said other promising improvements in the building blocks are the value of the building plans passed and the value of the buildings completed by the larger municipalities of the country. “Unfortunately, the post-pandemic construction recovery remains incomplete, with the second quarter 6.8% behind Covid, the same quarter of 2019.” But Bota said he was confident that it would continue. Damage to buildings from July riots and looting, and potential additional spending on buildings, repairs and security in KwaZulu-Natal and Houten could boost construction activity in the third and perhaps fourth quarters He said it was highly sexual. .. Inflation has begun to fall in recent months, well below the South African Reserve Bank’s upper inflation target. This means that interest rates may remain low until 2022. “A 30% reduction in the cost of mortgages at prime rates has already helped us to return to higher home prices and increase the value of new mortgages,” he said. In addition, two business confidences recorded an immediate recovery from the decline following the July unrest. 
According to Bota, several companies that have a strong foothold in the construction sector’s value chain have recently released impressive financial results. Examples include JSE-listed Cashbuild and Murray & Roberts, which show that purchase orders have reached a new record of R60.7 billion. “But now all eyes will focus on the government’s reconstruction and reconstruction plans. It’s far behind the starting block.” He hopes that further relaxation of blockade regulations and tax jackpots against the backdrop of record mining sector interests will contribute to the faster implementation of this plan. Andries van Heerden, CEO of Afrimat, is slowly but feeling positive economic momentum within the group to accelerate the long-awaited infrastructure project ready to run the business. He said he needed momentum and commitment from the government. The construction sector is expected to be boosted by riots and looting repairs [Source link](//www.iol.co.za/business-report/companies/construction-sector-projected-to-be-boosted-by-rioting-and-looting-repairs-0d6b4e8f-c456-4f70-8b93-7c8fc591d1c1) The construction sector is expected to be boosted by riots and looting repairs", 

'south African Police Service Office of the Provincial Commissioner Eastern Cape EASTERN CAPE – A 42-year-old male suspect was arrested yesterday, Wednesday 8 September 2021 at about 10:00 for the murder of an 80-year-old female in the Epesikeni location Ngqwaru A/A in Cofimvaba. It is alleged that during the morning of Wednesday, neighbors found the door at the deceased’s homestead open. On further investigation it’s where the lifeless body was discovered, covered up, laying on the bed. They also found a known suspect hiding in the room, and after questioning him he reported that he found the lady on her bed sleeping and the suspect then ran away. After the suspect set his house on fire he was rescued by Cofimvaba Visible Police members who arrested him on a charge of murder. According to the suspect the motive for the murder was revenge. The suspect will appear before the Cofimvaba Magistrates Court tomorrow, 10 September 2021 on charges related to Murder. Join Your Neighbourhood Watch Enquiries: Captain Lariane Jonker 082 301 8552 Twitter: https://twitter.com/SAPoliceService Report Every Crime Stamp Out Fraud and Corruption Be the Eyes and Ears for the SAPS in your Street #aim4change Port Elizabethan - [Suspects arrested for allegedly committing robberies against motorists by placing spikes on the N1 and R101 roads appear in court – SAPS Crime Report: 2021-09-09 13:07:58] - 10 September 2021 - [Male arrested for the murder of an elderly female in Cofimvaba – SAPS Crime Report: 2021-09-09 13:22:58] - 10 September 2021 - [Three suspects appeared in court for business robbery – SAPS Crime Report: 2021-09-09 14:05:02] - 9 September 2021 - [Firearm recovered, three suspects in court – SAPS Crime Report: 2021-09-09 14:20:02] - 9 September 2021 - [Two more suspects behind bars in connection with a foiled robbery – SAPS Crime Report: 2021-09-09 19:38:59] - 9 September 2021']

In [73]:
# request a classification for every input, using the labelled examples
# NOTE: with the full article bodies as inputs the API rejected this request for
# exceeding the maximum prompt length (see the error output below) --
# shorten the inputs or examples before re-running
response = co.classify(model=model_type, inputs=classification_inputs, examples=examples)

CohereError: invalid request: the length of the prompt generated by the input "Construction activity grew steadily by 4% in the second quarter of the first three months of 2021 and recovered quickly and significantly in the same quarter of 2020 due to the low base effect of Covid. Famous economist Dr. Roerofubota said yesterday. In the announcement of the Afrimat Construction Index (ACI), and on behalf of Afrimat, he said ACI reached 110.3 points in the second half of 2021 and reached a 55% rebound compared to the same quarter last year. .. The construction industry continues to improve, he said in a telephone interview. Quarterly improvements were driven by a significant increase in employment in the construction sector and sales of building materials. Since the second quarter of 2020, the construction industry has created approximately 156,000 jobs. Bota said other promising improvements in the building blocks are the value of the building plans passed and the value of the buildings completed by the larger municipalities of the country. “Unfortunately, the post-pandemic construction recovery remains incomplete, with the second quarter 6.8% behind Covid, the same quarter of 2019.” But Bota said he was confident that it would continue. Damage to buildings from July riots and looting, and potential additional spending on buildings, repairs and security in KwaZulu-Natal and Houten could boost construction activity in the third and perhaps fourth quarters He said it was highly sexual. .. Inflation has begun to fall in recent months, well below the South African Reserve Bank’s upper inflation target. This means that interest rates may remain low until 2022. “A 30% reduction in the cost of mortgages at prime rates has already helped us to return to higher home prices and increase the value of new mortgages,” he said. In addition, two business confidences recorded an immediate recovery from the decline following the July unrest. 
According to Bota, several companies that have a strong foothold in the construction sector’s value chain have recently released impressive financial results. Examples include JSE-listed Cashbuild and Murray & Roberts, which show that purchase orders have reached a new record of R60.7 billion. “But now all eyes will focus on the government’s reconstruction and reconstruction plans. It’s far behind the starting block.” He hopes that further relaxation of blockade regulations and tax jackpots against the backdrop of record mining sector interests will contribute to the faster implementation of this plan. Andries van Heerden, CEO of Afrimat, is slowly but feeling positive economic momentum within the group to accelerate the long-awaited infrastructure project ready to run the business. He said he needed momentum and commitment from the government. The construction sector is expected to be boosted by riots and looting repairs [Source link](//www.iol.co.za/business-report/companies/construction-sector-projected-to-be-boosted-by-rioting-and-looting-repairs-0d6b4e8f-c456-4f70-8b93-7c8fc591d1c1) The construction sector is expected to be boosted by riots and looting repairs" and the label "lovely" has exceeded the maximum amount of tokens allowed in a prompt.
Consider removing or shorterning examples or inputs to resolve this issue.

In [14]:
# finally print the classifications returned for the news inputs
# TODO: converting the predicted label back to a numerical range factor is not done here

print('The confidence levels of the labels are:\n{}'.format(response.classifications))

The confidence levels of the labels are:
[cohere.Classification {
	input: Am I still able to return my order?
	prediction: Start return or exchange
	confidence: [cohere.Confidence {
	label: Shipping and handling policy
	confidence: 0.32005534
}, cohere.Confidence {
	label: Start return or exchange
	confidence: 0.5335526
}, cohere.Confidence {
	label: Track orders
	confidence: 0.14639212
}]
}, cohere.Classification {
	input: When can I expect my package?
	prediction: Track orders
	confidence: [cohere.Confidence {
	label: Shipping and handling policy
	confidence: 0.27741268
}, cohere.Confidence {
	label: Start return or exchange
	confidence: 0.30839407
}, cohere.Confidence {
	label: Track orders
	confidence: 0.41419324
}]
}]


* Finally receive the prediction of the classification

In [66]:
# number of example rows available per class label
copy['class'].value_counts()

lovely         32
risky           4
detrimental     4
Name: class, dtype: int64

In [63]:
# work on a copy so that news_data itself is left untouched
copy = news_data.copy()
# (rows, columns) of the copy
copy.shape

(10, 10)

In [65]:
# Replicate the scored rows four times in a single step.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0, and the former
# self-append (`copy = copy.append(copy, ...)`) doubled the frame on every re-run of
# this cell. Building from news_data keeps the cell idempotent and yields the
# (40, 10) frame recorded in the output below.
copy = pd.concat([news_data] * 4, ignore_index=True)
copy.shape

(40, 10)

## Using the text extraction feature