In [59]:
# imports
import json
import os

import joblib
import numpy as np
import pandas as pd
import requests
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# loading previous data
# This is the same path that the merged dataset is written back to later in the
# notebook, so each run builds on the output of the previous run.
org_data = pd.read_csv("data/processed_data.csv")

In [34]:
# Discard the ids from the previous run; fresh ids are assigned after the merge.
# "Unnamed: 0" is the row index that pandas writes when a CSV is saved with the
# default index=True; drop it too so it does not pile up across runs.
# errors="ignore" + reassignment (instead of inplace) keeps this cell re-runnable.
org_data = org_data.drop(columns=["id", "Unnamed: 0"], errors="ignore")

In [44]:
# getting new data from newsapi and processing it

# news api url
URL = "https://newsapi.org/v2/everything?"

# The API key is read from the environment so the secret never lives in the
# notebook or its version history. Any key that was ever committed here should
# be revoked and replaced.
NEWSAPI_KEY = os.environ.get("NEWSAPI_KEY")
if not NEWSAPI_KEY:
    raise RuntimeError("Set the NEWSAPI_KEY environment variable before running this cell.")

# reading links file (one news domain per row in the `links` column)
links = pd.read_csv("data/links.csv")

# Collect one frame per domain and concatenate once at the end; growing a
# DataFrame with pd.concat inside the loop copies all previous rows each time.
article_frames = []

# getting data from each link
for link_item in links['links'].values:

    # parameters for request
    PARAMS = {
        'domains': f"{link_item}",
        'apikey': NEWSAPI_KEY,
    }

    # sending get request; the timeout stops one unresponsive call from
    # hanging the whole notebook
    r = requests.get(url=URL, params=PARAMS, timeout=30)

    # extracting data in json format
    data = r.json()

    # Responses without `totalResults` (e.g. API errors) are skipped, as are
    # responses with no articles. A domain with exactly one article is kept.
    if 'totalResults' not in data:
        continue
    articles = data.get("articles") or []
    if articles:
        # flatten nested fields such as source -> source.id / source.name
        article_frames.append(pd.json_normalize(articles))

# creating a dataframe (empty when nothing was fetched)
dataset = (
    pd.concat(article_frames, ignore_index=True)
    if article_frames
    else pd.DataFrame()
)

In [47]:
# Sanity check: (rows, columns) of the current frame.
# NOTE(review): going by the execution counts, the recorded output was produced
# after the merge cell (In [46]) had run, so it likely describes the merged
# frame rather than the raw fetch — re-run top to bottom to confirm.
dataset.shape

(5576, 7)

In [28]:
# processing

# Clean the freshly fetched articles:
#   1. remove exact duplicate rows,
#   2. drop columns that are not used later,
#   3. drop rows with any missing value,
#   4. rename the flattened `source.name` column,
#   5. cast everything to str so the string operations below are safe.
# The result is reassigned (no inplace) and the drop tolerates already-removed
# columns, so the cell can be re-run without raising.
dataset = (
    dataset
    .drop_duplicates()
    .drop(columns=["urlToImage", "source.id"], errors="ignore")
    .dropna()
    .rename(columns={"source.name": "source_name"})
    .astype(str)
)

# Keep only the date part of the ISO timestamp ("2023-05-01T10:00:00Z" -> "2023-05-01").
dataset["publishedAt"] = dataset["publishedAt"].str.split("T").str[0]

# Keep the author text before the first opening parenthesis.
# NOTE(review): the separator is the full-width "（" (U+FF08), not ASCII "(" —
# confirm that is intended.
dataset["author"] = dataset["author"].str.split("（").str[0]

In [46]:
# Merge the new articles with the previously saved ones.
# The API can return articles that were already stored by an earlier run, so
# de-duplicate on `url`; new rows come first in the concat, so keep="first"
# prefers the freshly fetched copy. The index is reset so that it stays a
# contiguous 0..n-1 range for the id assignment that follows.
dataset = (
    pd.concat([dataset, org_data], ignore_index=True)
    .drop_duplicates(subset=["url"], keep="first")
    .reset_index(drop=True)
)

In [48]:
# Give every article a positional id (0..n-1) stored as the first column.
# An existing `id` column is dropped first so that re-running this cell does
# not stack a second `id` column on top of the first.
dataset = dataset.drop(columns=["id"], errors="ignore").reset_index(drop=True)
dataset.insert(0, "id", dataset.index)

In [51]:
# index=False: the row position is already stored in the `id` column. Writing
# the index as well would bring it back as a stray "Unnamed: 0" column the
# next time this file is loaded at the top of the notebook.
dataset.to_csv("data/processed_data.csv", index=False)

In [52]:
# Build one free-text "tags" string per article: "<author> <title> <publishedAt>".
text_columns = ["author", "title", "publishedAt"]

# Rows merged from the saved CSV can carry NaN where a field was written as an
# empty string; blank them so the concatenation below and the vectorizer
# further down always receive strings.
text = dataset[text_columns].fillna("").astype(str)

# Lower-case the text and strip the "— " and ":" punctuation (literal matches).
tags = (
    (text["author"] + " " + text["title"] + " " + text["publishedAt"])
    .str.lower()
    .str.replace("— ", "", regex=False)
    .str.replace(":", "", regex=False)
)

# Keep only the remaining columns plus the new `tags` column.
dataset = (
    dataset
    .drop(columns=["description", "url", "content", "source_name"] + text_columns)
    .assign(tags=tags)
)

In [55]:
# Persist the reduced frame that the TF-IDF step below is fitted on;
# index=False keeps the row index out of the file.
dataset.to_csv("data/Training.csv", index=False)

In [57]:
# Preview the first rows of the training frame.
dataset.head()

Unnamed: 0,id,tags
0,0,abduzeedo neon worms colorful illustrations wi...
1,1,abduzeedo branding for fluxus where coffee mee...
2,2,abduzeedo branding case submission - belysse 2...
3,3,abduzeedo monterra sipping on sustainable and ...
4,4,abduzeedo digitalized - futuristic illustratio...


In [61]:
# Fit a TF-IDF model on the tag strings (English stop words removed) and
# transform the same strings into the document-term matrix in one step.
tag_corpus = dataset["tags"]
vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = vectorizer.fit_transform(tag_corpus)

In [62]:
# Pairwise cosine similarity between all rows of the TF-IDF matrix.
# NOTE(review): called with a single argument this presumably returns a dense
# n x n array, so memory and file size grow quadratically with the number of
# articles — confirm that is acceptable as the dataset keeps growing.
similarity = cosine_similarity(tfidf_matrix)

# save cosine similarity in a model file
# (joblib.dump returns the list of file names it wrote, shown as the cell output)
joblib.dump(similarity, 'model/newsapi_articles_cosinesimilarity.pkl')

['model/newsapi_articles_cosinesimilarity.pkl']