# Data Preparation

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the raw articles export; every cleaning step below starts from this frame.
original_dataset = pd.read_csv(filepath_or_buffer="../data/original_dataset.csv")

In [4]:
# Quick look at the raw titles as an array.
original_dataset.loc[:, "title"].values

array(['Neon Worms — colorful illustrations with the Blend Tool in Illustrator',
       'Branding for Fluxus: where coffee meets art',
       'Branding case submission - BELYSSE', ...,
       '自然現象のデジタルツイン化を支援する新たなコンピューターシステム', 'ITとOTの融合による新たなセキュリティリスクと課題',
       "I asked Microsoft's new Bing with ChatGPT about Microsoft and oh, it had opinions"],
      dtype=object)

In [95]:
# The CSV's unnamed first column is used as the article id from here on.
original_dataset = original_dataset.rename({"Unnamed: 0": "id"}, axis="columns")

In [96]:
# Shift every id up by one.
# Note: this cell is not idempotent; re-running it shifts the ids again.
original_dataset["id"] = original_dataset["id"] + 1

In [97]:
# Drop the two columns that are not needed downstream.
unused_columns = ["urlToImage", "source.id"]
original_dataset = original_dataset.drop(columns=unused_columns)

In [98]:
# Discard every row that has a missing value in any remaining column.
original_dataset.dropna(axis=0, how="any", inplace=True)

In [99]:
# Replace the dotted column name with a plain snake_case one.
original_dataset.rename({"source.name": "source_name"}, axis="columns", inplace=True)

In [100]:
# Cast every text column to str so the string operations in the following cells
# (split / lower / replace) can be applied to each value.
# One loop replaces seven copy-pasted per-element lambdas; the columns and the
# resulting values are unchanged.
for text_column in (
    "author",
    "title",
    "description",
    "url",
    "publishedAt",
    "content",
    "source_name",
):
    original_dataset[text_column] = original_dataset[text_column].astype(str)

In [101]:
# Keep only the part of the timestamp before the first "T".
# NOTE(review): presumably ISO-8601 timestamps, so this is the date; confirm.
published = original_dataset["publishedAt"]
original_dataset["publishedAt"] = published.str.split("T").str[0]

In [105]:
# Keep only the text before the first full-width parenthesis "（" in the author field.
# The former trailing .dropna() was removed: the split result is always a string, and
# assigning back to the column re-aligns on the index, so it never dropped anything.
original_dataset["author"] = original_dataset["author"].apply(lambda x: x.split("（")[0])

In [112]:
# Persist the cleaned frame (without the index) before it is reduced to id + tags.
original_dataset.to_csv(path_or_buf="../data/processed_data.csv", index=False)

In [115]:
# These columns are not part of the tags built below, so remove them.
columns_not_in_tags = ["description", "url", "content", "source_name"]
original_dataset.drop(columns=columns_not_in_tags, inplace=True)

In [121]:
# Build one space-separated text field per article: "<author> <title> <publishedAt>".
original_dataset["tags"] = original_dataset["author"].str.cat(
    [original_dataset["title"], original_dataset["publishedAt"]],
    sep=" ",
)

In [124]:
# The three source columns now live inside "tags"; drop the originals.
original_dataset.drop(["author", "title", "publishedAt"], axis="columns", inplace=True)

In [127]:
# Lower-case the tags.
original_dataset["tags"] = original_dataset["tags"].str.lower()

In [131]:
# Strip the em dash together with the space that follows it ("— ").
# regex=False performs a literal substring replacement.
original_dataset["tags"] = original_dataset["tags"].str.replace("— ", "", regex=False)

In [137]:
# Remove the "- " separator and the ":" sign from the tags.
# NOTE(review): this cell originally removed only ":" while its comment said "-".
# The saved head() output shows "submission - belysse" as "submission belysse", so the
# hyphen removal appears to have been run from an earlier version of this cell. Both
# steps are kept here so that Restart & Run All reproduces the saved tags.
# TODO confirm that "- " (hyphen + space) is the intended pattern.
original_dataset["tags"] = (
    original_dataset["tags"]
    .str.replace("- ", "", regex=False)
    .str.replace(":", "", regex=False)
)

Three attributes are used to perform the recommendation:
1. author
2. title
3. publishedAt

The following columns are not used:
1. description
2. url
3. content
4. source_name
5. id 

In [143]:
# Preview the first five rows of the final id + tags frame.
original_dataset.head(n=5)

Unnamed: 0,id,tags
0,1,abduzeedo neon worms colorful illustrations wi...
1,2,abduzeedo branding for fluxus where coffee mee...
2,3,abduzeedo branding case submission belysse 202...
3,4,abduzeedo monterra sipping on sustainable and ...
4,5,abduzeedo digitalized futuristic illustrations...


In [146]:
# Write the id + tags frame (without the index); the training section reads this file.
original_dataset.to_csv(path_or_buf="../data/Training.csv", index=False)

# Training using TFIDF

In [7]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import json
from nltk import PorterStemmer
import joblib

In [8]:
# Load the id + tags table written by the data-preparation section.
data = pd.read_csv(filepath_or_buffer="../data/Training.csv")

In [9]:
# Preview the first five rows of the loaded training table.
data.head(n=5)

Unnamed: 0,id,tags
0,1,abduzeedo neon worms colorful illustrations wi...
1,2,abduzeedo branding for fluxus where coffee mee...
2,3,abduzeedo branding case submission belysse 202...
3,4,abduzeedo monterra sipping on sustainable and ...
4,5,abduzeedo digitalized futuristic illustrations...


In [11]:
# Turn each article's tags into a TF-IDF vector, ignoring English stop words.
# NOTE(review): the dataset also contains Japanese titles, which the English
# stop-word list does not cover; confirm this is acceptable.
vectorizer = TfidfVectorizer(stop_words="english")
tfidf_matrix = vectorizer.fit_transform(raw_documents=data["tags"])


In [13]:
# Pairwise cosine similarity between every pair of articles (Y=None compares X with itself).
similarity = cosine_similarity(X=tfidf_matrix, Y=None)

In [14]:
# Similarity of the first article (row 0) to every article in the corpus.
query_idx = 0
scores = similarity[query_idx]

# Find the top 4 most similar documents.
# Rank all rows by descending score, then drop the query row explicitly instead of
# slicing off position 0: when another article ties with the query at the top score,
# the query is not guaranteed to sort first, and [1:5] could return the query itself.
ranked_indices = np.argsort(scores)[::-1]
top_indices = ranked_indices[ranked_indices != query_idx][:4]

# Print the recommended documents
for i in top_indices:
    print(f"Recommended document: {data['id'].iloc[i]}")

Recommended document: 5
Recommended document: 24
Recommended document: 43
Recommended document: 42


In [15]:
# Persist the precomputed similarity matrix so it can be loaded later without recomputing.
joblib.dump(value=similarity, filename="../model/newsapi_articles_cosinesimilarity.pkl")

['../model/newsapi_articles_cosinesimilarity.pkl']