# News Dataset Engineering & Vectorization

In [1]:
## import library
import warnings
# Suppress every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by pandas /
# scikit-learn further down — confirm that is intended.
warnings.filterwarnings('ignore')

## Data Cleaning

In [16]:
import pandas as pd

# Location of the scraped news-link test set, relative to the notebook.
news_test_path = "Data/news_link_test_2020_05_to_2023_05.csv"

# Read the raw articles and display the frame as the cell output.
news_test = pd.read_csv(news_test_path)
news_test

Unnamed: 0,date,url,text
0,2023-05-01,https://theenterpriseleader.com/2023/05/01/pie...,ad behind market interested weekthe single gre...
1,2023-05-01,https://markets.financialcontent.com/stocks/ar...,recent quote view full list watchlist create w...
2,2023-05-01,https://finance.yahoo.com/news/lithiumbank-app...,globenewswirelithiumbank appoints strategy exe...
3,2023-05-01,https://www.benzinga.com/pressreleases/23/05/g...,espa olindiaitalianofran aismy accountmy accou...
4,2023-05-01,https://www.juniorminingnetwork.com/junior-min...,foremost lithium completes successful initial ...
...,...,...,...
3760,2020-05-07,https://www.reuters.com/article/us-albemarle-t...,ernest scheyder min readreuters albemarle corp...
3761,2020-05-07,https://www.kitco.com/news/2020-05-07/UPDATE-1...,add detail greenbushes sale stock movement ern...
3762,2020-05-07,https://www.kitco.com/news/2020-05-07/UPDATE-2...,make kitco homepage login sign refresh page ho...
3763,2020-05-07,https://www.reuters.com/article/us-albemarle-t...,ernest scheyder min readreuters albemarle corp...


In [4]:
news_test.info()
# The `text` column has fewer non-null entries than the frame has rows,
# i.e. some articles carry no text and must be filtered out below.

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3765 entries, 0 to 3764
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   date    3765 non-null   object
 1   url     3765 non-null   object
 2   text    3078 non-null   object
dtypes: object(3)
memory usage: 88.4+ KB


In [25]:
# Check whether the dataset contains duplicated texts:
# df_count_text = pd.DataFrame(news_test.groupby("text").count().sort_values(by="url"))
# df_count_text[df_count_text.url > 1]

In [5]:
# Keep one row per distinct article text: the one with the earliest date.
#
# Rows with missing text are dropped up front, so the null filter no longer
# relies on index-aligning a boolean mask built from a different (larger)
# frame. The raw `news_test` frame is not rebound, which keeps this cell
# safe to re-run and keeps the frame displayed above accurate.
news_test_filtered = (
    news_test
    .dropna(subset=['text'])
    # `date` is stored as a string; the values shown above look like
    # YYYY-MM-DD, which sorts chronologically as plain text.
    .sort_values(['text', 'date'])
    # After the sort, the first occurrence of each text is its earliest date.
    .drop_duplicates(subset='text', keep='first')
)

In [6]:
# Column dtypes and non-null counts after de-duplication and null filtering.
news_test_filtered.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 2826 entries, 3132 to 668
Data columns (total 3 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   date    2826 non-null   object
 1   url     2826 non-null   object
 2   text    2826 non-null   object
dtypes: object(3)
memory usage: 88.3+ KB


In [7]:
# Summary of the filtered frame (count / unique / top / freq per column).
news_test_filtered.describe()

Unnamed: 0,date,url,text
count,2826,2826,2826
unique,793,2826,2826
top,2023-04-21,https://abcnews.go.com/US/nevada-sees-lithium-...,abc newsvideoliveshowselectionsinterest succes...
freq,48,1,1


In [8]:
# Preview the first rows; at this point the frame is still ordered by text
# (then date) because of the sort used for de-duplication.
news_test_filtered.head()

Unnamed: 0,date,url,text
3132,2021-04-23,https://abcnews.go.com/US/nevada-sees-lithium-...,abc newsvideoliveshowselectionsinterest succes...
485,2023-03-09,https://kvia.com/news/2023/03/09/why-lithium-i...,abcfirst alert heat lateday storm threat
721,2023-01-30,https://www.prnewswire.com/news-releases/abtc-...,abtc prepares commission lithiumion battery re...
2337,2021-12-27,https://newsheater.com/2021/12/27/your-finance...,acadia healthcare company inc achc share decli...
3002,2021-06-20,https://timesofindia.indiatimes.com/blogs/Masq...,accept update privacy cookie policy use cooky ...


In [9]:
# Order the articles from newest to oldest and renumber the rows from 0.
news_test_filtered = (
    news_test_filtered
    .sort_values(by='date', ascending=False)
    .reset_index(drop=True)
)
news_test_filtered.head()

Unnamed: 0,date,url,text
0,2023-05-01,https://theenterpriseleader.com/2023/05/01/pie...,ad behind market interested weekthe single gre...
1,2023-05-01,https://www.juniorminingnetwork.com/junior-min...,foremost lithium completes successful initial ...
2,2023-05-01,https://www.naturalnews.com/2023-05-01-sodium-...,home brighteon prep mike interview audio book ...
3,2023-05-01,https://markets.financialcontent.com/stocks/ar...,recent quote view full list watchlist create w...
4,2023-05-01,https://www.theglobeandmail.com/investing/mark...,stock leadersenbt cnqt atht hutt cvet trpt tdt...


## Vectorization 

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np
import scanpy as sc 

In [12]:
# Corpus to vectorize: the article text column, one document per row.
X = news_test_filtered['text']

In [13]:
# Unigram TF-IDF over the article texts, with English stop words removed.
# max_df / min_df are given as floats, i.e. as fractions of the documents.
# NOTE(review): the 3181 in min_df does not match the 2826 rows of this
# frame — presumably the size of another corpus; confirm it is intended.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.8,
    min_df=50/3181,
    ngram_range=(1,1),
)
tfidf_vectorizer.fit(np.array(X))

# Sparse document-term matrix: one row per article, one column per term.
X_vect = tfidf_vectorizer.transform(X)

In [14]:
# Terms learned by the fitted vectorizer, one per column of X_vect.
# `get_feature_names` was deprecated in scikit-learn 1.0 and removed in 1.2,
# so prefer `get_feature_names_out` and fall back only on older installs.
# The result is converted to a list so `vocabulary` keeps its previous type.
if hasattr(tfidf_vectorizer, 'get_feature_names_out'):
    vocabulary = list(tfidf_vectorizer.get_feature_names_out())
else:
    vocabulary = list(tfidf_vectorizer.get_feature_names())

# Variable annotations: indexed by term, no extra columns.
df_vocabulary = pd.DataFrame(index=vocabulary)

# Observation annotations: date / url / text of each article, re-indexed
# from 0 so the rows line up positionally with the rows of X_vect.
df_obs = news_test_filtered[['date', 'url', 'text']].reset_index(drop = True)

# Bundle the TF-IDF matrix with its row and column annotations.
adata_test = sc.AnnData(X_vect, obs = df_obs, var = df_vocabulary)

In [15]:
# Persist the AnnData object to 'adata_test.h5ad' with gzip compression.
adata_test.write('adata_test.h5ad', compression="gzip")