### __Import Packages__

In [4]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import os

In [6]:
from nlp.lda_model.preprocessing_functions_lda import *

### __Load Data__

In [10]:
os.chdir("../")

In [11]:
# OPEN CSV FILES
# This dataset includes news and opinion articles about various companies
df = pd.read_csv('data/raw/us_equities_news_dataset.csv')

# DROP DUPLICATES
df_filtered = df.drop_duplicates(subset=['content'])

# DROP ROWS WHERE 'CONTENT' columns has NANS
df_filtered = df_filtered.dropna(subset=['content'])

# FILTER DATAFRAME ON CONTENT OF EITHER 'NVDA' OR 'NVIDIA'
# Check if each row in 'content' column contains the words "nvda" or "nvidia
df_filtered_on_content = df_filtered[df_filtered['content'].fillna('').str.contains(
    r'\bnvda\b|\bnvidia\b', case=False, regex=True)]
print(f"The amount of articles is (both news and opinions): {len(df_filtered_on_content)}")

The amount of articles is (both news and opinions): 3436


In [17]:
# Save as parquet file
df_filtered_on_content.to_parquet('data/processed/filtered_df.parquet')

### __Preprocessing Steps:__

In [None]:
# PREPROCESSING CONTENT COLUMN

# Convert to lowercase
df_filtered_on_content.loc[:, 'preprocessed_content'] = convert_to_lowercase(df_filtered_on_content['content'])

# Remove URLs
df_filtered_on_content.loc[:, 'preprocessed_content'] = remove_all_urls(df_filtered_on_content['preprocessed_content'])

# Remove all words that are not characters
df_filtered_on_content.loc[:, 'preprocessed_content'] = remove_non_words_characters(df_filtered_on_content['preprocessed_content'])

# Remove digits
df_filtered_on_content.loc[:, 'preprocessed_content'] = remove_digits(df_filtered_on_content['preprocessed_content'])

# Tokenize words
df_filtered_on_content.loc[:, 'preprocessed_content'] = tokenize_words(df_filtered_on_content['preprocessed_content'])

# Remove stopwords
df_filtered_on_content.loc[:, 'preprocessed_content'] = remove_stopwords(df_filtered_on_content['preprocessed_content'])

# Lemmatize words 
df_filtered_on_content.loc[:, 'preprocessed_content'] = lemmatize(df_filtered_on_content['preprocessed_content'])

df_filtered_on_content.head(5)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_filtered_on_content.loc[:, 'preprocessed_content'] = convert_to_lowercase(df_filtered_on_content['content'])


Unnamed: 0,id,ticker,title,category,content,release_date,provider,url,article_id,preprocessed_content
24,221539,NIO,A Central Bank War Just Started And Its Good F...,opinion,ECB Effects\nThe move in the euro was huge fa...,2019-03-07,Michael Kramer,https://www.investing.com/analysis/a-central-b...,200395687,"[ecb, effect, move, euro, huge, falling, pip, ..."
32,221547,NIO,6 Stocks To Watch Nivida Could Be Falling,opinion,6 Stocks To Watch March 6 Trading Session\nSt...,2019-03-06,Michael Kramer,https://www.investing.com/analysis/6-stocks-to...,200394931,"[stock, watch, march, trading, session, stock,..."
57,221572,NIO,Stocks Dow Drops Nearly 400 Points as Apple ...,news,Investing com A rout in Apple and Facebook ...,2018-11-19,Investing.com,https://www.investing.com/news/stock-market-ne...,1694042,"[rout, apple, facebook, nasdaq, fb, monday, sw..."
78,221593,UBER,The Zacks Analyst Blog Highlights Advanced Mi...,opinion,For Immediate ReleaseChicago IL January 13 ...,2020-01-12,Zacks Investment Research,https://www.investing.com/analysis/the-zacks-a...,200498277,"[immediate, releasechicago, il, january, annou..."
82,221597,UBER,The Best Of CES 2020 Revised,opinion,With 4 500 companies bringing their innovation...,2020-01-16,Zacks Investment Research,https://www.investing.com/analysis/the-best-of...,200499164,"[company, bringing, innovation, ce, jan, get, ..."


### __Save preprocessed data to parquet__

In [19]:
df_filtered_on_content.to_parquet('data/processed/preprocessed_df.parquet')