In [1]:
import pandas as pd
def read_text_file(file_path,printflag=0):
    try:
        with open(file_path, 'r') as file:
            # Read the entire file contents into a string
            file_contents = file.read()
        if printflag:
            print("File contents:")
            print(file_contents)
        return file_contents
    except FileNotFoundError:
        return f"File not found: {file_path}"
    except Exception as e:
        return f"An error occurred: {str(e)}"

  


In [2]:
# stock data
import yfinance as yf
import os

def download_stock_data(company_name, start_date, end_date, data_dir):
    # Create a directory to store the data if it doesn't exist
    if not os.path.exists(data_dir):
        os.mkdir(data_dir)
    
    # Define the data file path
    data_file_path = f"{data_dir}/{company_name}_stock_data.csv"
    
    if os.path.isfile(data_file_path):
        print(f"Data for {company_name} already downloaded. Loading from {data_file_path}")
        # stock_data = yf.download(company_name, start=start_date, end=end_date)
        stock_data = pd.read_csv('../../data/stock_data/AAPL_stock_data.csv')
        return stock_data
    else:
        print(f"Downloading data for {company_name} from Yahoo Finance...")
        stock_data = yf.download(company_name, start=start_date, end=end_date)
        stock_data.to_csv(data_file_path)
        print(f"Data downloaded and saved to {data_file_path}")
        return stock_data

# Define the parameters
company_name = "AAPL"
start_date = "2022-10-31"
end_date = "2023-10-31"
data_directory = "../../data/stock_data"

# Call the function to download or load the data
stock_data = download_stock_data(company_name, start_date, end_date, data_directory)

# Now you have the data in the `aapl_data` DataFrame
print(stock_data)



Data for AAPL already downloaded. Loading from ../../data/stock_data/AAPL_stock_data.csv
           Date        Open        High         Low       Close   Adj Close  \
0    2022-10-31  153.160004  154.240005  151.919998  153.339996  152.435699   
1    2022-11-01  155.080002  155.449997  149.130005  150.649994  149.761551   
2    2022-11-02  148.949997  152.169998  145.000000  145.029999  144.174683   
3    2022-11-03  142.059998  142.800003  138.750000  138.880005  138.060974   
4    2022-11-04  142.089996  142.669998  134.380005  138.380005  137.792114   
..          ...         ...         ...         ...         ...         ...   
246  2023-10-24  173.050003  173.669998  171.449997  173.440002  173.440002   
247  2023-10-25  171.880005  173.059998  170.649994  171.100006  171.100006   
248  2023-10-26  170.369995  171.380005  165.669998  166.889999  166.889999   
249  2023-10-27  166.910004  168.960007  166.830002  168.220001  168.220001   
250  2023-10-30  169.020004  171.169998  1

In [3]:
# exploring the stock data
stock_data
type(stock_data) #pandas dataframe
stock_data.shape #(251,6)
# stock_data.head()
stock_data.describe()

stock_data.columns #['Open', 'High', 'Low', 'Close', 'Adj Close', 'Volume']
stock_data.index #['2022-10-31', ..., '2023-10-30']

# Accessing a specific row by label (using .loc)
# stock_data.loc['2023-10-30']

# Accessing a specific row by integer index (using .iloc)
stock_data.iloc[2]  # Access the third row (0-based index)

# --------------------------
# Accessing a Specific Column:

# Using square brackets
stock_data['Volume']

# Using dot notation
stock_data.Open
# ----------------------




0      153.160004
1      155.080002
2      148.949997
3      142.059998
4      142.089996
          ...    
246    173.050003
247    171.880005
248    170.369995
249    166.910004
250    169.020004
Name: Open, Length: 251, dtype: float64

In [4]:
# news data
# Usage example:
from newsapi import NewsApiClient
api_path = '../../data/newsapi_key.txt'
api_key= read_text_file(api_path,1).strip()
newsapi = NewsApiClient(api_key=api_key)


File contents:
0f2340d896c54adfbf54f91b9875a6d9



In [5]:

# print(api_key)
start_date = "2023-10-06"
end_date = "2023-10-30"

news_articles = newsapi.get_everything(
    q=company_name,
    from_param=start_date,
    to=end_date,
    language="en",
    sort_by="publishedAt",
    page_size=100
)

# Access the list of articles
articles = news_articles['articles']
news_data=pd.DataFrame(articles)

In [6]:
articles

[{'source': {'id': None, 'name': 'Biztoc.com'},
  'author': 'benzinga.com',
  'title': 'Some Chinese Abandoning Apple, Stock Buying On No Response From Iran, All Eyes On Bank Of Japan',
  'description': 'To gain an edge, this is what you need to know today. Please click here for a chart of Apple Inc AAPL. • The Morning Capsule is about the big picture, not an individual stock. The chart of AAPL stock is being used to illustrate the point. • This morning inves…',
  'url': 'https://biztoc.com/x/d9a3355a2a89ff80',
  'urlToImage': 'https://c.biztoc.com/p/d9a3355a2a89ff80/s.webp',
  'publishedAt': '2023-10-30T21:52:09Z',
  'content': 'To gain an edge, this is what you need to know today.Please click here for a chart of Apple Inc AAPL.The Morning Capsule is about the big picture, not an individual stock. The chart of AAPL stock is … [+273 chars]'},
 {'source': {'id': None, 'name': 'MarketWatch'},
  'author': 'MarketWatch Automation',
  'title': 'Company Close Updates: Snap Inc. stock outperf

In [7]:
# last_date
last_timestamp=news_data.iloc[-1].publishedAt
# last_timestamp.split('T').str[0]
last_date=last_timestamp.split('T')[0]
while last_date!=start_date:
    print("last_date is : ",last_date)
    end_date=last_date
    news_articles = newsapi.get_everything(
        q=company_name,
        from_param=start_date,
        to=end_date,
        language="en",
        sort_by="publishedAt",
        page_size=100
    )

    # Access the list of articles
    articles = news_articles['articles']
    news_data= news_data.append(pd.DataFrame(articles), ignore_index=True)
    last_timestamp=news_data.iloc[-1].publishedAt
    # last_timestamp.split('T').str[0]
    last_date=last_timestamp.split('T')[0]



last_date is :  2023-10-26


  news_data= news_data.append(pd.DataFrame(articles), ignore_index=True)


last_date is :  2023-10-23


  news_data= news_data.append(pd.DataFrame(articles), ignore_index=True)


last_date is :  2023-10-17


  news_data= news_data.append(pd.DataFrame(articles), ignore_index=True)


last_date is :  2023-10-11


  news_data= news_data.append(pd.DataFrame(articles), ignore_index=True)


In [95]:
# print(len(news_articles['articles']))
# print(articles)

10
[{'source': {'id': None, 'name': 'Biztoc.com'}, 'author': 'benzinga.com', 'title': 'Some Chinese Abandoning Apple, Stock Buying On No Response From Iran, All Eyes On Bank Of Japan', 'description': 'To gain an edge, this is what you need to know today. Please click here for a chart of Apple Inc AAPL. • The Morning Capsule is about the big picture, not an individual stock. The chart of AAPL stock is being used to illustrate the point. • This morning inves…', 'url': 'https://biztoc.com/x/d9a3355a2a89ff80', 'urlToImage': 'https://c.biztoc.com/p/d9a3355a2a89ff80/s.webp', 'publishedAt': '2023-10-30T21:52:09Z', 'content': 'To gain an edge, this is what you need to know today.Please click here for a chart of Apple Inc AAPL.The Morning Capsule is about the big picture, not an individual stock. The chart of AAPL stock is … [+273 chars]'}, {'source': {'id': None, 'name': 'MarketWatch'}, 'author': 'MarketWatch Automation', 'title': 'Company Close Updates: Snap Inc. stock outperforms competitors

In [10]:
type(news_data) #pandas dataframe
news_data.shape #(487,8)
news_data.head()
# news_data.describe()

# news_data.columns #['source', 'author', 'title', 'description', 'url', 'urlToImage', 'publishedAt', 'content']
# text1=news_data.iloc[0]

Unnamed: 0,source,author,title,description,url,urlToImage,publishedAt,content
0,"{'id': None, 'name': 'Biztoc.com'}",benzinga.com,"Some Chinese Abandoning Apple, Stock Buying On...","To gain an edge, this is what you need to know...",https://biztoc.com/x/d9a3355a2a89ff80,https://c.biztoc.com/p/d9a3355a2a89ff80/s.webp,2023-10-30T21:52:09Z,"To gain an edge, this is what you need to know..."
1,"{'id': None, 'name': 'MarketWatch'}",MarketWatch Automation,Company Close Updates: Snap Inc. stock outperf...,Shares of Snap Inc. rose 4.30% to $9.70 Monday...,https://www.marketwatch.com/data-news/snap-inc...,https://images.mktw.net/im-220105/social,2023-10-30T21:35:31Z,"Shares of Snap Inc. \r\n SNAP,\r\n +4.30%\r\nr..."
2,"{'id': None, 'name': 'MarketWatch'}",Associated Press,"Associated Press: Meta rolls out paid, ad-free...",Facebook and Instagram users in Europe are get...,https://www.marketwatch.com/story/meta-rolls-o...,https://images.mktw.net/im-245995/social,2023-10-30T21:35:31Z,LONDON Facebook and Instagram users in Europe ...
3,"{'id': None, 'name': 'Macdailynews.com'}",MacDailyNews,MacDailyNews presents live coverage of Apple’s...,Apple CEO Tim Cook and a team of Apple execs w...,https://macdailynews.com/2023/10/30/macdailyne...,https://149359564.v2.pressablecdn.com/wp-conte...,2023-10-30T21:05:27Z,Apple CEO Tim Cook and a team of Apple executi...
4,"{'id': None, 'name': 'AppleInsider'}",news@appleinsider.com (Malcolm Owen),What to expect from Apple's Q4 2023 earnings r...,Apple's fourth fiscal quarter results will be ...,https://appleinsider.com/articles/23/10/30/wha...,https://photos5.appleinsider.com/gallery/56908...,2023-10-30T20:46:15Z,Apple CEO Tim Cook\r\nApple's fourth fiscal qu...


In [18]:
# news_data.publishedAt 
news_data.head()
# type(news_data)

Unnamed: 0,source,author,title,description,url,urlToImage,publishedAt,content
0,"{'id': None, 'name': 'Biztoc.com'}",benzinga.com,"Some Chinese Abandoning Apple, Stock Buying On...","To gain an edge, this is what you need to know...",https://biztoc.com/x/d9a3355a2a89ff80,https://c.biztoc.com/p/d9a3355a2a89ff80/s.webp,2023-10-30T21:52:09Z,"To gain an edge, this is what you need to know..."
1,"{'id': None, 'name': 'MarketWatch'}",MarketWatch Automation,Company Close Updates: Snap Inc. stock outperf...,Shares of Snap Inc. rose 4.30% to $9.70 Monday...,https://www.marketwatch.com/data-news/snap-inc...,https://images.mktw.net/im-220105/social,2023-10-30T21:35:31Z,"Shares of Snap Inc. \r\n SNAP,\r\n +4.30%\r\nr..."
2,"{'id': None, 'name': 'MarketWatch'}",Associated Press,"Associated Press: Meta rolls out paid, ad-free...",Facebook and Instagram users in Europe are get...,https://www.marketwatch.com/story/meta-rolls-o...,https://images.mktw.net/im-245995/social,2023-10-30T21:35:31Z,LONDON Facebook and Instagram users in Europe ...
3,"{'id': None, 'name': 'Macdailynews.com'}",MacDailyNews,MacDailyNews presents live coverage of Apple’s...,Apple CEO Tim Cook and a team of Apple execs w...,https://macdailynews.com/2023/10/30/macdailyne...,https://149359564.v2.pressablecdn.com/wp-conte...,2023-10-30T21:05:27Z,Apple CEO Tim Cook and a team of Apple executi...
4,"{'id': None, 'name': 'AppleInsider'}",news@appleinsider.com (Malcolm Owen),What to expect from Apple's Q4 2023 earnings r...,Apple's fourth fiscal quarter results will be ...,https://appleinsider.com/articles/23/10/30/wha...,https://photos5.appleinsider.com/gallery/56908...,2023-10-30T20:46:15Z,Apple CEO Tim Cook\r\nApple's fourth fiscal qu...


In [19]:
# Remove duplicates based on specific columns
news_data = news_data.drop_duplicates('content')

# Reset the index to ensure it's consecutive
news_data.reset_index(drop=True, inplace=True)

In [20]:
news_data.shape

(448, 8)

In [81]:
news_data.to_csv('../../data/news_data1.csv', index=False)  # Use your desired file path
