## Count the available headlines for each DJIA 30 stock

In [9]:
import pandas as pd

# Load the processed analyst-ratings headlines; the counting below keys on the
# 'stock' column of this file.
df = pd.read_csv('./2009_2020_stocknews/analyst_ratings_processed.csv')

# 'DJIA-30' is read as a whitespace-separated list of ticker symbols.
with open('DJIA-30', 'r') as file:
    stocks = file.read().split()

# Tally every ticker in one pass over the column instead of re-filtering the
# whole frame once per stock.
headline_counts = df['stock'].value_counts()

# Tickers without any headline are absent from the tally, so default them to 0.
# int(...) keeps plain Python ints in the dict, as .shape[0] produced before.
stock_headline_count = {stock: int(headline_counts.get(stock, 0)) for stock in stocks}

for stock, count in stock_headline_count.items():
    print(f'{stock}: {count}')

total_count = sum(stock_headline_count.values())
print(f'Total amount of headlines for all DJIA stocks: {total_count}')


AMZN: 330
AXP: 1852
AMGN: 150
AAPL: 469
BA: 10
CAT: 2299
CSCO: 1003
CVX: 0
GS: 0
HD: 2617
HON: 0
IBM: 1083
INTC: 10
JNJ: 2927
KO: 2785
JPM: 10
MCD: 2208
MMM: 1486
MRK: 3334
MSFT: 0
NKE: 0
PG: 1303
TRV: 654
UNH: 230
CRM: 1476
VZ: 2937
V: 10
WMT: 10
DIS: 10
DOW: 1224
Total amount of headlines for all DJIA stocks: 30427


## Filter the source CSV(s) to the DJIA 30 stocks and append those rows to the cleaned dataset CSV

In [4]:
import pandas as pd
import os

# Source CSVs to filter down to DJIA 30 rows.
file_list = [
    './2009_2020_stocknews/raw_partner_headlines.csv',
    './2009_2020_stocknews/analyst_ratings_processed.csv'
]
# Columns to export from each source, keyed by base file name. The two sources
# name their headline text column differently ('headline' vs 'title'); both
# land in the same third position of the output.
column_mapping = {
    'raw_partner_headlines.csv': ['date', 'stock', 'headline'],
    'analyst_ratings_processed.csv': ['date', 'stock', 'title']
}

# Single definition of the output file and of the header it gets when created.
# The de-duplication step in this notebook reads the columns as
# 'Date' / 'Stock' / 'Headline', so a freshly created file must use those names
# rather than the lowercase source column names.
output_path = 'cleaned_stock_data_copy.csv'
output_header = ['Date', 'Stock', 'Headline']

# 'DJIA-30' is read as a whitespace-separated list of ticker symbols.
with open('DJIA-30', 'r') as file:
    stocks = file.read().split()

for file_path in file_list:
    df = pd.read_csv(file_path)

    # Keep only DJIA 30 rows, ordered by ticker and then by date string.
    df = df[df['stock'].isin(stocks)].sort_values(by=['stock', 'date'])
    # Keep only the part of the date string before the first space.
    df['date'] = df['date'].str.split(' ').str[0]

    file_name = os.path.basename(file_path)
    columns_to_save = column_mapping[file_name]

    # Append to the output; write the header only when creating the file so an
    # existing file does not get a second header row in the middle of the data.
    file_exists = os.path.isfile(output_path)
    df.to_csv(
        output_path,
        columns=columns_to_save,
        mode='a',
        index=False,
        header=False if file_exists else output_header,
    )

## Remove rows from the cleaned CSV that repeat the same date, stock, and headline

In [5]:
import pandas as pd

file_path = 'cleaned_stock_data_copy.csv'
# A row is a duplicate when all three of these columns match another row.
# Defined once so detection and removal cannot drift apart.
key_columns = ['Date', 'Stock', 'Headline']

try:
    # Malformed lines are still skipped, but 'warn' reports each one. The file
    # is overwritten below, so silently skipped rows would be lost unnoticed.
    df = pd.read_csv(file_path, on_bad_lines='warn')

    # keep=False marks every member of a duplicate group, so this frame (and
    # total_duplicates) includes the first occurrence of each group as well.
    duplicates = df[df.duplicated(key_columns, keep=False)].sort_values(by=['Stock', 'Headline'])

    total_duplicates = len(duplicates)

    if total_duplicates > 0:
        # Keep the first occurrence of each duplicate group, then re-sort.
        df = df.drop_duplicates(subset=key_columns, keep='first')

        df = df.sort_values(by=['Stock', 'Date'])

        # Overwrite the same file that was read.
        df.to_csv(file_path, index=False)
        print('Duplicates removed and file saved and sorted')
    else:
        # Nothing to remove: the file is left untouched (and not re-sorted).
        print("No duplicate headlines found within the same stock.")

except pd.errors.ParserError as e:
    # Parser failures other than skippable bad lines are reported, not raised.
    print(f"ParserError: {e}")


Duplicates removed and file saved
