In [75]:
from pathlib import Path
import pandas as pd
from tqdm.notebook import tqdm
from joblib import Parallel, delayed

# Filesystem layout for this analysis (absolute paths on the SageMaker host).
project_path = Path('/home/ec2-user/SageMaker/david/tdm-sentiment/')
data_sample = Path('/home/ec2-user/SageMaker/data/')
data_path = project_path / 'data/'

# Dataset whose result CSVs are processed by the cells below.
# Names used so far: 'USATodayDavid', 'ChicagoTribune', 'Newyork20042023'.
dataset_name = 'Newyork20042023'
folder_path = data_path / 'results/' / dataset_name  # TODO


def get_all_csv_files(folder_path):
    """Return the CSV files located directly inside ``folder_path``.

    Args:
        folder_path: Directory to search (``str`` or ``Path``). Only entries
            matching ``*.csv`` at the top level are considered; sub-folders
            are not searched.

    Returns:
        list[Path]: The matching paths in sorted order. The list is empty
        when nothing matches.
    """
    # Path.glob yields entries in arbitrary, filesystem-dependent order.
    # Sorting makes the result (and anything built from it in order, such as
    # a concatenation of the files) reproducible between runs.
    return sorted(Path(folder_path).glob('*.csv'))


def concat_csv_files(folder_path):
    """Read every CSV file in ``folder_path`` and stack them into one DataFrame.

    Args:
        folder_path: Directory passed on to ``get_all_csv_files``.

    Returns:
        pandas.DataFrame: The rows of all files, re-indexed from 0
        (``ignore_index=True``).

    Raises:
        ValueError: If no CSV file is found. ``pd.concat`` raises the same
            exception type for an empty list, but its message does not say
            which folder was empty.
    """
    csv_files = get_all_csv_files(folder_path)
    if not csv_files:
        raise ValueError(f"No CSV files found in {folder_path}")

    # Read each file with pandas' default CSV settings, then concatenate once.
    frames = [pd.read_csv(csv_file) for csv_file in csv_files]
    return pd.concat(frames, ignore_index=True)

     


In [76]:
  
# Load all result CSVs of the selected dataset, keep a single row per GOID,
# and map publisher name variants onto one label each.
combined_df = (
    concat_csv_files(folder_path)
    .drop_duplicates(subset=['GOID'])
    .assign(
        Publisher=lambda frame: frame['Publisher'].replace({
            'international herald tribune': 'new york times',
            'new york times the': 'new york times',
            'international new york times': 'new york times',
            'los angeles times (pre-1997 fulltext)': 'los angeles times',
            'washington post (pre-1997 fulltext) the': 'washington post the',
            'usa today (online)': 'usa today',
        })
    )
)

# Optional publisher filters, currently disabled:
#combined_df = combined_df[combined_df['Publisher'] != 'new york times español']
#combined_df = combined_df[combined_df['Publisher'] == 'chicago tribune'] #TODO

# Show (rows, columns) of the cleaned frame as the cell output.
combined_df.shape
            

(373855, 9)

In [77]:
# Count how many rows carry each distinct value of the 'Tags' column.
tag_counts = combined_df['Tags'].value_counts()

# Keep the tags that occur at least `min_tag_count` times.
min_tag_count = 20
tags_in_range = tag_counts[tag_counts >= min_tag_count].index

# Convert tags to a list
tags_in_range_list = tags_in_range.tolist()

# Specify the file path to save the tags
file_path = data_path/f'tags/common_tags_{dataset_name}.txt'
# open(..., 'w') does not create missing directories, so make sure 'tags/' exists.
file_path.parent.mkdir(parents=True, exist_ok=True)

# Save the tags to a text file, six comma-separated tags per line.
# The encoding is explicit so the file does not depend on the host locale.
with open(file_path, 'w', encoding='utf-8') as f:
    for i in range(0, len(tags_in_range_list), 6):
        line = ', '.join(tags_in_range_list[i:i+6])
        f.write(f"{line}\n")
print(f"Tags saved to {file_path}")


Tags saved to /home/ec2-user/SageMaker/david/tdm-sentiment/data/tags/common_tags_Newyork20042023.txt


In [81]:
# Two hand-curated lists of economy-related tags. They overlap once
# lowercased; the merge step below removes the duplicates.
economic_tags_1 = [
    "Acquisitions & Mergers", "Bank Acquisitions & Mergers", "Banking Industry", "Banks", "Bankruptcy",
    "Bankruptcy Reorganization", "Bailouts", "Budgets", "Budget Deficits", "Business Closings", "Central Banks", "Commercial Real Estate",
    "Corporate Governance", "Corporate Profits", "Credit Cards", "Currency", "Economic Conditions", "Economic Crisis",
    "Economic Forecasts", "Economic Growth", "Economic Impact", "Economic Indicators", "Economic Policy", "Economic Recovery",
    "Economic Statistics", "Economists", "Federal Budget", "Federal Reserve Monetary Policy", "Financial Performance",
    "Financial Services", "Foreign Investment", "Government Spending", "Inflation", "Interest Rates", "International Finance",
    "International Trade", "Investment Advisors", "Investment Banking", "Investment Bankers", "Investment Policy", "Investments", "Lending",
    "Loans", "Monetary Policy", "Mortgage Companies", "Mortgages", "Pension Funds", "Real Estate Sales",
    "Recessions", "Regulation of Financial Institutions", "Retail Sales", "Small Business", "Unemployment", "Unemployment Benefits",
]


economic_tags_2 = [
    "real estate sales", "recessions", "interest rates",
    "economic conditions", "workers", "international trade", "unemployment", "layoffs", "manufacturing", "retail stores",
    "prices", "gasoline prices", "small business", "inflation", "real estate", "federal budget",
    "housing", "strikes", "labor unions","wage & price controls", "labor force", "supply chains", "great recession",
    "industrial production", "budget surplus", "TARP funds", "market prices",
    "bailoutsfinanceeconomics", "personal income", "foreign investments in the us",
    "service industries", "real estate companies", "price cuts", "corporate debt", "predatory lending", "currencies", "economic trends",
    "economic recoveryeconomics", "offshore banking", "public finance", "cost of living", "borrowing", "currency revaluation", "subsidies", "currency devaluation",
    "treasury notes", "government bonds", "economic development corporations", "imports", "consumer goods", "new store openings",
    "budgets", "economic growth", "bank acquisitions & mergers", "pension funds", "wages & salaries",
    "corporate profits", "state budgets", "unemployment benefits", "budget deficits", "minimum wage", "housing prices",
    "international finance", "economic forecasts", "monetary policy", "government spending",
    "consumers", "budgeting", "labor contracts", "collective bargaining", "banks", "financial services",
    "labor negotiations", "employees", "workforce planning",  "corporate governance", "hostile takeovers",
    "mortgage companies", "shutdowns", "venture capital companies", "economic recovery", "supply chains", "foreclosure",
    "exports", "economic policy", "trade agreements", "economic crisis", "economic statistics", "poverty",
    "subprime lending", "investment banking"
]


# Lowercase every tag, merge both lists and drop duplicates. The result is
# sorted so that the list, and the file written from it, has the same order
# on every run; list(set(...)) of strings can change order between kernel
# starts because string hashing is randomized per process.
economic_tags = sorted({item.lower() for item in economic_tags_1 + economic_tags_2})

# Specify the file path to save the tags
file_path = data_path/f'tags/realistic_economy_tags_{dataset_name}.txt'
# open(..., 'w') does not create missing directories, so make sure 'tags/' exists.
file_path.parent.mkdir(parents=True, exist_ok=True)

# Save the tags to a text file, one tag per line.
# The encoding is explicit so the file does not depend on the host locale.
with open(file_path, 'w', encoding='utf-8') as f:
    for tag in economic_tags:
        f.write(f"{tag}\n")
print(f"Tags saved to {file_path}")




Tags saved to /home/ec2-user/SageMaker/david/tdm-sentiment/data/tags/realistic_economy_tags_Newyork20042023.txt


In [71]:
# Keep the rows whose 'Tags' value is one of the economic tags.
is_economic_tag = combined_df['Tags'].isin(economic_tags)
filter_tags_df = combined_df[is_economic_tag]

# Write one relative XML path per selected article, in the form
# <dataset_name>/<GOID>.xml, e.g. Newyork20042023/433585924.xml
goid_list = filter_tags_df['GOID'].tolist()
file_path = f'{data_path}/{dataset_name}_realistic_economy_articles.txt'
with open(file_path, 'w') as file:
    file.writelines(f"{dataset_name}/{goid}.xml\n" for goid in goid_list)

# Show where the list was written as the cell output.
file_path

'/home/ec2-user/SageMaker/david/tdm-sentiment/data/ChicagoTribune_realistic_economy_articles.txt'

In [52]:
# Sanity check: print every tag of the filtered frame that is not in
# economic_tags.
tags_list = filter_tags_df['Tags'].tolist()
for tag in filter(lambda candidate: candidate not in economic_tags, tags_list):
    print(tag)

In [65]:
# Preview the first rows (at most 20) of the economic-tag subset.
filter_tags_df.head(n=20)

Unnamed: 0,GOID,Publisher,Date,Section,Type,Tags,is_economic,text_blob_sentiment,bert_sentiment
14,433642751,new york times,2007-07-09,c,news,retail sales,1,0.067236,
19,1721884962,new york times,2015-10-14,b,commentary,corporate profits,1,0.07868,
32,1652618087,new york times,2015-02-10,b,news,economic statistics,1,0.00911,
65,2014878206,new york times,2018-03-19,b,news,investments,1,0.08575,
95,433408460,new york times,2006-09-11,c,news,economic growth,1,0.065804,0.002278
97,432825846,new york times,2004-08-22,1,news,budgeting,1,-0.000861,
107,1829313532,new york times,2016-10-16,bu,news,federal reserve monetary policy,1,0.066925,
109,884366855,new york times,2011-08-20,b,news,interest rates,1,0.002114,
120,433718383,new york times,2007-11-23,c,news,bank acquisitions & mergers,1,0.012817,-0.628475
124,1686178712,new york times,2015-06-06,b,news,bankruptcy reorganization,1,0.033725,
