PCLab#5 - Group 2 - Emanuele Sala, Luca Soleri, Fabio Stefana

<div style="border: 4px solid #007bff; padding: 10px; background-color: #e9f5ff; border-radius: 5px;">
    <h1 style="color: #007bff;">Importing libraries and Dataset</h1>
</div>


In [1]:
import pandas as pd
import os

In [2]:
# Load every Stata (.dta) file found in the data folder into a single DataFrame.
directory = "data/sigwatch_data"

# os.listdir returns entries in arbitrary order; sorting keeps the row order of
# the concatenated frame (and any later positional slicing) reproducible.
dta_files = sorted(file for file in os.listdir(directory) if file.endswith(".dta"))
if not dta_files:
    # Fail with an actionable message instead of pandas' generic
    # "No objects to concatenate" error.
    raise FileNotFoundError(f"No .dta files found in {directory!r}")

df_list = [pd.read_stata(os.path.join(directory, file)) for file in dta_files]

# ignore_index=True gives the combined frame a fresh 0..n-1 index.
df = pd.concat(df_list, ignore_index=True)

<div style="border: 4px solid #007bff; padding: 10px; background-color: #e9f5ff; border-radius: 5px;">
    <h1 style="color: #007bff;">Preliminary data exploration</h1>
</div>

In [3]:
# Keep only the rows whose primary industry sector is "Finance"
# (this is how the notebook selects the banks).
is_finance = df["corp_industry_sector1"].eq("Finance")
df = df.loc[is_finance]

In [4]:
# Keep only the companies whose country (country_corp) is in the US, the UK or the EU.
# NOTE(review): 'Montenegro' and 'Bosnia and Herzegovina' are in the list but are
# not EU member states - confirm they are meant to be kept.
# NOTE(review): both 'US' and 'USA' are listed, presumably because the dataset
# spells the country both ways - verify against the raw data.
countries = [
    "Austria", "US", "Denmark", "UK", "Germany", "Luxembourg", "France",
    "Italy", "Netherlands", "Belgium", "Sweden", "Spain", "Ireland",
    "Portugal", "Poland", "Finland", "USA", "Croatia", "Bulgaria",
    "Montenegro", "Bosnia and Herzegovina",
]

in_selected_country = df["country_corp"].isin(countries)
df = df.loc[in_selected_country]

In [5]:
# A single campaign can span several rows, so count the distinct uid_archive
# values rather than the rows.
# dropna=False mirrors Series.unique(), which reports a missing value as one entry.
n_of_campaigns = df["uid_archive"].nunique(dropna=False)
print(f"There are {n_of_campaigns} unique campaigns for US UK and EU banks")

There are 2370 unique campaigns for US UK and EU banks


In [6]:
# Count the distinct NGOs named in any of the columns ngo_name1 .. ngo_name5.
ngo_name_columns = [f"ngo_name{i}" for i in range(1, 6)]

# Pool the values of the five columns into one flat list.
list_of_ngo_columns = []
for ngo_column_number in ngo_name_columns:
    list_of_ngo_columns += list(df[ngo_column_number])

# Drop missing entries (NaN/None or empty strings) explicitly instead of
# subtracting 1 from the count: the subtraction is only right when the set holds
# exactly one null placeholder, and is off when there is none or when several
# distinct NaN objects survive set() (NaN never compares equal to NaN).
unique_ngos = [
    ngo
    for ngo in set(list_of_ngo_columns)
    if pd.notna(ngo) and ngo != ""
]

print(f"There are {len(unique_ngos)} unique NGO organizations involved in this dataset")

There are 932 unique NGO organizations involved in this dataset


In [None]:
# Number of targeted companies

In [7]:
# Names of the columns of interest, in the order they should appear.
columns_we_want = [
    "uid_archive",
    "date",
    "company",
    "country_corp",            # country of the company
    "corp_industry_sector1",   # industry of the company
    "company_parent",
    "company_parent_country",
    "sentiment",
    # issue_name1 .. issue_name3
    *(f"issue_name{n}" for n in range(1, 4)),
]

# Reddit Analysis

In [80]:
import os
import praw
import pandas as pd
from datetime import datetime, timedelta, timezone

# Reddit API credentials.
# Secrets must not be stored in the notebook: they are read from the environment
# (set REDDIT_CLIENT_ID and REDDIT_CLIENT_SECRET before starting the kernel).
# The key pair that used to be hard-coded here has been exposed and should be revoked.
client_id = os.environ["REDDIT_CLIENT_ID"]
client_secret = os.environ["REDDIT_CLIENT_SECRET"]
user_agent = os.environ.get("REDDIT_USER_AGENT", "finance_scrape (by u/Kashiko_02)")

# Set up the Reddit client
reddit = praw.Reddit(
    client_id=client_id,
    client_secret=client_secret,
    user_agent=user_agent
)

# Tunable settings for the scrape.
campaign_limit = 50                 # only the first N campaign rows are searched
search_limit = 10                   # maximum number of submissions requested per search
date_window = timedelta(days=365)   # keep posts within +/- this span of the campaign date

# Work on a copy and keep only the columns needed to build the search queries.
df_1 = df.copy()
ngo_columns = ['ngo_name1', 'ngo_name2', 'ngo_name3', 'ngo_name4', 'ngo_name5']
columns_to_keep = ['date', 'company', 'ngo_name1', 'ngo_name2', 'ngo_name3', 'ngo_name4', 'ngo_name5']
campaigns = df_1.reindex(columns=columns_to_keep).iloc[:campaign_limit]

# Data collection: one dict per Reddit submission that matches a campaign.
reddit_data = []
subreddit = reddit.subreddit("finance")

# The row variable has its own name so it no longer overwrites the frame being iterated.
for index, campaign in campaigns.iterrows():
    # Dates are parsed as 'YYYY-MM-DD' strings.
    campaign_date = datetime.strptime(campaign["date"], "%Y-%m-%d")

    # Drop missing/empty NGO names and join the rest with "; ".
    # pd.notna guards against NaN, which is truthy and would break str.join.
    ngos = "; ".join(
        [ngo for ngo in (campaign[col] for col in ngo_columns) if pd.notna(ngo) and ngo]
    )
    search_query = f"{campaign['company']} {ngos}".strip()

    # Perform the search
    submissions = subreddit.search(search_query, limit=search_limit)

    # Accepted date range around the campaign date (inclusive on both ends).
    window_start = campaign_date - date_window
    window_end = campaign_date + date_window

    # Filter and collect data
    for submission in submissions:
        # Convert the post's UTC epoch timestamp to a naive datetime truncated to
        # midnight, so it compares by calendar day with the naive campaign date.
        # (datetime.utcfromtimestamp is deprecated since Python 3.12.)
        post_date = datetime.fromtimestamp(submission.created_utc, tz=timezone.utc)
        post_date = post_date.replace(
            tzinfo=None, hour=0, minute=0, second=0, microsecond=0
        )

        # Check if post falls within the date range
        if window_start <= post_date <= window_end:
            reddit_data.append({
                "company": campaign["company"],
                "NGO": ngos,
                "date": campaign["date"],
                "title": submission.title,
                "text": submission.selftext,
                "created_utc": submission.created_utc,
                "url": submission.url,
            })


# Save to a DataFrame
df_reddit = pd.DataFrame(reddit_data)

# Display DataFrame (or save it for analysis)
print(df_reddit.head())

# The next steps would be to clean the text for sentiment analysis and apply a sentiment model.

df_reddit.to_csv("reddit_data.csv", index=False)

                        company                                NGO  \
0               JP Morgan Chase                    Mupo Foundation   
1     Goldman Sachs Group, Inc.  Institute for Policy Studies U.S.   
2     Goldman Sachs Group, Inc.  Institute for Policy Studies U.S.   
3  European Investment Bank EIB              CEE Bankwatch Network   
4      Lloyds Banking Group plc                           UK Uncut   

         date                                              title  \
0  2011-12-13  Where did the $2 billion that JP Morgan Chase ...   
1  2011-12-13  I AMA analyst in investment management at a "b...   
2  2011-12-13  Reddit-Exclusive Offer - 25% off Financial Tra...   
3  2011-12-09     The next recession could be a doozy, thoughts?   
4  2011-12-06  UK finance geniuses, what is your opinion of C...   

                                                text   created_utc  \
0  I kinda get this gist of this:\n\n[*In Februar...  1.339052e+09   
1  Sorry finance reddit on my 

In [79]:
# Write the collected Reddit posts to CSV without the index column.
# NOTE(review): the scraping cell already writes this same file - this cell looks redundant.
reddit_csv_path = "reddit_data.csv"
df_reddit.to_csv(reddit_csv_path, index=False)