In [1]:
# Pinned on purpose: the classification cells below call `openai.ChatCompletion.create`,
# so the installed client must be a release that provides that interface.
!pip install openai==0.28



In [2]:
# Mount Google Drive at /content/drive; the labelled input files and the prediction
# CSVs written later all live under /content/drive/MyDrive/.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
import pandas as pd
import os
import json
from sklearn.model_selection import train_test_split
import openai
import time
from tqdm import tqdm

In [4]:
# Collect every labelled example as a {'text': ..., 'label': ...} record
data = []

# Directory containing the labelled JSON files
directory = "/content/drive/MyDrive/bank_run_detector_files/labelled/"

# os.listdir returns entries in arbitrary order. Sorting fixes the row order of
# `df`, so the seeded sample / shuffle / split applied to it in later cells
# selects the same rows on every run.
for filename in sorted(os.listdir(directory)):
    filepath = os.path.join(directory, filename)
    # Only regular files with a .json extension are read
    if not (os.path.isfile(filepath) and filename.endswith('.json')):
        continue

    # Explicit encoding: do not depend on the platform default when decoding tweets
    with open(filepath, 'r', encoding='utf-8') as file:
        file_data = json.load(file)

    # Each file holds a list of items; keep those that carry both keys and
    # store the 'sentiment' value under the name 'label'
    for item in file_data:
        if 'text' in item and 'sentiment' in item:
            data.append({'text': item['text'], 'label': item['sentiment']})

# Convert the list of dictionaries into a pandas DataFrame. Naming the columns
# keeps 'text' and 'label' present even when no record was loaded.
df = pd.DataFrame(data, columns=['text', 'label'])

In [5]:
def update_label_names(df):
    """Rename the raw labels in ``df['label']`` to their descriptive names.

    "Risky" becomes "Indicative of a Bank Run" and "Non-risky" becomes
    "Not Indicative of a Bank Run". Every other value -- including labels that
    have already been renamed -- is left untouched, so running the cell a
    second time is harmless. (``Series.map`` with a plain dict would turn every
    unmapped value into NaN and wipe the column on a re-run.)

    Args:
        df: DataFrame with a 'label' column. It is modified in place.

    Returns:
        The same DataFrame object, with the 'label' column updated.
    """
    # Mapping of old labels to new labels
    label_mapping = {
        "Risky": "Indicative of a Bank Run",
        "Non-risky": "Not Indicative of a Bank Run"
    }

    # Fall back to the existing value when a label has no entry in the mapping
    df['label'] = df['label'].map(lambda label: label_mapping.get(label, label))

    return df
# Relabel the loaded examples, then preview the first rows
df = update_label_names(df)
df.head()

Unnamed: 0,text,label
0,Too many candidates today to fit! Here's the ...,Not Indicative of a Bank Run
1,LATEST BANKING NEWS BNY Mellon Asset Servicing...,Not Indicative of a Bank Run
2,@SimonBTC right go to http://www.bnymellon.com...,Not Indicative of a Bank Run
3,BNY Mellon selected to provide corporate trust...,Not Indicative of a Bank Run
4,"New stock picks from @money magazine: $ABT, $W...",Not Indicative of a Bank Run


In [6]:
# Number of majority-class rows to keep after undersampling
desired_majority_samples = 180

# All majority-class rows, and a seeded random subset of them
majority_class_df = df.loc[df['label'] == "Not Indicative of a Bank Run"]
undersampled_majority_df = majority_class_df.sample(
    n=desired_majority_samples,
    random_state=42,
)

# Every minority-class row is kept; rows with any other label value are not carried over
minority_class_df = df.loc[df['label'] == "Indicative of a Bank Run"]

# Stack minority + sampled majority, shuffle the rows, and renumber the index from 0
df = (
    pd.concat([minority_class_df, undersampled_majority_df])
    .sample(frac=1, random_state=42)
    .reset_index(drop=True)
)


In [7]:
# Inputs are the texts, targets are their labels
X = df['text']
y = df['label']

In [8]:
# 70/30 split, stratified on the label, with a fixed seed
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=42,
)

In [9]:
# Renumber every split from 0, discarding the index values inherited from `df`
X_train, X_test, y_train, y_test = (
    split.reset_index(drop=True)
    for split in (X_train, X_test, y_train, y_test)
)

In [10]:
# Training split as a two-column frame ('text', 'label'); preview the first rows
df_train = pd.concat(
    [X_train.rename('text'), y_train.rename('label')],
    axis=1,
)
df_train.head()

Unnamed: 0,text,label
0,Fifth Third Bank: had four separate accounts w...,Indicative of a Bank Run
1,@Chigz10AFC capital one are frauds! Still list...,Indicative of a Bank Run
2,RT 4 chance 2 win a $100 Fifth Third Bank gift...,Not Indicative of a Bank Run
3,Taneka Sukovaty liked NASDAQ:FITB $13.99 Fifth...,Not Indicative of a Bank Run
4,On Sun in LOU Hens broadcaster Jim Weber calle...,Not Indicative of a Bank Run


In [11]:
# Test split as a two-column frame ('text', 'label'); preview the first rows
df_test = pd.concat(
    [X_test.rename('text'), y_test.rename('label')],
    axis=1,
)
df_test.head()

Unnamed: 0,text,label
0,$AADR - WCM/BNY Mellon Stock Analysis - free p...,Not Indicative of a Bank Run
1,Capital One Commercial with Jennifer Garner - ...,Not Indicative of a Bank Run
2,CFTC files complaint US Bank (US 5th largest b...,Indicative of a Bank Run
3,"Ethanol Safety Seminar Feb. 10th in Boston, MA...",Not Indicative of a Bank Run
4,#bearcatnation fuck Alabama @ Fifth Third Are...,Not Indicative of a Bank Run


In [12]:
# Plain Python list of the test texts, in row order, for the classification loops
texts = df_test['text'].tolist()

### Zero-shot

In [None]:
# Take the key from the OPENAI_API_KEY environment variable so that it is never
# typed into (and saved with) the notebook. Falls back to an empty string when
# the variable is not set.
openai.api_key = os.environ.get("OPENAI_API_KEY", "")

In [None]:
def _strip_code_fence(content):
    """Return ``content`` without a surrounding Markdown code fence.

    Handles replies of the form ```` ```json ... ``` ```` or ```` ``` ... ``` ````.
    Text that is not fenced is only whitespace-trimmed. (``str.strip`` with a
    character set is not suitable here: it also eats the letters j/s/o/n and
    back-ticks from the ends of replies that are not fenced JSON.)
    """
    text = content.strip()
    if text.startswith("```"):
        text = text[3:]
        # Optional language tag directly after the opening fence
        if text[:4].lower() == "json":
            text = text[4:]
        if text.endswith("```"):
            text = text[:-3]
    return text.strip()


def classify_bank_run(l, requests_per_batch=3, pause_seconds=70):
    """Classify each tweet in ``l`` with a zero-shot prompt sent to gpt-4-1106-preview.

    One chat-completion request is made per tweet. To stay under the
    requests-per-minute limit the loop pauses after every ``requests_per_batch``
    requests. The requests-per-day limit (200 at the time of writing) is NOT
    enforced here; keep ``len(l)`` below it.

    Args:
        l: Sequence of tweet texts (supports ``len`` and integer indexing).
        requests_per_batch: Number of requests sent between two pauses.
        pause_seconds: Length of each pause, in seconds.

    Returns:
        A list with one string per tweet, in input order: the model's reply
        with any Markdown code fence removed, or the literal 'Failed' when the
        request or the unpacking of its reply raised an exception.
    """
    # NOTE(review): the prompt text contains hyphenation artifacts ("Impor- tantly",
    # "Indica- tive") and is joined to the tweet sentence without a space. It is
    # kept byte-for-byte so results stay comparable with earlier runs -- confirm
    # before cleaning it up.
    instructions = "Assess the tweet for signs of banking liquidity crises in the near future, including asset quality concerns, bank runs influenced by social media, overleveraging, and systemic financial stresses. Also, identify key crisis characteristics like depositor confidence erosion, interbank lending freezes, central bank interventions, asset liquidations, or credit crunches. Impor- tantly, evaluate if the tweet carries a negative sentiment, as this can further indicate the seriousness of the financial instability being discussed. Label the tweet as ’Indicative of a Bank Run’ if there is even a hint of any of these elements; otherwise, label it as ’Not Indicative of a Bank Run’. Return your analysis in JSON format, with three attributes: ’Predicted’, containing the label (’Indicative of a Bank Run’ or ’Not Indicative of a Bank Run’), ’Probability Indicative’, representing the probability of the label ’Indicative of a Bank Run’ being assigned to the tweet as a decimal number between 0 and 1, and the third attribute ’Probability Not Indica- tive’, representing the probability of the label ’Not Indicative of a Bank Run’ being assigned to the tweet as a decimal number between 0 and 1. Output nothing else."

    responses = []

    for i in tqdm(range(len(l))):
        # Pause before every batch except the first one
        if i != 0 and i % requests_per_batch == 0:
            time.sleep(pause_seconds)

        prompt = instructions + f"Here is the text of the tweet: {l[i]}"

        try:
            response = openai.ChatCompletion.create(
                model="gpt-4-1106-preview",
                messages=[{"role": "user", "content": prompt}],
                max_tokens=150
            )
            # Unpack inside the try block: one malformed reply must not abort
            # the whole (slow, rate-limited) run and discard earlier answers.
            answer = _strip_code_fence(response.choices[0].message['content'])
        except Exception as e:
            print(f"Error at index {i}: {e}")
            answer = 'Failed'

        responses.append(answer)

    return responses

# Run the zero-shot classifier over the test tweets. This is slow: the loop
# sleeps between groups of requests to respect the API rate limit.
result = classify_bank_run(texts)

In [None]:
# Attach the raw model replies to the test rows and write them to Drive
zeroshot_output_path = "/content/drive/MyDrive/bank_run_detector_files/predicted/gpt_zeroshot_predicted.csv"

# to_csv fails if the target folder is missing; create it first so the results
# of a long API run are not lost at the final step
os.makedirs(os.path.dirname(zeroshot_output_path), exist_ok=True)

df_test['response'] = result
df_test.to_csv(zeroshot_output_path, index=False)

### Few-shot

In [None]:
# Rebuild the test frame from the split so it starts with only 'text' and 'label'
df_test = pd.concat(
    [X_test.rename('text'), y_test.rename('label')],
    axis=1,
)

In [None]:
def _strip_code_fence(content):
    """Return ``content`` without a surrounding Markdown code fence.

    Handles replies of the form ```` ```json ... ``` ```` or ```` ``` ... ``` ````.
    Text that is not fenced is only whitespace-trimmed. (``str.strip`` with a
    character set is not suitable here: it also eats the letters j/s/o/n and
    back-ticks from the ends of replies that are not fenced JSON.)
    """
    text = content.strip()
    if text.startswith("```"):
        text = text[3:]
        # Optional language tag directly after the opening fence
        if text[:4].lower() == "json":
            text = text[4:]
        if text.endswith("```"):
            text = text[:-3]
    return text.strip()


def classify_bank_run(l, requests_per_batch=3, pause_seconds=70):
    """Classify each tweet in ``l`` with a few-shot prompt sent to gpt-4-1106-preview.

    The prompt is the classification instruction followed by seven example
    tweets of the "Indicative of a Bank Run" class. One chat-completion request
    is made per tweet. To stay under the requests-per-minute limit the loop
    pauses after every ``requests_per_batch`` requests. The requests-per-day
    limit (200 at the time of writing) is NOT enforced here; keep ``len(l)``
    below it.

    Args:
        l: Sequence of tweet texts (supports ``len`` and integer indexing).
        requests_per_batch: Number of requests sent between two pauses.
        pause_seconds: Length of each pause, in seconds.

    Returns:
        A list with one string per tweet, in input order: the model's reply
        with any Markdown code fence removed, or the literal 'Failed' when the
        request or the unpacking of its reply raised an exception.
    """
    # NOTE(review): the prompt text contains hyphenation artifacts ("Impor- tantly",
    # "re- lease", "collat- eral", ...) and is joined to the tweet sentence without
    # a separator. It is kept byte-for-byte so results stay comparable with
    # earlier runs -- confirm before cleaning it up.
    # NOTE(review): example (f) looks like the "CFTC files complaint US Bank ..."
    # row shown in the df_test preview -- verify that no few-shot example is
    # drawn from the test split.
    instructions = """Assess the tweet for signs of banking liquidity crises in the near future, including asset quality concerns, bank runs influenced by social media, overleveraging, and systemic financial stresses. Also, identify key crisis characteristics like depositor confidence erosion, interbank lending freezes, central bank interventions, asset liquidations, or credit crunches. Impor- tantly, evaluate if the tweet carries a negative sentiment, as this can further indicate the seriousness of the financial instability being discussed. Label the tweet as ’Indicative of a Bank Run’ if there is even a hint of any of these elements; otherwise, label it as ’Not Indicative of a Bank Run’. Return your analysis in JSON format, with three attributes: ’Predicted’, containing the label (’Indicative of a Bank Run’ or ’Not Indicative of a Bank Run’), ’Probability Indicative’, representing the probability of the label ’Indicative of a Bank Run’ being assigned to the tweet as a decimal number between 0 and 1, and the third attribute ’Probability Not Indica- tive’, representing the probability of the label ’Not Indicative of a Bank Run’ being assigned to the tweet as a decimal number between 0 and 1. Output nothing else. Examples of tweets Indicative of a Bank Run:
(a) ”The chart for $BKS tells you everything you need to know...run the other way from them!”
(b) ”South Carolina sues BNY Mellon for $200M: Loftis said in the re- lease that the contract called for investments...”
(c) ”Capital One Agrees to $75 Million TCPA Settlement — by Drinker- Biddle.”
(d) ”capital one are frauds! Still list their address here in nottz: but they vacated building ages ago!”
(e) ”Capital One to pay $210 million to settle charges that they misled consumers into paying for extra products.”
(f) ”CFTC files complaint US Bank for using customer funds as collat- eral on loans to Wasendorf.”
(g) ”US Bank NA as Trustee For Structured Asset Investment Loan Trust: IS A BOGUS TRUST.”
Note this is not an exhaustive list and just gives a hint as to what kind of tweets we are looking for."""

    responses = []

    for i in tqdm(range(len(l))):
        # Pause before every batch except the first one
        if i != 0 and i % requests_per_batch == 0:
            time.sleep(pause_seconds)

        prompt = instructions + f"Here is the text of the tweet: {l[i]}"

        try:
            response = openai.ChatCompletion.create(
                model="gpt-4-1106-preview",
                messages=[{"role": "user", "content": prompt}],
                max_tokens=150
            )
            # Unpack inside the try block: one malformed reply must not abort
            # the whole (slow, rate-limited) run and discard earlier answers.
            answer = _strip_code_fence(response.choices[0].message['content'])
        except Exception as e:
            print(f"Error at index {i}: {e}")
            answer = 'Failed'

        responses.append(answer)

    return responses

# Run the few-shot classifier over the same test tweets. This is slow: the loop
# sleeps between groups of requests to respect the API rate limit.
result = classify_bank_run(texts)

In [None]:
# Attach the raw model replies to the test rows and write them to Drive
fewshot_output_path = "/content/drive/MyDrive/bank_run_detector_files/predicted/gpt_fewshot_predicted.csv"

# to_csv fails if the target folder is missing; create it first so the results
# of a long API run are not lost at the final step
os.makedirs(os.path.dirname(fewshot_output_path), exist_ok=True)

df_test['response'] = result
df_test.to_csv(fewshot_output_path, index=False)