In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import os
import json
import plotly.express as px
from sklearn.model_selection import train_test_split # for splitting data into train and test samples
from sklearn.svm import SVC # for Support Vector Classification baseline model
import logging
import numpy as np
from numpy import random
import gensim
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
import re
from bs4 import BeautifulSoup
%matplotlib inline
import nltk
nltk.download('stopwords')
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression



In [None]:
# Initialize a list to hold the data
data = []

# Specify the directory containing the labelled files
directory = "/content/drive/MyDrive/bank_run_detector_files/labelled/"

# Iterate over each file in the directory
for filename in os.listdir(directory):
    filepath = os.path.join(directory, filename)
    # Check if the file is a JSON file
    if os.path.isfile(filepath) and filename.endswith('.json'):
        # Open and load the contents of the JSON file
        with open(filepath, 'r') as file:
            file_data = json.load(file)
            # For each item in the list, extract the 'text' and 'sentiment' and add to the data list
            for item in file_data:
                if 'text' in item and 'sentiment' in item:  # Ensure the keys exist
                    data.append({'text': item['text'], 'label': item['sentiment']})

# Convert the list of dictionaries into a pandas DataFrame
labelled_df = pd.DataFrame(data)

In [None]:
# Initialize a list to hold the data
data = []

# Specify the directory containing the labelled files
directory = "/content/drive/MyDrive/bank_run_detector_files/unlabelled/"

# Iterate over each file in the directory
for filename in os.listdir(directory):
    filepath = os.path.join(directory, filename)
    # Check if the file is a JSON file
    if os.path.isfile(filepath) and filename.endswith('.json'):
        # Open and load the contents of the JSON file
        with open(filepath, 'r') as file:
            file_data = json.load(file)
            # For each item in the list, extract the 'text' and 'sentiment' and add to the data list
            for item in file_data:
                if 'text' in item:  # Ensure the keys exist
                    data.append({'text': item['text'], 'label': 'Unlabelled'})
            # # Extend the data list with the contents of this file
            # data.extend(file_data)

# Convert the list of dictionaries into a pandas DataFrame
unlabelled_df = pd.DataFrame(data)

In [None]:
def update_label_names(df):
    # Define a mapping of old labels to new labels
    label_mapping = {
        "Risky": "Indicative of a Bank Run",
        "Non-risky": "Not Indicative of a Bank Run"
    }

    # Use the map function to update the 'sentiment' column based on the label_mapping
    df['label'] = df['label'].map(label_mapping)

    return df

In [None]:
labelled_df = update_label_names(labelled_df)
labelled_df.head()

Unnamed: 0,text,label
0,Too many candidates today to fit! Here's the ...,Not Indicative of a Bank Run
1,LATEST BANKING NEWS BNY Mellon Asset Servicing...,Not Indicative of a Bank Run
2,@SimonBTC right go to http://www.bnymellon.com...,Not Indicative of a Bank Run
3,BNY Mellon selected to provide corporate trust...,Not Indicative of a Bank Run
4,"New stock picks from @money magazine: $ABT, $W...",Not Indicative of a Bank Run


In [None]:
# Desired number of majority samples after undersampling
desired_majority_samples = 180

# Filter the majority class
majority_class_df = labelled_df[labelled_df['label'] == "Not Indicative of a Bank Run"]

# Sample from the majority class to get the desired number of samples
undersampled_majority_df = majority_class_df.sample(n=desired_majority_samples, random_state=42)

# Combine the undersampled majority class with all instances of the minority class
labelled_df = pd.concat([labelled_df[labelled_df['label'] == "Indicative of a Bank Run"], undersampled_majority_df])

# Shuffle the dataset
labelled_df = labelled_df.sample(frac=1, random_state=42).reset_index(drop=True)


In [None]:
X,y = labelled_df['text'], labelled_df['label']

In [None]:
X_train_labelled, X_test, y_train_labelled, y_test = train_test_split(X,y, test_size=0.3, random_state=42, stratify = y)

In [None]:
X_train_unlabelled = unlabelled_df['text']
y_train_unlabelled = unlabelled_df['label']

# Combine labeled and unlabeled data for the training set
X_train = pd.concat([X_train_labelled, X_train_unlabelled])
y_train = pd.concat([y_train_labelled, pd.Series(y_train_unlabelled)])

In [None]:
X_train.reset_index(drop=True, inplace = True)
X_test.reset_index(drop=True, inplace = True)
y_train.reset_index(drop=True, inplace = True)
y_test.reset_index(drop=True, inplace = True)

X_train_labelled.reset_index(drop=True, inplace = True)
y_train_labelled.reset_index(drop=True, inplace = True)

In [None]:
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
        text: a string

        return: modified initial string
    """
    text = text.lower() # lowercase text
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join(word for word in text.split() if word not in STOPWORDS) # delete stopwors from text
    return text

labelled_df['text'] = labelled_df['text'].apply(clean_text)


In [None]:
from sklearn.utils.class_weight import compute_class_weight

# Assuming you have a binary classification problem with classes 0 and 1
class_labels = ["Indicative of a Bank Run", "Not Indicative of a Bank Run"]
weights = compute_class_weight(class_weight='balanced', classes=class_labels, y=y_train_labelled)
class_weights = dict(zip(class_labels, weights))


In [None]:
label_names = {0: "Indicative of a Bank Run", 1: "Not Indicative of a Bank Run"}

In [None]:


# Configuration for SVM
model_name = 'SVM'
model = SVC(probability=True, class_weight=class_weights)

params = {
    'clf__C': [1, 10],
    'clf__kernel': ['linear', 'rbf'],
    'vectorizer__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'vectorizer__max_df': (0.5, 0.75, 1.0),
    'vectorizer__max_features': (None, 5000, 10000),
    'tfidf__norm': ('l1', 'l2'),
}

pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', model),
])

grid = GridSearchCV(pipeline, param_grid=params, cv=3, scoring='f1_weighted')
grid.fit(X_train_labelled, y_train_labelled)

best_model = grid.best_estimator_
y_proba = best_model.predict_proba(X_test)
thresholds = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.85, 0.9, 0.95]
for threshold in thresholds:
  y_pred_custom_threshold = (y_proba[:, 0] >= threshold).astype(int)

  y_pred_labels = [label_names[pred] for pred in y_pred_custom_threshold]

  accuracy = accuracy_score(y_test, y_pred_labels)
  report = classification_report(y_test, y_pred_labels, target_names=['Indicative of a Bank Run', 'Not Indicative of a Bank Run'], zero_division=0)

  print(f"\nModel: {model_name} | Threshold: {threshold}")
  print(f"Accuracy: {accuracy}")
  print("Classification Report:")
  print(report)



Model: SVM | Threshold: 0.2
Accuracy: 0.20833333333333334
Classification Report:
                              precision    recall  f1-score   support

    Indicative of a Bank Run       0.00      0.00      0.00        18
Not Indicative of a Bank Run       0.45      0.28      0.34        54

                    accuracy                           0.21        72
                   macro avg       0.23      0.14      0.17        72
                weighted avg       0.34      0.21      0.26        72


Model: SVM | Threshold: 0.3
Accuracy: 0.1527777777777778
Classification Report:
                              precision    recall  f1-score   support

    Indicative of a Bank Run       0.02      0.06      0.03        18
Not Indicative of a Bank Run       0.37      0.19      0.25        54

                    accuracy                           0.15        72
                   macro avg       0.20      0.12      0.14        72
                weighted avg       0.28      0.15      0.19   

In [None]:
# Configuration for Logistic Regression
model_name = 'LogisticRegression'
model = LogisticRegression(class_weight=class_weights)
threshold = 0.5  # Specific threshold for Logistic Regression

params = {
    'clf__C': [1, 10],
    'vectorizer__ngram_range': [(1, 1), (1, 2)],
    'tfidf__use_idf': (True, False),
    'vectorizer__max_df': (0.5, 0.75, 1.0),
    'vectorizer__max_features': (None, 5000, 10000),
    'tfidf__norm': ('l1', 'l2'),
}

pipeline = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', model),
])

grid2 = GridSearchCV(pipeline, param_grid=params, cv=3, scoring='f1_weighted')
grid2.fit(X_train_labelled, y_train_labelled)

best_model2 = grid2.best_estimator_
y_proba2 = best_model2.predict_proba(X_test)
thresholds = [0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.85, 0.9, 0.95]
for threshold in thresholds:
  y_pred_custom_threshold = (y_proba2[:, 0] >= threshold).astype(int)

  y_pred_labels2 = [label_names[pred] for pred in y_pred_custom_threshold]

  accuracy2 = accuracy_score(y_test, y_pred_labels2)
  report2 = classification_report(y_test, y_pred_labels2, target_names=['Indicative of a Bank Run', 'Not Indicative of a Bank Run'], zero_division=0)

  print(f"\nModel: {model_name} | Threshold: {threshold}")
  print(f"Accuracy: {accuracy2}")
  print("Classification Report:")
  print(report2)



Model: LogisticRegression | Threshold: 0.2
Accuracy: 0.3472222222222222
Classification Report:
                              precision    recall  f1-score   support

    Indicative of a Bank Run       0.03      0.06      0.04        18
Not Indicative of a Bank Run       0.59      0.44      0.51        54

                    accuracy                           0.35        72
                   macro avg       0.31      0.25      0.27        72
                weighted avg       0.45      0.35      0.39        72


Model: LogisticRegression | Threshold: 0.3
Accuracy: 0.2222222222222222
Classification Report:
                              precision    recall  f1-score   support

    Indicative of a Bank Run       0.03      0.06      0.03        18
Not Indicative of a Bank Run       0.47      0.28      0.35        54

                    accuracy                           0.22        72
                   macro avg       0.25      0.17      0.19        72
                weighted avg     

In [None]:
grid.best_params_

{'clf__C': 10,
 'clf__kernel': 'linear',
 'tfidf__norm': 'l2',
 'tfidf__use_idf': True,
 'vectorizer__max_df': 0.75,
 'vectorizer__max_features': None,
 'vectorizer__ngram_range': (1, 1)}

In [None]:
grid2.best_params_

{'clf__C': 10,
 'tfidf__norm': 'l2',
 'tfidf__use_idf': False,
 'vectorizer__max_df': 0.5,
 'vectorizer__max_features': None,
 'vectorizer__ngram_range': (1, 2)}