In [336]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
import warnings
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

warnings.filterwarnings('ignore')

In [337]:
def load_and_preprocess_data(file_path):
    df = pd.read_csv(file_path)

    # Fill NaN values in the 'text' column
    df.text = df.text.fillna('')

    # Convert labels to integers
    df['label'] = df.label.replace({'False': 0, 'True': 1, '0.0': 0, '1.0': 1, '1': 1, '0': 0, '2.0' : 1})

    return df

In [338]:
pd.set_option('display.max_rows', None)
# Load the dataset
file_path = '250811_training_data.csv'
data = load_and_preprocess_data(file_path)

# Display the first few rows of the dataframe
data.head(500)


Unnamed: 0.1,Unnamed: 0,index,text,label,mandate,page_no
0,0,0,PROSPECTUS T. Rowe Price Funds OEIC,0.0,labelled\consorsbank_t_rowe_price_funds,0
1,1,1,23 December 2022,0.0,labelled\consorsbank_t_rowe_price_funds,0
2,2,2,Equity Funds,0.0,labelled\consorsbank_t_rowe_price_funds,0
3,3,3,Asian Opportunities Equity Fund (PRN: 757525) ...,0.0,labelled\consorsbank_t_rowe_price_funds,0
4,4,4,Global Impact Equity Fund (PRN: 964358) Global...,0.0,labelled\consorsbank_t_rowe_price_funds,0
5,5,5,US Blue Chip Equity Fund (PRN: 836377) US Equi...,0.0,labelled\consorsbank_t_rowe_price_funds,0
6,6,6,Bond Funds,0.0,labelled\consorsbank_t_rowe_price_funds,0
7,7,7,Dynamic Global Bond Fund (PRN: 757530) Global ...,0.0,labelled\consorsbank_t_rowe_price_funds,0
8,8,0,INVEST WITH CONFIDENCE,0.0,labelled\consorsbank_t_rowe_price_funds,10
9,9,1,11,0.0,labelled\consorsbank_t_rowe_price_funds,10


In [339]:
# Step 1: Merge 'text' and 'mandate' columns
data['merged_text'] = data['text'] + ' ' + data['mandate']

# Step 2: Drop NA values
data.dropna(subset=['merged_text', 'label'], inplace=True)

# Display the first few rows of the modified dataframe
data.head(100)


Unnamed: 0.1,Unnamed: 0,index,text,label,mandate,page_no,merged_text
0,0,0,PROSPECTUS T. Rowe Price Funds OEIC,0.0,labelled\consorsbank_t_rowe_price_funds,0,PROSPECTUS T. Rowe Price Funds OEIC labelled\c...
1,1,1,23 December 2022,0.0,labelled\consorsbank_t_rowe_price_funds,0,23 December 2022 labelled\consorsbank_t_rowe_p...
2,2,2,Equity Funds,0.0,labelled\consorsbank_t_rowe_price_funds,0,Equity Funds labelled\consorsbank_t_rowe_price...
3,3,3,Asian Opportunities Equity Fund (PRN: 757525) ...,0.0,labelled\consorsbank_t_rowe_price_funds,0,Asian Opportunities Equity Fund (PRN: 757525) ...
4,4,4,Global Impact Equity Fund (PRN: 964358) Global...,0.0,labelled\consorsbank_t_rowe_price_funds,0,Global Impact Equity Fund (PRN: 964358) Global...
5,5,5,US Blue Chip Equity Fund (PRN: 836377) US Equi...,0.0,labelled\consorsbank_t_rowe_price_funds,0,US Blue Chip Equity Fund (PRN: 836377) US Equi...
6,6,6,Bond Funds,0.0,labelled\consorsbank_t_rowe_price_funds,0,Bond Funds labelled\consorsbank_t_rowe_price_f...
7,7,7,Dynamic Global Bond Fund (PRN: 757530) Global ...,0.0,labelled\consorsbank_t_rowe_price_funds,0,Dynamic Global Bond Fund (PRN: 757530) Global ...
8,8,0,INVEST WITH CONFIDENCE,0.0,labelled\consorsbank_t_rowe_price_funds,10,INVEST WITH CONFIDENCE labelled\consorsbank_t_...
9,9,1,11,0.0,labelled\consorsbank_t_rowe_price_funds,10,11 labelled\consorsbank_t_rowe_price_funds


In [340]:
# Split the data into training and testing sets
X = data['merged_text']
y = data['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=346)

In [341]:
#Create a Bag-of-Words representation
vectorizer = CountVectorizer()
X_train_bow = vectorizer.fit_transform(X_train)
X_test_bow = vectorizer.transform(X_test)

In [342]:
# Train a simple logistic regression classifier
classifier = LogisticRegression(random_state=346)
classifier.fit(X_train_bow, y_train)

#random_state=871

In [343]:
# Evaluate the model on the testing set
y_pred = classifier.predict(X_test_bow)

# Calculate accuracy, precision, recall, and F1-score
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average=None)
recall = recall_score(y_test, y_pred, average=None)
f1 = f1_score(y_test, y_pred, average=None)

# Create a dictionary to store the results with labels
results = {
    "Accuracy": accuracy,
    "Precision": precision,
    "Recall": recall,
    "F1 Score": f1
}

results

{'Accuracy': 0.9683918669131238,
 'Precision': array([0.97874876, 0.8315508 , 0.        ]),
 'Recall': array([0.98717949, 0.75120773, 0.        ]),
 'F1 Score': array([0.98294605, 0.7893401 , 0.        ])}

The Logistic Regression classifier has been trained on the Bag-of-Words representation of the merged text data. Here are the evaluation metrics for the classifier on the test set:

In [344]:
## Generate the confusion matrix
#cm = confusion_matrix(y_test, y_pred)
## Plot the confusion matrix
#sns.heatmap(cm, annot=True, fmt='d')
#plt.ylabel('Actual')
#plt.xlabel('Predicted')
#plt.show()