In [2]:
!pip install transformers torch



In [1]:
import pandas as pd
import numpy as np
import torch
from transformers import BertTokenizer, BertModel

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report

from tqdm import tqdm
import time

In [11]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
file_path = '/content/drive/MyDrive/SameSame/messages.xlsx'

try:
    # Trying with UTF-16 encoding
    with open(file_path, 'r', encoding='utf-16') as file:
        content = file.read()
        print(content)
except UnicodeDecodeError:
    # If UTF-16 fails, try with ISO-8859-1
    try:
        with open(file_path, 'r', encoding='iso-8859-1') as file:
            content = file.read()
            print(content)
    except Exception as e:
        print("Failed to read file with UTF-16 and ISO-8859-1:", e)

IOPub data rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_data_rate_limit`.

Current values:
NotebookApp.iopub_data_rate_limit=1000000.0 (bytes/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [6]:
file_path = '/content/drive/MyDrive/SameSame/messages.xlsx'
df = pd.read_excel(file_path)

print(df.head())

                 Date                          Contact UUID Contact Name  \
0 2023-09-30 14:16:53  778aeb9d-d0b8-4124-b65f-7da99ee1e87f     Nickname   
1 2023-09-30 14:20:31  778aeb9d-d0b8-4124-b65f-7da99ee1e87f     Nickname   
2 2023-09-30 14:21:05  778aeb9d-d0b8-4124-b65f-7da99ee1e87f     Nickname   
3 2023-09-30 14:21:12  778aeb9d-d0b8-4124-b65f-7da99ee1e87f     Nickname   
4 2023-09-30 14:21:53  778aeb9d-d0b8-4124-b65f-7da99ee1e87f     Nickname   

  URN Scheme    URN Value        Flow Direction        Text  Unnamed: 8  \
0   whatsapp  27760322567   Main-Menu        IN         NaN         0.0   
1   whatsapp  27760322567  onboarding        IN     👍🏽 Yebo         NaN   
2   whatsapp  27760322567  onboarding        IN    Nickname         NaN   
3   whatsapp  27760322567  onboarding        IN     Yebo 👍🏽         NaN   
4   whatsapp  27760322567  onboarding        IN  🙋🏾‍♂️ Quiz         NaN   

  Attachments   Status   Channel Labels  
0         NaN  handled  WhatsApp    NaN  
1       

In [7]:
df.columns

Index(['Date', 'Contact UUID', 'Contact Name', 'URN Scheme', 'URN Value',
       'Flow', 'Direction', 'Text', 'Unnamed: 8', 'Attachments', 'Status',
       'Channel', 'Labels'],
      dtype='object')

In [8]:
df.head()

Unnamed: 0,Date,Contact UUID,Contact Name,URN Scheme,URN Value,Flow,Direction,Text,Unnamed: 8,Attachments,Status,Channel,Labels
0,2023-09-30 14:16:53,778aeb9d-d0b8-4124-b65f-7da99ee1e87f,Nickname,whatsapp,27760322567,Main-Menu,IN,,0.0,,handled,WhatsApp,
1,2023-09-30 14:20:31,778aeb9d-d0b8-4124-b65f-7da99ee1e87f,Nickname,whatsapp,27760322567,onboarding,IN,👍🏽 Yebo,,,handled,WhatsApp,
2,2023-09-30 14:21:05,778aeb9d-d0b8-4124-b65f-7da99ee1e87f,Nickname,whatsapp,27760322567,onboarding,IN,Nickname,,,handled,WhatsApp,
3,2023-09-30 14:21:12,778aeb9d-d0b8-4124-b65f-7da99ee1e87f,Nickname,whatsapp,27760322567,onboarding,IN,Yebo 👍🏽,,,handled,WhatsApp,
4,2023-09-30 14:21:53,778aeb9d-d0b8-4124-b65f-7da99ee1e87f,Nickname,whatsapp,27760322567,onboarding,IN,🙋🏾‍♂️ Quiz,,,handled,WhatsApp,


In [9]:
unique_values = df['URN Scheme'].unique()
print("Sample unique values:", unique_values[:20])

Sample unique values: ['whatsapp' 'facebook' 'tel']


In [10]:
df.shape

(1048575, 13)

In [11]:
sheet_names = ['Messages 1', 'Messages 2', 'Messages 3']
dfs = [pd.read_excel(file_path, sheet_name=sheet, usecols=['Text']) for sheet in sheet_names]

print(dfs[0].head())

         Text
0         NaN
1     👍🏽 Yebo
2    Nickname
3     Yebo 👍🏽
4  🙋🏾‍♂️ Quiz


In [12]:
combined_df = pd.concat(dfs, ignore_index=True)

print(combined_df.head())
print("Total entries from all sheets:", len(combined_df))

         Text
0         NaN
1     👍🏽 Yebo
2    Nickname
3     Yebo 👍🏽
4  🙋🏾‍♂️ Quiz
Total entries from all sheets: 3053978


In [13]:
combined_df.shape

(3053978, 1)

In [14]:
filtered_df = combined_df[combined_df['Text'].str.len() >= 20]

print(filtered_df.head())

print("Number of entries after filtering:", len(filtered_df))

                                                 Text
17                             Ways to feel better ❤️
23  How can I know to separate the real love and f...
24                              Questions + answers 🤔
32                             Ways to feel better ❤️
39  Oky sometimes I feel like my girl friend is st...
Number of entries after filtering: 117488


In [15]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.width', 1000)

try:
    while True:
        sample_df = filtered_df.sample(n=10)
        print(sample_df['Text'])

        user_input = input("Press Enter to display another 10 entries or type 'stop' to end: ")
        break
        if user_input.lower() == 'stop':
            break
except KeyboardInterrupt:
    print("Stopped by user.")

3034003    aesgcm://binu-messenger.s3.eu-west-1.amazonaws.com/679eccc790d51753b4678bb853eedf958231a627/S3oMzzBBLF6QwDBUJQP5vXFVLOUWo2ppNoaayvhn/iy1cvlZ-Te-zVff5uNOSTQ.png#8edd8193ef4a89ff8dc426ebb2808d2c074412eae8afda9466b523e40a5581460260f2e5f4c7a6e0b26472e86d613686
2046072                                                                                          https://chat.whatsapp.com/ErxlO0FM9qNCfXAxQnEET6\n\nIf you're not added yet, join! ❤️https://chat.whatsapp.com/ErxlO0FM9qNCfXAxQnEET6\n\nIf you're not added yet, join! ❤️
1497774                                                                                                                                                                                                                                              Emergency interruption
2914207                                                                                                                                                                                             

In [16]:
# # Define the file path under the folder 'SameSame'
# file_path = '/content/drive/MyDrive/SameSame/filtered_data.csv'

# # Save the DataFrame to CSV in the specified path
# filtered_df.to_csv(file_path, index=False)  # index=False if you don't want to save the row indices

# print(f'DataFrame saved successfully to {file_path}')

##Import Filtered DataFrame From Here

In [12]:
# Path to your CSV file on Google Drive
file_path = '/content/drive/MyDrive/SameSame/filtered_data.csv'

# Read the CSV file
filtered_df = pd.read_csv(file_path)

# Display the DataFrame to confirm it's loaded correctly
print(filtered_df.head())

                                                Text
0                             Ways to feel better ❤️
1  How can I know to separate the real love and f...
2                              Questions + answers 🤔
3                             Ways to feel better ❤️
4  Oky sometimes I feel like my girl friend is st...


In [13]:
# # Load pre-trained BERT tokenizer and model
# tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
# model = BertModel.from_pretrained('bert-base-uncased')

# # Tokenize the text data
# tokens = tokenizer(filtered_df['Text'].tolist(), padding=True, truncation=True, return_tensors='pt')

# # Get the hidden states from BERT
# with torch.no_grad():
#     outputs = model(**tokens)
#     features = outputs.last_hidden_state.mean(dim=1).numpy()  # Mean pooling over the token dimension

In [14]:
# Define a function to process text in batches with a progress bar
def process_in_batches(text_list, tokenizer, model, batch_size=16):
    all_features = []
    for i in tqdm(range(0, len(text_list), batch_size), desc="Processing Batches"):
        batch_texts = text_list[i:i + batch_size]
        tokens = tokenizer(batch_texts, padding=True, truncation=True, return_tensors='pt').to('cuda')

        with torch.no_grad():
            outputs = model(**tokens)
            batch_features = outputs.last_hidden_state.mean(dim=1).cpu().numpy()
            all_features.append(batch_features)

    return np.concatenate(all_features, axis=0)

In [15]:
keywords = ['hopeless', 'hard', 'can’t go on', 'point of living', 'suicide', 'kill myself', 'end my life', 'worthless', 'no way out', 'kill', 'hurt']

def label_message(message, keywords):
    # Check if any keyword is in the message
    for keyword in keywords:
        if keyword in message.lower():
            return 1
    return 0

# Apply heuristic labeling
filtered_df['label'] = filtered_df['Text'].apply(label_message, keywords=keywords)

In [16]:
count_label_1 = (filtered_df['label'] == 1).sum()

# Print the count
print("Number of rows with label = 1:", count_label_1)

Number of rows with label = 1: 3865


In [17]:
# Count the number of labels
num_ones = (filtered_df['label'] == 1).sum()
num_zeros = (filtered_df['label'] == 0).sum()

# Ensure there are enough 0s to match the number of 1s
if num_ones > num_zeros:
    raise ValueError("Not enough negative samples to balance the dataset. Adjust your approach or heuristic.")

In [18]:
# Select all 1s and an equal number of 0s
ones_df = filtered_df[filtered_df['label'] == 1]
zeros_df = filtered_df[filtered_df['label'] == 0].sample(n=num_ones, random_state=42)

# Combine the selected 1s and 0s
balanced_df = pd.concat([ones_df, zeros_df]).sample(frac=1, random_state=42)  # Shuffle the combined dataset

# Check the balance of the new dataset
print("Number of samples in the balanced dataset:", len(balanced_df))
print("Number of 1s:", (balanced_df['label'] == 1).sum())
print("Number of 0s:", (balanced_df['label'] == 0).sum())

Number of samples in the balanced dataset: 7730
Number of 1s: 3865
Number of 0s: 3865


In [19]:
# Load pre-trained BERT tokenizer and model
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased').cuda()

# Process the text data in batches for the balanced dataset
batch_size = 16

start_time = time.time()

features_balanced = process_in_batches(balanced_df['Text'].tolist(), tokenizer, model, batch_size=batch_size)

end_time = time.time()
print(f"Feature extraction time for balanced dataset: {end_time - start_time:.2f} seconds")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]



config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

Processing Batches: 100%|██████████| 484/484 [00:21<00:00, 22.80it/s]

Feature extraction time for balanced dataset: 21.24 seconds





In [20]:
# Prepare data for training
X_balanced = features_balanced
y_balanced = balanced_df['label'].values

# Split the balanced data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_balanced, y_balanced, test_size=0.2, random_state=42)

# Train a logistic regression model on the balanced dataset
clf = LogisticRegression()
clf.fit(X_train, y_train)

# Predict on the test set and evaluate the model
y_pred = clf.predict(X_test)
print("Model evaluation on balanced dataset:")
print(classification_report(y_test, y_pred))

Model evaluation on balanced dataset:
              precision    recall  f1-score   support

           0       0.89      0.86      0.88       813
           1       0.85      0.89      0.87       733

    accuracy                           0.87      1546
   macro avg       0.87      0.88      0.87      1546
weighted avg       0.88      0.87      0.87      1546



STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [21]:
# Process the text data in batches for the entire dataset
start_time = time.time()

features_all = process_in_batches(filtered_df['Text'].tolist(), tokenizer, model, batch_size=batch_size)

end_time = time.time()
print(f"Feature extraction time for entire dataset: {end_time - start_time:.2f} seconds")

Processing Batches: 100%|██████████| 7343/7343 [03:08<00:00, 39.00it/s]


Feature extraction time for entire dataset: 188.41 seconds


In [25]:
# Use the trained model to predict labels for the entire dataset
X_all = features_all
filtered_df['predicted_label'] = clf.predict(X_all)

# Ensure the predicted_label column is created correctly
print("Sample of predicted labels:", filtered_df[['Text', 'label', 'predicted_label']].head())

Sample of predicted labels:                                                 Text  label  predicted_label
0                             Ways to feel better ❤️      0                0
1  How can I know to separate the real love and f...      0                0
2                              Questions + answers 🤔      0                0
3                             Ways to feel better ❤️      0                0
4  Oky sometimes I feel like my girl friend is st...      0                0


In [26]:
# Evaluate the model on the entire dataset
print("Model evaluation on the entire dataset:")
print(classification_report(filtered_df['label'], filtered_df['predicted_label']))

Model evaluation on the entire dataset:
              precision    recall  f1-score   support

           0       1.00      0.86      0.93    113623
           1       0.19      0.92      0.31      3865

    accuracy                           0.87    117488
   macro avg       0.59      0.89      0.62    117488
weighted avg       0.97      0.87      0.91    117488



In [27]:
# Count the number of rows with label 1 in the original dataset
num_risk_rows = (filtered_df['predicted_label'] == 1).sum()
print(f"Number of rows predicted as 1 (risk of suicide) in the entire dataset: {num_risk_rows}")

Number of rows predicted as 1 (risk of suicide) in the entire dataset: 18911


In [28]:
# Print some random rows predicted as 1
num_samples = 10
risk_samples = filtered_df[filtered_df['predicted_label'] == 1].sample(n=num_samples, random_state=42)
print("Random samples of rows predicted as 1:")
print(risk_samples.to_string(index=False))

Random samples of rows predicted as 1:
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                 Text  label  predicted_label
                                                                                                                                                                                                                   

In [32]:
import os
import zipfile
from transformers import BertTokenizer, BertForSequenceClassification

# Assuming 'model' and 'tokenizer' are your trained model and tokenizer
output_dir = './drive/MyDrive/SameSame/model_save/'

# Save model and tokenizer
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)

# Zip the directory
zipf = zipfile.ZipFile('model_save.zip', 'w', zipfile.ZIP_DEFLATED)
for root, dirs, files in os.walk(output_dir):
    for file in files:
        zipf.write(os.path.join(root, file), os.path.relpath(os.path.join(root, file), output_dir))
zipf.close()

print(f"Model and tokenizer saved and zipped to model_save.zip")

Model and tokenizer saved and zipped to model_save.zip
