In [5]:
import os
import pandas as pd
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
import multiprocessing
from concurrent.futures import ProcessPoolExecutor
from gensim.models.doc2vec import Doc2Vec
from gensim.models import FastText
import multiprocessing
from multiprocessing import Pool
import ast
import csv
import cProfile
import pstats

In [6]:
# Process all text files in the knowledge directory
def process_text_file(directory):
    tagged_data = []
    for filename in os.listdir(directory):
        if filename.endswith(".txt"):
            file_path = os.path.join(directory, filename)
            with open(file_path, 'r') as file:
                text = file.read()
                tokens = text.split() # Tokenize the text
                tagged_data.append(TaggedDocument(words=tokens, tags=["text_file"]))
    return tagged_data

In [7]:
# Worker function to process a single CSV file
def process_single_csv(file_path):
    tagged_data = []
    try:
        df = pd.read_csv(file_path, quoting=csv.QUOTE_NONE, on_bad_lines='skip')

        # Identify all columns of type 'object' (string)
        text_columns = [col for col in df.columns if df[col].dtype == 'object']

        if not text_columns:
            print(f"No suitable text columns found in {file_path}. Skipping file.")
            return tagged_data

        print(f"Using columns {text_columns} from {file_path}")
        for index, row in df.iterrows():
            tokens = []
            for col in text_columns:
                column_data = row[col]
                # Safely evaluate the string if it looks like a list, otherwise split normally
                column_tokens = ast.literal_eval(column_data) if (isinstance(column_data, str) and column_data.startswith('[') and column_data.endswith(']')) else str(column_data).split()
                tokens.extend(column_tokens)
            tagged_data.append(TaggedDocument(words=tokens, tags=[f"{os.path.basename(file_path)}_{index}"]))
    except Exception as e:
        print(f"Error processing file {file_path}: {e}")
    return tagged_data

In [9]:
def process_csv_files(directory):
    files = [os.path.join(directory, filename) for filename in os.listdir(directory) if filename.endswith(".csv")]
    
    # Initialize a pool of worker processes
    with Pool(os.cpu_count()) as p:
        results = p.map(process_single_csv, files)

    # Flatten the list of tagged_data lists into a single list
    tagged_data = [item for sublist in results for item in sublist]
    return tagged_data

In [11]:
# Directory paths
directory = "../data/knowledge/preprocess"
d2v_directory = os.path.join(directory, 'd2v')
os.makedirs(d2v_directory, exist_ok=True)
# Initialize tagged data list
tagged_data = []

# Process the all text file that ends with .txt in the directory preprocess under knowledge folder
tagged_data.extend(process_text_file(directory))

# Process all CSV files in the preprocess directory of knowledge folder
tagged_data.extend(process_csv_files(directory))

Using columns ['text'] from ../data/knowledge/preprocess/labelled_disease_or_not_processed.csv
Using columns ['subreddit', 'comment'] from ../data/knowledge/preprocess/reddit_types_processed.csv
Using columns ['title', 'selftext', 'subreddit'] from ../data/knowledge/preprocess/all_types_processed.csv


In [12]:
# Train the Doc2Vec model for txt and csv files of knowledge folder using GPU
model_d2v_combined = Doc2Vec(vector_size=300, alpha=0.025, min_alpha=0.00025, min_count=2, dm=0, workers=multiprocessing.cpu_count(), epochs=10)
model_d2v_combined.build_vocab(tagged_data)
model_d2v_combined.train(tagged_data, total_examples=model_d2v_combined.corpus_count, epochs=model_d2v_combined.epochs)

# Save the model of knowledge folder
model_d2v_combined.save(os.path.join(d2v_directory, 'model_d2v_combined.model'))

In [13]:
# Directory paths for sample_q_and_a files
directory = "../data/sample_q_and_a/preprocess"
d2v_directory = os.path.join(directory, 'd2v')
os.makedirs(d2v_directory, exist_ok=True)
# Initialize tagged data list
tagged_data = []
# Process all CSV files in the directory preprocess under sample_qa folder
tagged_data.extend(process_csv_files(directory))

Using columns ['Context', 'Knowledge', 'Response'] from ../data/sample_q_and_a/preprocess/rules_facts_processed.csv
Using columns ['topic', 'interlocutor', 'utterance_text', 'main_therapist_behaviour', 'client_talk_type'] from ../data/sample_q_and_a/preprocess/sentiment_analysis_1_processed.csv
Using columns ['Timestamp', 'Age', 'Gender', 'Country', 'state', 'self_employed', 'family_history', 'treatment', 'work_interfere', 'no_employees', 'remote_work', 'tech_company', 'benefits', 'care_options', 'wellness_program', 'seek_help', 'anonymity', 'leave', 'mental_health_consequence', 'phys_health_consequence', 'coworkers', 'supervisor', 'mental_health_interview', 'phys_health_interview', 'mental_vs_physical', 'obs_consequence', 'comments'] from ../data/sample_q_and_a/preprocess/survey_processed.csv
Using columns ['question', 'response_j', 'response_k'] from ../data/sample_q_and_a/preprocess/response_1_processed.csv
Using columns ['text'] from ../data/sample_q_and_a/preprocess/sample_answers

In [14]:
model_d2v_qa = Doc2Vec(vector_size=300, alpha=0.025, min_alpha=0.00025, min_count=1, dm=0, workers=multiprocessing.cpu_count(), epochs=100)
model_d2v_qa.build_vocab(tagged_data)
model_d2v_qa.train(tagged_data, total_examples=model_d2v_qa.corpus_count, epochs=model_d2v_qa.epochs)
# Save the model of sample_q_and_a folder
model_d2v_qa.save(os.path.join(d2v_directory, 'model_d2v_qa.model'))

In [15]:
# view the model_d2v_qa.model file
for i in range(10):
    print(model_d2v_combined.docvecs[i])

[ 8.29383552e-01  7.54026771e-01  3.96785319e-01  1.13442969e+00
  1.09294987e+00 -1.32580388e+00 -1.94602573e+00 -4.84175146e-01
 -2.33159328e+00 -3.24903607e-01 -9.48597193e-02  6.55176282e-01
  3.81311983e-01 -1.05736947e+00 -1.13197935e+00  1.41103208e+00
  2.02487445e+00 -8.46682042e-02 -1.53128278e+00 -2.61327863e-01
  1.20582871e-01 -6.56596720e-01  2.84462035e-01  6.79582834e-01
 -5.68560660e-01  2.55709291e-01 -1.76785886e-01  4.38891858e-01
  3.80288869e-01 -1.34164310e+00 -6.07771039e-01 -1.75387299e+00
 -3.75462949e-01 -1.45938098e-02 -9.38461125e-01  1.21344042e+00
 -1.19422436e+00  7.94954300e-01  2.13757634e+00  6.58112586e-01
 -8.30578879e-02  1.26778448e+00 -8.01871598e-01  5.62757030e-02
 -3.22153211e-01  1.23324728e+00 -1.45443901e-01 -8.34156096e-01
 -7.50939429e-01 -8.75484884e-01  7.24555254e-01  8.54056001e-01
  2.47008801e-01 -8.41714799e-01  7.39165306e-01 -6.98035300e-01
  7.16117501e-01 -7.08604276e-01 -1.75860441e+00  2.41023555e-01
 -6.39629483e-01  2.48854

  print(model_d2v_combined.docvecs[i])


In [16]:
for i in range(10):
    print(model_d2v_combined.wv.index_to_key[i])

nan
'like'
'im'
'dont'
'get'
'one'
'people'
'time'
'thing'
'would'
