

### Json Model


In [1]:
!pip install sentence-transformers==2.2.2



In [2]:
import pandas as pd
import numpy as np
import os

# Define the path to the folder containing the CSV files
# folder_path = r'/content/processed'
folder_path = r'./processed'

# List to hold dataframes
dfs = []

# Loop through all files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('.csv'):  # Check if the file is a CSV
        file_path = os.path.join(folder_path, filename)
        df = pd.read_csv(file_path)  # Load the CSV file

        # Check if 'IsQuestion' column exists
        if 'IsQuestion' in df.columns:
            # Convert NaN values to 0 and then the column to integers
            df['IsQuestion'] = df['IsQuestion'].fillna(0).astype(int)
            dfs.append(df)  # Append the dataframe to the list

# Concatenate all dataframes in the list
combined_df = pd.concat(dfs, ignore_index=True)

# If you want to save the combined dataframe to a new CSV file
# combined_df.to_csv('combined_csv.csv', index=False)

# Show the first few rows of the combined dataframe
combined_df.head()


Unnamed: 0,Cell Data,Filename,Table Number,IsQuestion
0,AS PRIMEIRAS PERGUNTAS SÃO SOBRE ÁLCOOL,C:\Users\tates\Downloads\pdf-questionnaire-ext...,1,0
1,"Q1. ALGUMA VEZ NA VIDA, você já tomou alguma \...",C:\Users\tates\Downloads\pdf-questionnaire-ext...,1,1
2,( ) Sim \n( ) Não,C:\Users\tates\Downloads\pdf-questionnaire-ext...,1,0
3,"Q2. SE SIM, que IDADE você tinha quando tomou ...",C:\Users\tates\Downloads\pdf-questionnaire-ext...,1,1
4,____ ____ anos \n( ) Não se aplica (nunca t...,C:\Users\tates\Downloads\pdf-questionnaire-ext...,1,0


In [3]:
combined_df.shape

(1028, 4)

In [4]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(combined_df['Cell Data'].fillna(''), combined_df['IsQuestion'], test_size=0.1, random_state=42)

# Ensure X_train and X_test are lists
X_train_list = X_train.tolist()
X_test_list = X_test.tolist()

In [5]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sentence_transformers import SentenceTransformer

class SentenceTransformerEmbedder(BaseEstimator, TransformerMixin):
    def __init__(self, model_name='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'):
        self.model = SentenceTransformer(model_name)

    def fit(self, X, y=None):
        return self  # nothing to fit

    def transform(self, X):
        # Convert texts to embeddings
        embeddings = self.model.encode(X, show_progress_bar=True)
        return np.array(embeddings)


In [6]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score

# Initialize the pipeline with XGBClassifier
model_xgb = Pipeline([
    ('embedder', SentenceTransformerEmbedder()),
    ('classifier', XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)),
])

# Train the model
model_xgb.fit(X_train_list, y_train)

# Predict the labels for the test data
y_pred_xgb = model_xgb.predict(X_test_list)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("\nClassification Report:\n", classification_report(y_test, y_pred_xgb))


Batches:   0%|          | 0/29 [00:00<?, ?it/s]

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

Accuracy: 0.970873786407767

Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.98      0.98        63
           1       0.97      0.95      0.96        40

    accuracy                           0.97       103
   macro avg       0.97      0.97      0.97       103
weighted avg       0.97      0.97      0.97       103



In [None]:
import pickle

with open('models/json_xgb.pkl', 'wb') as f:
    pickle.dump(model_xgb, f)

In [7]:
import numpy as np
import random

filepaths = combined_df['Filename'].unique()
random.shuffle(filepaths)

def split_data(filepaths, n_validation, all_files):
    validation_files = filepaths[:n_validation]
    training_files = set(all_files) - set(validation_files)
    return validation_files, training_files

n_validation_files = 2

all_files = filepaths
scores = []
while len(filepaths) > 0:
    validation_files, training_files = split_data(filepaths, n_validation_files, all_files)

    validation_data = combined_df[combined_df['Filename'].isin(validation_files)]

    training_data = combined_df[combined_df['Filename'].isin(training_files)]

    model_xgb = Pipeline([
    ('embedder', SentenceTransformerEmbedder()),
    ('classifier', XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)),
    ])
    model_xgb.fit(training_data['Cell Data'].tolist(), training_data['IsQuestion'])

    y_pred_xgb = model_xgb.predict(validation_data['Cell Data'].tolist())
    score = accuracy_score(validation_data['IsQuestion'], y_pred_xgb)
    scores.append(score)
    print("Accuracy:", score)
    print("\nClassification Report:\n", classification_report(validation_data['IsQuestion'], y_pred_xgb))
    filepaths = filepaths[n_validation_files:]
np.average(scores)


Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Accuracy: 0.9393939393939394

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.92      0.96        24
           1       0.82      1.00      0.90         9

    accuracy                           0.94        33
   macro avg       0.91      0.96      0.93        33
weighted avg       0.95      0.94      0.94        33



Batches:   0%|          | 0/28 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Accuracy: 0.9699248120300752

Classification Report:
               precision    recall  f1-score   support

           0       0.99      0.96      0.97        80
           1       0.95      0.98      0.96        53

    accuracy                           0.97       133
   macro avg       0.97      0.97      0.97       133
weighted avg       0.97      0.97      0.97       133



Batches:   0%|          | 0/21 [00:00<?, ?it/s]

Batches:   0%|          | 0/12 [00:00<?, ?it/s]

Accuracy: 0.9576719576719577

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.96      0.97       283
           1       0.90      0.94      0.92        95

    accuracy                           0.96       378
   macro avg       0.94      0.95      0.94       378
weighted avg       0.96      0.96      0.96       378



Batches:   0%|          | 0/32 [00:00<?, ?it/s]

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Accuracy: 0.8

Classification Report:
               precision    recall  f1-score   support

           0       0.65      1.00      0.79        11
           1       1.00      0.68      0.81        19

    accuracy                           0.80        30
   macro avg       0.82      0.84      0.80        30
weighted avg       0.87      0.80      0.80        30



Batches:   0%|          | 0/31 [00:00<?, ?it/s]

Batches:   0%|          | 0/2 [00:00<?, ?it/s]

Accuracy: 0.98

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.89      0.94         9
           1       0.98      1.00      0.99        41

    accuracy                           0.98        50
   macro avg       0.99      0.94      0.96        50
weighted avg       0.98      0.98      0.98        50



Batches:   0%|          | 0/28 [00:00<?, ?it/s]

Batches:   0%|          | 0/5 [00:00<?, ?it/s]

Accuracy: 0.9612903225806452

Classification Report:
               precision    recall  f1-score   support

           0       0.94      0.99      0.96        83
           1       0.99      0.93      0.96        72

    accuracy                           0.96       155
   macro avg       0.96      0.96      0.96       155
weighted avg       0.96      0.96      0.96       155



Batches:   0%|          | 0/25 [00:00<?, ?it/s]

Batches:   0%|          | 0/8 [00:00<?, ?it/s]

Accuracy: 0.9397590361445783

Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.94      0.96       190
           1       0.82      0.95      0.88        59

    accuracy                           0.94       249
   macro avg       0.90      0.94      0.92       249
weighted avg       0.95      0.94      0.94       249



0.9354342954030279

# text_data

In [8]:
import pandas as pd
import numpy as np
import os

# Define the path to the folder containing the CSV files
# folder_path = r'/content/text_processed'
folder_path = r'./text_processed'


# List to hold dataframes
dfs = []
filenames = []  # List to store filenames

# Loop through all files in the folder
for filename in os.listdir(folder_path):
    if filename.endswith('.csv'):  # Check if the file is a CSV
        # print(filename)  # Print the filename
        file_path = os.path.join(folder_path, filename)
        df = pd.read_csv(file_path)  # Load the CSV file
        # Add a 'Filename' column filled with the current filename
        df['Filename'] = filename
        # Check if 'IsQuestion' column exists
        if 'IsQuestion' in df.columns:
            print(filename)
            # Convert NaN values to 0 and then the column to integers
            df['IsQuestion'] = df['IsQuestion'].fillna(0).astype(int)
            dfs.append(df)  # Append the dataframe to the list
            filenames.append(filename)  # Append filename to the list

# Concatenate all dataframes in the list
combined_df = pd.concat(dfs, ignore_index=True)

# If you want to save the combined dataframe to a new CSV file
# combined_df.to_csv('combined_csv.csv', index=False)

# Show the first few rows of the combined dataframe
combined_df.head()


062_eoin_no_numbers.csv
022_cesd.csv
052_social_attitudes.csv
002_bhrcsparentreportysi.csv
034_bhrcsparentreportsocialcohesio.csv
057_ndshs2016.csv
045_mr0510201tb.csv
021_sdqenglishukpt417single.csv
041_rcadschildreported818.csv
043_nanopdfcomscoresheetfortheorig.csv
007_bhrcsselfreportedscared.csv
019_bhrcsparentreportsocialcohesio.csv
005_beckdepressioninventorybdi.csv
063_nz_election.csv
010_patienthealthquestionnairephq9.csv
027_hrcw2protocolodomiciliaradulto.csv
040_hrcw0psico.csv
026_sf36.csv
047_bhrcsparentreportsdqadult.csv
061_k10.csv
018_scaredformparentandchildversio.csv
036_adhdquestionnaireasrs111.csv
013_hrcw2protocolopsicoadultos.csv
008_gad7anxietyupdated0.csv
038_mcs7youngpersononlinecawiquest.csv
064_hrcw1psicoconf.csv
011_bhrcsparentreportsocialaptitud.csv
054_class_teacher.csv
033_malaise24item.csv
001_patienthealthquestionnaire.csv
048_hrcw2protocolopsicoconfadultos.csv
015_hrcw1dom.csv
046_borderlinepersonalityscreener.csv
037_hrcw0dom.csv
056_race_related.csv
05

Unnamed: 0,Text,IsQuestion,Filename,comment,Unnamed: 2,Note
0,Little interest or pleasure in doing things?,1,062_eoin_no_numbers.csv,,,
1,"Feeling down, depressed, or hopeless?",1,062_eoin_no_numbers.csv,,,
2,"Trouble falling or staying asleep, or sleeping...",1,062_eoin_no_numbers.csv,,,
3,Feeling tired or having little energy?,1,062_eoin_no_numbers.csv,,,
4,Poor appetite or overeating?,1,062_eoin_no_numbers.csv,,,


In [9]:
combined_df.isna().sum()

Text           91967
IsQuestion         0
Filename           0
comment       175777
Unnamed: 2    176383
Note          176405
dtype: int64

In [10]:
combined_df.shape

(176429, 6)

In [11]:
combined_df["IsQuestion"].value_counts()

IsQuestion
0    165065
1     11364
Name: count, dtype: int64

In [12]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(combined_df['Text'], combined_df['IsQuestion'], test_size=0.1, random_state=42)

# Ensure X_train and X_test are lists
X_train_list = X_train.tolist()
X_test_list = X_test.tolist()

In [13]:
X_train_list = [str(text) for text in X_train_list]
X_test_list = [str(text) for text in X_test_list]

In [14]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score

# Initialize the pipeline with XGBClassifier
model_xgb = Pipeline([
    ('embedder', SentenceTransformerEmbedder()),
    ('classifier', XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)),
])

# Train the model
model_xgb.fit(X_train_list, y_train)

# Predict the labels for the test data
y_pred_xgb = model_xgb.predict(X_test_list)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("\nClassification Report:\n", classification_report(y_test, y_pred_xgb))


Batches:   0%|          | 0/4963 [00:00<?, ?it/s]

Batches:   0%|          | 0/552 [00:00<?, ?it/s]

Accuracy: 0.9446239301706059

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.99      0.97     16497
           1       0.64      0.34      0.44      1146

    accuracy                           0.94     17643
   macro avg       0.80      0.66      0.71     17643
weighted avg       0.93      0.94      0.94     17643



In [15]:
combined_df["Text"] = combined_df["Text"].fillna('')

In [16]:
import pandas as pd

df = combined_df[['Text', 'IsQuestion']]

# Function to combine rows of 'Text' column
def combine_text_rows(df):
    combined_text = []
    combined_other = []
    current_text = ''
    current_other = ''

    for index, row in df.iterrows():
        if row['Text'] != '':
            current_text += str(row['Text']) + ' '
            current_other = row['IsQuestion']
        else:
            if current_text != '':
                combined_text.append(current_text.strip())
                combined_other.append(current_other)
                current_text = ''

    # If there's remaining text after the loop ends
    if current_text != '':
        combined_text.append(current_text.strip())
        combined_other.append(current_other)

    combined_df = pd.DataFrame({'Text': combined_text, 'IsQuestion': combined_other})
    return combined_df

# Call the function to combine rows
processed_df = combine_text_rows(df)
print(processed_df)


                                                    Text  IsQuestion
0      Little interest or pleasure in doing things? F...           1
1      Center for Epidemiologic Studies Depression Sc...           0
2      Below is a list of the ways you might have fel...           0
3                                   During the Past Week           0
4                  Rarely or none of the time (less than           0
...                                                  ...         ...
52356         Q46. If the mother moved who was informed?           1
52357              First Move | Second Move | Third Move           0
52358  Person informed A0366 A0369 A0372 GAP. eeeetce...           0
52359   Domiciliary. ...........:::::00000 A0366C A0369C           0
52360  G.P. Unit... eeeeeeeees A0366D A0369D Hospital...           0

[52361 rows x 2 columns]


In [17]:
processed_df.to_csv("processed.csv", index=False)

In [18]:
processed_df.shape

(52361, 2)

In [19]:
processed_df["IsQuestion"].value_counts()

IsQuestion
0    44890
1     7471
Name: count, dtype: int64

In [20]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(processed_df['Text'], processed_df['IsQuestion'], test_size=0.1, random_state=42)

# Ensure X_train and X_test are lists
X_train_list = X_train.tolist()
X_test_list = X_test.tolist()

In [21]:
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score

# Initialize the pipeline with XGBClassifier
model_xgb = Pipeline([
    ('embedder', SentenceTransformerEmbedder()),
    ('classifier', XGBClassifier(use_label_encoder=False, eval_metric='logloss', n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)),
])

# Train the model
model_xgb.fit(X_train_list, y_train)

# Predict the labels for the test data
y_pred_xgb = model_xgb.predict(X_test_list)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred_xgb))
print("\nClassification Report:\n", classification_report(y_test, y_pred_xgb))

Batches:   0%|          | 0/1473 [00:00<?, ?it/s]

Batches:   0%|          | 0/164 [00:00<?, ?it/s]

Accuracy: 0.8789383234676341

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.97      0.93      4488
           1       0.66      0.32      0.43       749

    accuracy                           0.88      5237
   macro avg       0.78      0.65      0.68      5237
weighted avg       0.86      0.88      0.86      5237



In [22]:
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from xgboost import XGBClassifier
from sklearn.metrics import classification_report, accuracy_score

In [23]:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sentence_transformers import SentenceTransformer

class SentenceTransformerEmbedder(BaseEstimator, TransformerMixin):
    def __init__(self, model_name='sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'):
        self.model = SentenceTransformer(model_name)

    def fit(self, X, y=None):
        return self  # nothing to fit

    def transform(self, X):
        # Convert texts to embeddings
        embeddings = self.model.encode(X, show_progress_bar=True)
        return np.array(embeddings)

In [24]:
base_learners = [('rf_model', RandomForestClassifier(criterion = 'entropy', max_depth = 10, max_features = 'sqrt',
                                                     max_leaf_nodes = 8, min_samples_leaf = 5, min_samples_split = 2,
                                                     n_estimators = 50, random_state = 10)),
                 ('KNN_model', KNeighborsClassifier(n_neighbors = 17, metric = 'euclidean')),
                 ('NB_model', GaussianNB())]

In [25]:

stack_model = Pipeline([
    ('embedder', SentenceTransformerEmbedder()),
    ('classifier',StackingClassifier(estimators = base_learners, final_estimator = GaussianNB())),
])

# Train the model
stack_model.fit(X_train_list, y_train)

# Predict the labels for the test data
y_pred_stack = stack_model.predict(X_test_list)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test, y_pred_stack))
print("\nClassification Report:\n", classification_report(y_test, y_pred_stack))

Batches:   0%|          | 0/1473 [00:00<?, ?it/s]

Batches:   0%|          | 0/164 [00:00<?, ?it/s]

Accuracy: 0.8651899942715295

Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.88      0.92      4488
           1       0.52      0.81      0.63       749

    accuracy                           0.87      5237
   macro avg       0.74      0.84      0.77      5237
weighted avg       0.90      0.87      0.88      5237



In [26]:
import pickle

with open('models/stack_model_without_preprocessing.pkl', 'wb') as f:
    pickle.dump(stack_model, f)

In [28]:
#for undersampling
import pandas as pd
from sklearn.model_selection import train_test_split

# Assuming combined_df is your DataFrame
# Separate the classes
class_0 = processed_df[processed_df['IsQuestion'] == 0]
class_1 = processed_df[processed_df['IsQuestion'] == 1]

# Sample 30% of class 0 without replacement
class_0_sampled = class_0.sample(frac=0.3, random_state=42)

# Combine the sampled class 0 data with all of class 1 data
combined_sampled_df = pd.concat([class_0_sampled, class_1])

# Now you split the combined_sampled_df into training and testing sets
X_train_us, X_test_us, y_train_us, y_test_us = train_test_split(
    combined_sampled_df['Text'],
    combined_sampled_df['IsQuestion'],
    test_size=0.1,
    random_state=42
)

# Ensure X_train and X_test are lists if needed
X_train_list_us = X_train_us.tolist()
X_test_list_us = X_test_us.tolist()

In [29]:
import pickle

with open('models/us_data.pkl', 'wb') as f:
    pickle.dump(combined_sampled_df, f)

In [30]:

stack_model = Pipeline([
    ('embedder', SentenceTransformerEmbedder()),
    ('classifier',StackingClassifier(estimators = base_learners, final_estimator = GaussianNB())),
])

# Train the model
stack_model.fit(X_train_list_us, y_train_us)

# Predict the labels for the test data
y_pred_stack_us = stack_model.predict(X_test_list_us)

# Evaluate the model
print("Accuracy:", accuracy_score(y_test_us, y_pred_stack_us))
print("\nClassification Report:\n", classification_report(y_test_us, y_pred_stack_us))

Batches:   0%|          | 0/589 [00:00<?, ?it/s]

Batches:   0%|          | 0/66 [00:00<?, ?it/s]

Accuracy: 0.8042024832855779

Classification Report:
               precision    recall  f1-score   support

           0       0.90      0.79      0.84      1342
           1       0.69      0.84      0.75       752

    accuracy                           0.80      2094
   macro avg       0.79      0.81      0.80      2094
weighted avg       0.82      0.80      0.81      2094



In [31]:
import pickle

with open('models/stack_model_with_undersampling.pkl', 'wb') as f:
    pickle.dump(stack_model, f)

In [32]:
import numpy as np
from sklearn.utils import class_weight

class_weights = list(class_weight.compute_class_weight(class_weight= 'balanced',
                                             classes = np.unique(combined_sampled_df['IsQuestion']),
                                             y = combined_sampled_df['IsQuestion']))

#df['Product'].value_counts()

class_weights.sort()

class_weights

weightsus={}


for index, weight in enumerate(class_weights) :
    weightsus[index]=weight


In [33]:
weightsus

{0: 0.7773817479765353, 1: 1.4012849685450408}

In [34]:
import numpy as np
from sklearn.utils import class_weight

class_weights = list(class_weight.compute_class_weight(class_weight= 'balanced',
                                             classes = np.unique(processed_df['IsQuestion']),
                                             y = processed_df['IsQuestion']))

#df['Product'].value_counts()

class_weights.sort()

class_weights

weights={}


for index, weight in enumerate(class_weights) :
    weights[index]=weight

In [35]:
from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Assuming 'combined_df' is your DataFrame containing the data

# Parameters
embedding_dim = 384  # Adjusted to match the Sentence Transformer model's embedding size

# Load the pretrained Sentence Transformer model
model_name = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
sentence_model = SentenceTransformer(model_name)

# Function to preprocess text data using Sentence Transformer
def get_embeddings(sentences):
    return sentence_model.encode(sentences, batch_size=64, show_progress_bar=True)

# Split the dataset into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(combined_df['Cell Data'], combined_df['IsQuestion'], test_size=0.1, random_state=42)

# Convert X_train and X_test to lists to ensure proper indexing
# X_train_list = X_train.tolist()
# X_test_list = X_test.tolist()

# Preprocess the text data to get embeddings
train_embeddings = get_embeddings(X_train_list)
test_embeddings = get_embeddings(X_test_list)

# Reshape the embeddings to add a "sequence" dimension
train_embeddings = np.expand_dims(train_embeddings, axis=1)
test_embeddings = np.expand_dims(test_embeddings, axis=1)

# LSTM Model Architecture
model = tf.keras.Sequential([
    # Input shape is now (1, embedding_dim) since we're treating each embedding as a single timestep
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim, return_sequences=True), input_shape=(1, embedding_dim)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=False)),  # Example dimension reduction
    tf.keras.layers.Dense(64, activation='relu'),  # Example additional dense layer
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Overview of the model
model.summary()

# Training parameters
batch_size = 256
epochs = 25


history = model.fit(
    train_embeddings, np.array(y_train),
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(test_embeddings, np.array(y_test)),
    class_weight=weights,  # Ensure 'weights' is defined as per your original code
    callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)]
)



Batches:   0%|          | 0/737 [00:00<?, ?it/s]

Batches:   0%|          | 0/82 [00:00<?, ?it/s]

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional (Bidirection  (None, 1, 768)            2362368   
 al)                                                             
                                                                 
 bidirectional_1 (Bidirecti  (None, 256)               918528    
 onal)                                                           
                                                                 
 dense (Dense)               (None, 64)                16448     
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 3297409 (12.58 MB)
Trainable params: 32974

In [39]:
y_pred = model.predict(test_embeddings)

threshold = 0.75
y_pred_labels = (y_pred > threshold).astype(int)

from sklearn.metrics import classification_report, accuracy_score

classification= classification_report(y_test, y_pred_labels)
print(classification)

              precision    recall  f1-score   support

           0       0.97      0.90      0.93      4488
           1       0.58      0.82      0.68       749

    accuracy                           0.89      5237
   macro avg       0.77      0.86      0.81      5237
weighted avg       0.91      0.89      0.90      5237



In [40]:
model.save('models/lstm_without_undersampling.h5')

  saving_api.save_model(


In [41]:

# Then by loading the model back
import tensorflow as tf
new_model = tf.keras.models.load_model('models/lstm_without_undersampling.h5')

# Check its architecture
new_model.summary()


Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional (Bidirection  (None, 1, 768)            2362368   
 al)                                                             
                                                                 
 bidirectional_1 (Bidirecti  (None, 256)               918528    
 onal)                                                           
                                                                 
 dense (Dense)               (None, 64)                16448     
                                                                 
 dropout (Dropout)           (None, 64)                0         
                                                                 
 dense_1 (Dense)             (None, 1)                 65        
                                                                 
Total params: 3297409 (12.58 MB)
Trainable params: 32974

In [42]:
#LSTM with undersampling

from sentence_transformers import SentenceTransformer
import numpy as np
from sklearn.model_selection import train_test_split
import tensorflow as tf

# Assuming 'combined_df' is your DataFrame containing the data

# Parameters
embedding_dim = 384  # Adjusted to match the Sentence Transformer model's embedding size

# Load the pretrained Sentence Transformer model
model_name = 'sentence-transformers/paraphrase-multilingual-MiniLM-L12-v2'
sentence_model = SentenceTransformer(model_name)

# Function to preprocess text data using Sentence Transformer
def get_embeddings(sentences):
    return sentence_model.encode(sentences, batch_size=64, show_progress_bar=True)

# Split the dataset into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(combined_df['Cell Data'], combined_df['IsQuestion'], test_size=0.1, random_state=42)

# Convert X_train and X_test to lists to ensure proper indexing
# X_train_list = X_train.tolist()
# X_test_list = X_test.tolist()

# Preprocess the text data to get embeddings
train_embeddings = get_embeddings(X_train_list_us)
test_embeddings = get_embeddings(X_test_list_us)

# Reshape the embeddings to add a "sequence" dimension
train_embeddings = np.expand_dims(train_embeddings, axis=1)
test_embeddings = np.expand_dims(test_embeddings, axis=1)

# LSTM Model Architecture
model = tf.keras.Sequential([
    # Input shape is now (1, embedding_dim) since we're treating each embedding as a single timestep
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(embedding_dim, return_sequences=True), input_shape=(1, embedding_dim)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, return_sequences=False)),  # Example dimension reduction
    tf.keras.layers.Dense(64, activation='relu'),  # Example additional dense layer
    tf.keras.layers.Dropout(0.2),
    tf.keras.layers.Dense(1, activation='sigmoid')
])

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Overview of the model
model.summary()

# Training parameters
batch_size = 256
epochs = 25


history = model.fit(
    train_embeddings, np.array(y_train_us),
    batch_size=batch_size,
    epochs=epochs,
    validation_data=(test_embeddings, np.array(y_test_us)),
    class_weight=weightsus,  # Ensure 'weights' is defined as per your original code
    callbacks=[tf.keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, min_delta=0.0001)]
)



Batches:   0%|          | 0/295 [00:00<?, ?it/s]

Batches:   0%|          | 0/33 [00:00<?, ?it/s]

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 bidirectional_2 (Bidirecti  (None, 1, 768)            2362368   
 onal)                                                           
                                                                 
 bidirectional_3 (Bidirecti  (None, 256)               918528    
 onal)                                                           
                                                                 
 dense_2 (Dense)             (None, 64)                16448     
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_3 (Dense)             (None, 1)                 65        
                                                                 
Total params: 3297409 (12.58 MB)
Trainable params: 329

In [43]:
y_pred = model.predict(test_embeddings)

threshold = 0.75
y_pred_labels = (y_pred > threshold).astype(int)

from sklearn.metrics import classification_report, accuracy_score

classification= classification_report(y_test_us, y_pred_labels)
print(classification)

              precision    recall  f1-score   support

           0       0.91      0.87      0.89      1342
           1       0.78      0.85      0.81       752

    accuracy                           0.86      2094
   macro avg       0.85      0.86      0.85      2094
weighted avg       0.87      0.86      0.86      2094



In [44]:
model.save(r'\models\lstm_with_undersampling.keras')