In [51]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, concatenate, Dropout
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.optimizers.schedules import ExponentialDecay
from sklearn.metrics import r2_score

In [None]:
# Read the dataset from the CSV file into a DataFrame and preview the first rows.
csv_path = "Assembly.csv"
df = pd.read_csv(csv_path)
df.head()

# Approach 1: Building a Neural Network with LSTM Layers

![Image](./first_image.png)

In [64]:
# Build a word-level vocabulary from the free-text 'Issues' column.
issue_series = df['Issues']
tokenizer = Tokenizer()
tokenizer.fit_on_texts(issue_series)

# One more than the number of known words; the extra slot presumably covers
# index 0, which is what the padded positions contain — TODO confirm.
vocab_size = len(tokenizer.word_index) + 1

# Every issue is converted to integer ids and padded at the end to a fixed length.
max_seq_length = 10
issue_sequences = tokenizer.texts_to_sequences(issue_series)
X_text = pad_sequences(issue_sequences, maxlen=max_seq_length, padding='post')


In [None]:
# Display the vocabulary size computed above (number of tokenizer words + 1).
vocab_size

In [55]:
# Every column except the text column and the target is used as a numeric feature.
numeric_frame = df.drop(columns=['Issues', 'Working_days'])

# Standardise the numeric features with a freshly fitted scaler.
scaler = StandardScaler()
X_numeric = scaler.fit_transform(numeric_frame.values)

# Regression target.
y = df['Working_days'].values



In [56]:
# Split the *unscaled* numeric features together with the text sequences and the
# target, then fit the scaler on the training rows only. Fitting the scaler on
# the full dataset (train + test) lets test-set statistics leak into training.
X_numeric_raw = df.drop(columns=['Issues', 'Working_days']).values
X_text_train, X_text_test, X_numeric_train_raw, X_numeric_test_raw, y_train, y_test = train_test_split(
    X_text, X_numeric_raw, y, test_size=0.2, random_state=42
)

# `scaler` now holds training-only statistics; the test rows are transformed
# with those same statistics, exactly as unseen data would be at inference time.
scaler = StandardScaler()
X_numeric_train = scaler.fit_transform(X_numeric_train_raw)
X_numeric_test = scaler.transform(X_numeric_test_raw)

In [None]:
# Text branch: integer token ids -> trainable embeddings -> bidirectional LSTM.
text_input = Input(shape=(max_seq_length,), name='text_input')

# mask_zero=True makes the downstream LSTM skip the zero-padded positions that
# pad_sequences appended, so the sequence summary is not computed over padding.
# The deprecated `input_length` argument is dropped; the length is already
# fixed by the Input shape above.
embedding_layer = Embedding(input_dim=vocab_size, output_dim=128, mask_zero=True)(text_input)

# A single vector per issue (return_sequences is left at its default).
lstm_out = Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2))(embedding_layer)

In [58]:
# Numeric branch input: one value per scaled numeric feature column.
n_numeric_features = X_numeric.shape[1]
numeric_input = Input(shape=(n_numeric_features,), name='numeric_input')

In [59]:
# Join the BiLSTM output with the numeric input tensor into one feature vector.
combined = concatenate([lstm_out, numeric_input])

In [60]:
# Regression head on top of the merged features: 64 -> 32 -> 1 (linear output).
hidden_64 = Dense(64, activation='relu')(combined)
x = Dense(32, activation='relu')(hidden_64)
output = Dense(1, activation='linear')(x)

### Train the model

In [61]:
# Stop training once the validation loss has not improved for 3 epochs and
# roll the weights back to the best epoch seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

# Two-input model (text ids + numeric features) trained with MSE; MAE is tracked as a metric.
model = Model(inputs=[text_input, numeric_input], outputs=output)
model.compile(optimizer='adam', loss='mse', metrics=['mae'])

In [None]:
# Hold out the last 20% of the *training* data for validation instead of
# validating on the test set. Early stopping (with restore_best_weights) picks
# the epoch from the validation loss, so validating on the test set would make
# the test metrics reported afterwards optimistically biased.
history = model.fit(
    [X_text_train, X_numeric_train], y_train,
    validation_split=0.2,
    epochs=20,
    batch_size=4,
    callbacks=[early_stopping]
)

### Evaluation and Performance Metrics

In [None]:
# Score the LSTM model on the held-out test inputs.
test_inputs = [X_text_test, X_numeric_test]
loss, mae = model.evaluate(test_inputs, y_test)

# Predictions are needed separately to compute R².
y_pred = model.predict(test_inputs)
r2 = r2_score(y_test, y_pred)

print(f"Test Loss: {loss}, Test MAE: {mae}")
print(f"R² Score: {r2}")

In [None]:
# Show the predictions next to the true targets for a quick visual comparison.
y_pred,y_test

In [None]:
# Save the trained Approach 1 (LSTM) model to an .h5 file.
model.save('text_num_lstm/lstm_model_latest.h5')

## Approach 2: Generating embeddings with SciBERT and training the neural network without separate layers for text and numeric data

![Image1](./second_image.png)

In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Input, Dense, concatenate
from tensorflow.keras.models import Model

### Loading the SciBERT model from the Hugging Face model hub

In [None]:
# Load the tokenizer and the encoder from the same pretrained checkpoint.
# NOTE: the name `model1` is reused further down in this notebook for the Keras
# regressor, so this cell must be re-run before extracting embeddings again.
scibert_checkpoint = "allenai/scibert_scivocab_uncased"
tokenizer1 = AutoTokenizer.from_pretrained(scibert_checkpoint)
model1 = AutoModel.from_pretrained(scibert_checkpoint)

In [None]:
# Report the SciBERT vocabulary size under its own name. Assigning it to
# `vocab_size` would overwrite the Keras-tokenizer vocabulary size that the
# Approach 1 Embedding layer reads, which silently changes that model if the
# Approach 1 cells are re-run after this one.
scibert_vocab_size = tokenizer1.vocab_size
print("Vocabulary size:", scibert_vocab_size)

### Tokenization Function for Texts

In [7]:
def tokenize_text(texts, tokenizer1, max_length=50):
    """Tokenize a batch of texts with the given tokenizer callable.

    Args:
        texts: The texts to encode; passed to the tokenizer unchanged.
        tokenizer1: Callable tokenizer; invoked as
            ``tokenizer1(texts, padding=..., truncation=..., max_length=..., return_tensors=...)``.
        max_length: Maximum sequence length forwarded to the tokenizer (truncation is enabled).

    Returns:
        Whatever the tokenizer returns for these arguments (requested as "pt" tensors).
    """
    tokenizer_kwargs = {
        "padding": True,
        "truncation": True,
        "max_length": max_length,
        "return_tensors": "pt",
    }
    return tokenizer1(texts, **tokenizer_kwargs)

In [8]:
# Encode every issue description in one call (default max_length of tokenize_text).
issue_text_list = df['Issues'].tolist()
encodings = tokenize_text(issue_text_list, tokenizer1)

### Extracting Text Features with SciBERT Model

In [9]:
# Run SciBERT over the tokenized issues in mini-batches instead of one single
# forward pass over the whole dataset, so peak memory stays bounded no matter
# how many rows there are. The encodings were already padded to a common
# length, so slicing them row-wise feeds the model the same inputs per row.
embedding_batch_size = 32
n_texts = encodings['input_ids'].shape[0]
feature_batches = []
with torch.no_grad():  # inference only: no gradients are needed
    for start in range(0, n_texts, embedding_batch_size):
        stop = start + embedding_batch_size
        outputs = model1(
            input_ids=encodings['input_ids'][start:stop],
            attention_mask=encodings['attention_mask'][start:stop]
        )
        # Keep only the pooled sentence-level vector of each row, as a NumPy array.
        feature_batches.append(outputs.pooler_output.numpy())

# One row of text features per issue, in the original row order.
text_features = np.concatenate(feature_batches, axis=0)

In [None]:
# Cache the embeddings on disk: computing them is slow on a large dataset, and
# loading the file avoids re-running the extraction cell above every time.
import os

# np.save does not create missing parent directories, so make sure it exists.
os.makedirs('text_num_lstm', exist_ok=True)
np.save('text_num_lstm/text_features.npy', text_features)

In [24]:
# Reload the cached SciBERT embeddings from disk.
text_features_path = "text_num_lstm/text_features.npy"
text_features = np.load(text_features_path)

### Feature Engineering

In [25]:
# Numeric data: every column except the text column and the target,
# standardised with a newly fitted scaler.
numeric_frame = df.drop(columns=['Issues', 'Working_days'])
scaler = StandardScaler()
X_numeric = scaler.fit_transform(numeric_frame.values)

In [26]:
# Combined data (text embeddings followed by numeric columns).
# The numeric columns are taken *unscaled* here and standardised after the
# split, so the scaler only ever sees training rows. Scaling before the split
# leaks test-set mean/variance into the training features.
n_text_features = text_features.shape[1]
X_numeric_raw = df.drop(columns=['Issues', 'Working_days']).values.astype(np.float64)
X_combined = np.hstack([text_features, X_numeric_raw])

y = df['Working_days'].values

X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

# Fit on the training rows, then apply the same statistics to the test rows.
# Only the numeric block (columns after the text embedding) is scaled.
scaler = StandardScaler()
X_train[:, n_text_features:] = scaler.fit_transform(X_train[:, n_text_features:])
X_test[:, n_text_features:] = scaler.transform(X_test[:, n_text_features:])

In [None]:
# Sanity check: (number of rows, text-feature columns + numeric columns).
print(X_combined.shape)

### Feed-forward neural network

In [28]:
# Single-input regressor over the concatenated (text + numeric) feature vector.
input_dim = X_combined.shape[1]
input_layer = Input(shape=(input_dim,))

# Two hidden ReLU layers followed by a one-unit linear output.
dense_1 = Dense(
    64,
    activation='relu',
)(input_layer)
dense_2 = Dense(
    32,
    activation='relu',
)(dense_1)
output = Dense(
    1,
    activation='linear',
)(dense_2)

# NOTE: this rebinds `model1`, which previously referred to the SciBERT encoder.
model1 = Model(inputs=input_layer, outputs=output)
model1.compile(
    optimizer='adam',
    loss='mse',
    metrics=['mae'],
)


In [None]:
# Print the layer-by-layer architecture of the feed-forward regressor.
model1.summary()

In [None]:

# Train the model.
# Validation data is carved out of the training set (last 20%) rather than
# taken from the test set: early stopping selects the best epoch from the
# validation loss, so using the test set there would bias the test metrics.
history1 = model1.fit(
    X_train,
    y_train,
    validation_split=0.2,
    epochs=20,
    batch_size=4,
    callbacks=[early_stopping]
)

### Model Evaluation and Performance Metrics

In [None]:
# Test-set loss/MAE as computed by Keras, plus R² from the raw predictions.
loss, mae = model1.evaluate(X_test, y_test)

y_pred = model1.predict(X_test)
r2 = r2_score(y_test, y_pred)

print(f"Test Loss: {loss}, Test MAE: {mae}")
print(f"R² Score: {r2}")

In [None]:
# Save the trained Approach 2 (SciBERT features + feed-forward) model to an .h5 file.
model1.save('text_num_lstm/Withscibert_model.h5')

## Approach 3: Training a Neural Network with Separate Layers for Numeric and Text Data Using the SciBERT Model

![Image2](./Third_image.png)

In [35]:
def build_model(text_input_dim, numeric_input_dim):
    """Build and compile a two-branch regression model.

    Each input gets its own Dense(128, relu) + Dropout(0.2) branch; the branches
    are concatenated, passed through Dense(64, relu) + Dropout(0.2), and mapped
    to a single output unit with no activation.

    Args:
        text_input_dim: Width of the text feature vector (size of the 'text_input' input).
        numeric_input_dim: Width of the numeric feature vector (size of the 'numeric_input' input).

    Returns:
        A compiled Keras Model taking [text_input, numeric_input], using the
        'adam' optimizer, mean-squared-error loss and an MAE metric.
    """
    # Inputs, one per feature group.
    text_in = Input(shape=(text_input_dim,), name='text_input')
    numeric_in = Input(shape=(numeric_input_dim,), name='numeric_input')

    # Text branch.
    text_hidden = Dense(128, activation='relu')(text_in)
    text_hidden = Dropout(0.2)(text_hidden)

    # Numeric branch.
    numeric_hidden = Dense(128, activation='relu')(numeric_in)
    numeric_hidden = Dropout(0.2)(numeric_hidden)

    # Fuse both branches and add a shared hidden layer.
    fused = concatenate([text_hidden, numeric_hidden])
    fused = Dense(64, activation='relu')(fused)
    fused = Dropout(0.2)(fused)

    # Single-unit output for the regression target.
    prediction = Dense(1)(fused)

    two_branch_model = Model(inputs=[text_in, numeric_in], outputs=prediction)
    two_branch_model.compile(optimizer='adam', loss='mean_squared_error', metrics=['mae'])
    return two_branch_model

In [36]:
# Column counts of the two feature groups. They size the two model inputs and
# are also used below to slice the combined matrix back into text / numeric parts.
text_input_dim = text_features.shape[1]  
numeric_input_dim = X_numeric.shape[1]

In [None]:
# Display both input widths.
text_input_dim,numeric_input_dim

In [None]:

# Instantiate the two-branch model and print its architecture.
model2 = build_model(text_input_dim, numeric_input_dim)
model2.summary()

### Training the model

In [None]:
# The first `text_input_dim` columns of the combined matrix are the text
# features; the remaining columns are the numeric features.
train_inputs = [X_train[:, :text_input_dim], X_train[:, text_input_dim:]]

# Validate on the last 20% of the training data, not on the test set: early
# stopping chooses the best epoch from the validation loss, so validating on
# the test set would bias the test metrics reported afterwards.
history2 = model2.fit(
    train_inputs,
    y_train,
    validation_split=0.2,
    epochs=20,
    batch_size=4,
    callbacks=[early_stopping]
)

### Model Evaluation and Performance Metrics

In [None]:
# Split the combined test matrix back into its text and numeric parts.
test_inputs = [X_test[:, :text_input_dim], X_test[:, text_input_dim:]]

loss, mae = model2.evaluate(test_inputs, y_test)

# Flatten the predictions to 1-D before computing R².
y_pred = model2.predict(test_inputs).flatten()
r2 = r2_score(y_test, y_pred)

print(f"Test Loss: {loss}, Test MAE: {mae}")
print(f"R² Score: {r2}")

In [None]:
# Save the trained Approach 3 (separate text/numeric branches) model to an .h5 file.
model2.save('text_num_lstm/SeperatelayersSciBert_model.h5')