In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Input, Embedding, Bidirectional, LSTM, Dense, concatenate
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# Load the assembly dataset from a CSV file in the working directory.
df = pd.read_csv("Assembly.csv")

In [None]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

# Embeddings with LSTM and Tokenizer

In [4]:
# Build a word-level vocabulary from the free-text issue descriptions and turn
# every description into a fixed-length sequence of integer word ids.
# Missing descriptions are replaced with an empty string first: pandas stores
# empty cells as float NaN, and the Keras Tokenizer expects string input.
issue_texts = df['Issues'].fillna('').astype(str)

tokenizer = Tokenizer()
tokenizer.fit_on_texts(issue_texts)
vocab_size = len(tokenizer.word_index) + 1  # +1 because word ids start at 1; id 0 is the pad value
max_seq_length = 10  # every sequence is padded/truncated to this many tokens
X_text = pad_sequences(
    tokenizer.texts_to_sequences(issue_texts),
    maxlen=max_seq_length,
    padding='post',  # zeros are appended after the real tokens
)


In [6]:
# Numeric features: every column except the free text and the regression target.
numeric_raw = df.drop(columns=['Issues', 'Working_days']).values

# Fit the scaler on the training rows only, so that the mean/variance of the
# held-out rows do not leak into preprocessing. The row selection uses the
# same test_size/random_state as the train/test split performed later in this
# notebook, so it picks exactly the rows that end up in the training set.
# Keep these two values in sync with that split.
train_rows, held_out_rows = train_test_split(
    np.arange(len(numeric_raw)), test_size=0.2, random_state=42
)
scaler = StandardScaler()
scaler.fit(numeric_raw[train_rows])
X_numeric = scaler.transform(numeric_raw)

# Regression target.
y = df['Working_days'].values



In [7]:
# Split text sequences, numeric features and target with one call so that the
# three arrays stay row-aligned; 20% of the rows are held out.
(
    X_text_train, X_text_test,
    X_numeric_train, X_numeric_test,
    y_train, y_test,
) = train_test_split(X_text, X_numeric, y, test_size=0.2, random_state=42)

In [8]:
# Text branch: word ids -> learned 128-d embeddings -> bidirectional LSTM summary.
text_input = Input(shape=(max_seq_length,), name='text_input')
# `input_length` is intentionally not passed: the sequence length is already
# fixed by `text_input`, and recent Keras releases deprecate the argument.
# mask_zero=True marks id 0 (the value used for padding) as "no token" so the
# LSTM skips padded timesteps instead of reading them as real words.
embedding_layer = Embedding(
    input_dim=vocab_size,
    output_dim=128,
    mask_zero=True,
)(text_input)
lstm_out = Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2))(embedding_layer)



In [9]:
# Second model input: one vector of scaled numeric features per sample.
n_numeric_features = X_numeric.shape[1]
numeric_input = Input(shape=(n_numeric_features,), name='numeric_input')

In [10]:
# Join the LSTM text summary and the numeric features along the feature axis.
combined = concatenate([lstm_out, numeric_input], axis=-1)

In [13]:
# Regression head: two ReLU layers (64 then 32 units) followed by a single
# linear unit that emits the predicted value.
x = combined
for units in (64, 32):
    x = Dense(units, activation='relu')(x)
output = Dense(1, activation='linear')(x)

In [14]:
# Assemble the two-input regression model. It is optimised with Adam on mean
# squared error, and mean absolute error is reported as an extra metric.
model_inputs = [text_input, numeric_input]
model = Model(inputs=model_inputs, outputs=output)
model.compile(
    loss='mse',
    optimizer='adam',
    metrics=['mae'],
)

In [None]:
# Train for 20 epochs in mini-batches of 4 samples. The held-out split is
# passed as validation data so its loss/MAE are reported after every epoch.
train_inputs = [X_text_train, X_numeric_train]
held_out = ([X_text_test, X_numeric_test], y_test)
history = model.fit(
    x=train_inputs,
    y=y_train,
    batch_size=4,
    epochs=20,
    validation_data=held_out,
)

In [None]:
# Score the trained model on the held-out split. evaluate() returns the loss
# followed by the compiled metrics, which is unpacked here as (loss, mae).
test_metrics = model.evaluate([X_text_test, X_numeric_test], y_test)
loss, mae = test_metrics
print(f"Test Loss: {loss}, Test MAE: {mae}")

In [None]:
from sklearn.metrics import r2_score

# Predict the target for the held-out rows and report the coefficient of
# determination of those predictions against the true values.
held_out_inputs = [X_text_test, X_numeric_test]
y_pred = model.predict(held_out_inputs)

r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f"R² Score: {r2}")

In [None]:
# Display the predictions next to the true targets for a visual comparison.
(y_pred, y_test)

## Embeddings with SciBERT

In [None]:
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Input, Dense, concatenate
from tensorflow.keras.models import Model

In [None]:
# Load the pretrained SciBERT tokenizer and encoder from the same checkpoint.
# NOTE: this rebinds `tokenizer` and `model`, which earlier cells used for the
# Keras tokenizer and the Keras LSTM model.
scibert_checkpoint = "allenai/scibert_scivocab_uncased"
tokenizer = AutoTokenizer.from_pretrained(scibert_checkpoint)
model = AutoModel.from_pretrained(scibert_checkpoint)

In [22]:
# Tokenize Text
def tokenize_text(texts, tokenizer, max_length=50):
    """Tokenize `texts` with `tokenizer` and return whatever it produces.

    Args:
        texts: The text input handed to the tokenizer as its first
            positional argument.
        tokenizer: A callable tokenizer. It is invoked with padding and
            truncation enabled and asked for PyTorch tensors ("pt").
        max_length: Maximum sequence length forwarded to the tokenizer.

    Returns:
        The tokenizer's return value, unchanged.
    """
    tokenizer_options = {
        "padding": True,
        "truncation": True,
        "max_length": max_length,
        "return_tensors": "pt",
    }
    return tokenizer(texts, **tokenizer_options)

In [23]:
# Tokenize every issue description for SciBERT. Missing descriptions (NaN in
# pandas) are replaced with empty strings first, because the Hugging Face
# tokenizer only accepts string input and raises on float values.
issue_text_list = df['Issues'].fillna('').astype(str).tolist()
encodings = tokenize_text(issue_text_list, tokenizer)

In [24]:
# Encode the tokenized texts with SciBERT and keep the pooled output of each
# text as its feature vector. The rows are pushed through the encoder in
# mini-batches so that peak memory does not grow with the dataset size; a
# single forward pass over all rows can exhaust memory on larger datasets.
BERT_BATCH_SIZE = 32
n_texts = encodings['input_ids'].shape[0]

model.eval()  # inference only: make sure dropout is disabled
pooled_batches = []
with torch.no_grad():  # no gradients are needed for feature extraction
    for start in range(0, n_texts, BERT_BATCH_SIZE):
        stop = start + BERT_BATCH_SIZE
        outputs = model(
            input_ids=encodings['input_ids'][start:stop],
            attention_mask=encodings['attention_mask'][start:stop],
        )
        pooled_batches.append(outputs.pooler_output)

# Stack the per-batch results back into one (n_texts, hidden_size) array.
text_features = torch.cat(pooled_batches, dim=0).cpu().numpy()

In [26]:
# Numeric features: every column except the free text and the regression target.
numeric_raw = df.drop(columns=['Issues', 'Working_days']).values

# Fit the scaler on the training rows only, so that the mean/variance of the
# held-out rows do not leak into preprocessing. The row selection uses the
# same test_size/random_state as the train/test split performed in the next
# cell, so it picks exactly the rows that end up in the training set.
# Keep these two values in sync with that split.
train_rows, held_out_rows = train_test_split(
    np.arange(len(numeric_raw)), test_size=0.2, random_state=42
)
scaler = StandardScaler()
scaler.fit(numeric_raw[train_rows])
X_numeric = scaler.transform(numeric_raw)

In [27]:
# Feature matrix: the SciBERT text features followed, column-wise, by the
# scaled numeric features (both are 2-D with one row per sample).
X_combined = np.concatenate([text_features, X_numeric], axis=1)

# Regression target.
y = df['Working_days'].values

# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(
    X_combined,
    y,
    test_size=0.2,
    random_state=42,
)

In [28]:
# Feed-forward regressor over the combined text + numeric feature vector:
# two ReLU layers (64 then 32 units) and a single linear output unit.
# NOTE: assigning to `model` below replaces the SciBERT encoder that was
# bound to the same name earlier in this section.
input_dim = X_combined.shape[1]
input_layer = Input(shape=(input_dim,))
dense_1 = Dense(units=64, activation='relu')(input_layer)
dense_2 = Dense(units=32, activation='relu')(dense_1)
output = Dense(units=1, activation='linear')(dense_2)

model = Model(inputs=input_layer, outputs=output)
model.compile(
    loss='mse',
    optimizer='adam',
    metrics=['mae'],
)


In [None]:
# Train for 20 epochs in mini-batches of 4 samples; the held-out split is
# passed as validation data so its loss/MAE are reported after every epoch.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=4,
    epochs=20,
    validation_data=(X_test, y_test),
)

In [None]:
# Score the trained model on the held-out split. evaluate() returns the loss
# followed by the compiled metrics, which is unpacked here as (loss, mae).
test_metrics = model.evaluate(X_test, y_test)
loss, mae = test_metrics
print(f"Test Loss: {loss}, Test MAE: {mae}")

In [None]:

# Predict the target for the held-out rows and report the coefficient of
# determination. `r2_score` is imported in an earlier cell of this notebook.
y_pred = model.predict(X_test)

r2 = r2_score(y_true=y_test, y_pred=y_pred)
print(f"R² Score: {r2}")