In [6]:
###############
# Vaibhav Lakshmi
# Bahar Chidem
################

import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import pandas as pd
import transformers
print(transformers.__version__)

# Reproducibility: seed TensorFlow before the classification head is created
# (its weights are newly initialised) and before any shuffling takes place.
SEED = 42
BATCH_SIZE = 16
tf.random.set_seed(SEED)

# Load the labelled ideas (read with the ISO-8859-1 encoding).
file_path = '/Users/bahar/Downloads/first_1100_rows.csv'
df = pd.read_csv(file_path, encoding='ISO-8859-1')
# Model input is the problem text followed by the solution text; missing
# values become empty strings so the concatenation never produces NaN.
df['text'] = df['problem'].fillna('') + " " + df['solution'].fillna('')

# Encode the 'Potential' class names as integer ids. The fitted encoder is
# kept so predictions can be mapped back to class names later.
label_encoder = LabelEncoder()
df['label'] = label_encoder.fit_transform(df['Potential'])

# Splitting the dataset (80% train / 20% test, fixed seed)
X_train, X_test, y_train, y_test = train_test_split(df['text'], df['label'], test_size=0.2, random_state=SEED)

# Ensure all text data is in string format
X_train = X_train.astype(str)
X_test = X_test.astype(str)

# Load BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Tokenization: truncate to at most 512 tokens and pad each split to its longest sequence
train_encodings = tokenizer(X_train.tolist(), truncation=True, padding=True, max_length=512)
test_encodings = tokenizer(X_test.tolist(), truncation=True, padding=True, max_length=512)

# Convert to TensorFlow datasets.
# `train_examples` is the un-batched view; it is what gets shuffled for training.
train_examples = tf.data.Dataset.from_tensor_slices((dict(train_encodings), y_train))
train_dataset = train_examples.batch(BATCH_SIZE)
test_dataset = tf.data.Dataset.from_tensor_slices((dict(test_encodings), y_test)).batch(BATCH_SIZE)

# Load BERT model with a classification layer (one output per encoded class)
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=len(label_encoder.classes_))

# Model Compilation. The model outputs raw logits, hence from_logits=True.
optimizer = tf.keras.optimizers.Adam(learning_rate=5e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model.compile(optimizer=optimizer, loss=loss, metrics=['accuracy'])

# Shuffle individual examples *before* batching, with a buffer that covers the
# whole training set. Shuffling an already-batched dataset only permutes the
# order of the batches and keeps the same examples together in every epoch.
shuffled_train_dataset = train_examples.shuffle(len(X_train), seed=SEED).batch(BATCH_SIZE)
model.fit(shuffled_train_dataset, epochs=3)
model.evaluate(test_dataset)

4.36.2


All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Epoch 2/3
Epoch 3/3


[0.23785936832427979, 0.9363636374473572]

In [None]:
# The code above trains the model.
#####################################
# The code below predicts the Potential of each idea and writes the predictions to a CSV file.

In [7]:
import pandas as pd
from transformers import BertTokenizer
import tensorflow as tf

# Tokenizer matching the checkpoint the classifier was fine-tuned from.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Input and output locations -- change these to score another dataset or to
# write the predictions somewhere else.
file_path_new = '/Users/bahar/Downloads/rest_of_the_rows_selected_columns.csv'
output_file_path = '/Users/bahar/Desktop/predictions2.csv'

# Build the model input as "<problem> <solution>", with missing values
# replaced by empty strings.
df_new = pd.read_csv(file_path_new)
df_new['text'] = df_new['problem'].fillna('') + " " + df_new['solution'].fillna('')

# Tokenize (truncate to at most 512 tokens, pad to the longest sequence) and
# feed the encodings to the model in batches of 16.
new_encodings = tokenizer(
    df_new['text'].tolist(),
    truncation=True,
    padding=True,
    max_length=512,
)
new_data = tf.data.Dataset.from_tensor_slices(dict(new_encodings)).batch(16)
new_predictions = model.predict(new_data)

# Logits -> class probabilities -> index of the most probable class -> class name.
logits = new_predictions.logits
probabilities = tf.nn.softmax(logits, axis=-1)
predicted_indices = tf.argmax(probabilities, axis=1)
predicted_labels = label_encoder.inverse_transform(predicted_indices)

# One row per idea: its text next to the predicted class, saved without the index column.
results_df = pd.DataFrame(
    {'text': df_new['text'], 'predicted_potential': predicted_labels}
)
results_df.to_csv(output_file_path, index=False)



In [None]:
# The code below generates feedback for each predicted label and writes a CSV file
# containing both the predicted potential and the corresponding feedback.

In [8]:
# Modify this to run on another dataset
file_path_new = '/Users/bahar/Downloads/rest_of_the_rows_selected_columns.csv'
# Load the file here instead of relying on a `df_new` left in the kernel by an
# earlier cell, so that changing the path above actually takes effect.
df_new = pd.read_csv(file_path_new)
# Model input is "<problem> <solution>", with missing values replaced by empty strings.
df_new['text'] = df_new['problem'].fillna('') + " " + df_new['solution'].fillna('')

# Tokenize (truncate to at most 512 tokens, pad to the longest sequence) and
# run the model in batches of 16.
new_encodings = tokenizer(df_new['text'].tolist(), truncation=True, padding=True, max_length=512)
new_data = tf.data.Dataset.from_tensor_slices((dict(new_encodings))).batch(16)
new_predictions = model.predict(new_data)

# Processing predictions: logits -> probabilities -> most probable class index -> class name
logits = new_predictions.logits
probabilities = tf.nn.softmax(logits, axis=-1)
predicted_indices = tf.argmax(probabilities, axis=1)
predicted_labels = label_encoder.inverse_transform(predicted_indices)

# Generate feedback based on predicted labels
def generate_feedback(label):
    """Return the canned feedback sentence for a predicted potential label.

    Args:
        label: Predicted class name, e.g. 'High Potential'.

    Returns:
        The feedback text associated with ``label``, or
        'No feedback available' when the label is not one of the four
        known classes.
    """
    fallback = 'No feedback available'
    messages_by_label = dict([
        ('High Potential',
         'This idea shows high potential with strong resource efficiency, sustainability, and economic viability.'),
        ('Medium Potential',
         'This idea has medium potential. Consider improvements in resource efficiency, sustainability, or economic viability.'),
        ('Low Potential',
         'This idea has low potential. Assess and enhance resource efficiency, sustainability, and economic viability.'),
        ('Very Low Potential',
         'This idea has very low potential. A major rethink or overhaul may be required to improve its viability.'),
    ])
    # Exact-match lookup: anything outside the known labels gets the fallback.
    if label in messages_by_label:
        return messages_by_label[label]
    return fallback

# One feedback sentence per prediction, in the same order as `predicted_labels`.
feedback = list(map(generate_feedback, predicted_labels))

# Modify this for the output
output_file_path = '/Users/bahar/Desktop/predictions_with_feedback.csv'

# Put the idea text, its predicted class and the matching feedback side by
# side, then save as CSV without the index column.
results_columns = {
    'text': df_new['text'],
    'predicted_potential': predicted_labels,
    'feedback': feedback,
}
results_df = pd.DataFrame(results_columns)
results_df.to_csv(output_file_path, index=False)

