In [1]:
import pandas as pd
import numpy as np
from tqdm.auto import tqdm
import tensorflow as tf
from transformers import BertTokenizer

In [2]:
# df = pd.read_csv('train.tsv', sep='\t')

df = pd.read_csv('Privacy_classifier_review_sentiment.csv')
sentimentmap={'positive':1, 'negative':0}

df["Sentiment"]=df['Sentiment'].map(sentimentmap)


df.head()

Unnamed: 0.1,Unnamed: 0,Learner Perspective on Privacy,Sentiment
0,0,One of the other reviewers has mentioned that ...,1
1,1,A wonderful little production. <br /><br />The...,1
2,2,I thought this was a wonderful way to spend ti...,1
3,3,Basically there's a family where a little boy ...,0
4,4,"Petter Mattei's ""Love in the Time of Money"" is...",1


In [3]:
df['Sentiment']
df['Learner Perspective on Privacy']

df2=df[['Learner Perspective on Privacy', 'Sentiment']]
df2

Unnamed: 0,Learner Perspective on Privacy,Sentiment
0,One of the other reviewers has mentioned that ...,1
1,A wonderful little production. <br /><br />The...,1
2,I thought this was a wonderful way to spend ti...,1
3,Basically there's a family where a little boy ...,0
4,"Petter Mattei's ""Love in the Time of Money"" is...",1
...,...,...
80995,Privacy-related shortcomings on this platform ...,0
80996,"In a recent privacy breach, I was alarmed to f...",1
80997,The platform's privacy measures reflect a genu...,1
80998,"During my recent travels, I became increasingl...",1


### **Data Preparation**

In [4]:
# df.drop(118202, inplace=True)
df2['Sentiment'] = df2['Sentiment'].astype(int)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df2['Sentiment'] = df2['Sentiment'].astype(int)


In [5]:
df2['Sentiment'].value_counts()

'''
The sentiment labels are:
0 - negative
1 - positive
'''

'\nThe sentiment labels are:\n0 - negative\n1 - positive\n'

In [6]:
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

In [7]:
token = tokenizer.encode_plus(
    df2['Learner Perspective on Privacy'].iloc[0], 
    max_length=256, 
    truncation=True, 
    padding='max_length', 
    add_special_tokens=True,
    return_tensors='tf'
)

In [8]:
token
print (token.input_ids)
print(token.attention_mask)


tf.Tensor(
[[  101  1448  1104  1103  1168 19475  1144  3025  1115  1170  2903  1198
    122 16075  2004  1128   112  1325  1129 14111   119  1220  1132  1268
    117  1112  1142  1110  2839  1184  2171  1114  1143   119   133  9304
    120   135   133  9304   120   135  1109  1148  1645  1115  4168  1143
   1164 16075  1108  1157 28024  1105  8362  2087  2836  7520  4429  1104
   4289   117  1134  1383  1107  1268  1121  1103  1937 27157   119  4623
   1143   117  1142  1110  1136   170  1437  1111  1103  7859 21898  1137
    189  4060  2386   119  1188  1437  7561  1185 19739  1114 12747  1106
   5557   117  2673  1137  4289   119  2098  1110 16883   117  1107  1103
   5263  1329  1104  1103  1937   119   133  9304   120   135   133  9304
    120   135  1135  1110  1270   152  5301  1112  1115  1110  1103  8002
   1549  1106  1103 18638 13878  4354  1426 23544  3150 17103   119  1135
   7203  2871  1113 24464  1392   117  1126  6700  2237  1104  1103  3315
   1187  1155  1103  3652  

In [9]:
X_input_ids = np.zeros((len(df), 256))
X_attn_masks = np.zeros((len(df), 256))

In [10]:
def generate_training_data(df, ids, masks, tokenizer):
    for i, text in tqdm(enumerate( df2['Learner Perspective on Privacy'])):
        tokenized_text = tokenizer.encode_plus(
            text,
            max_length=256, 
            truncation=True, 
            padding='max_length', 
            add_special_tokens=True,
            return_tensors='tf'
        )
        ids[i, :] = tokenized_text.input_ids
        masks[i, :] = tokenized_text.attention_mask
    return ids, masks

In [11]:
X_input_ids, X_attn_masks = generate_training_data(df, X_input_ids, X_attn_masks, tokenizer)

0it [00:00, ?it/s]

In [12]:
labels = np.zeros((len(df2), 2))
labels.shape

(81000, 2)

In [13]:
labels[np.arange(len(df2)), df2['Sentiment'].values] = 1 # one-hot encoded target tensor

In [14]:
labels

array([[0., 1.],
       [0., 1.],
       [0., 1.],
       ...,
       [0., 1.],
       [0., 1.],
       [1., 0.]])

In [15]:
# creating a data pipeline using tensorflow dataset utility, creates batches of data for easy loading...
dataset = tf.data.Dataset.from_tensor_slices((X_input_ids, X_attn_masks, labels))
dataset.take(1) # one sample data

<_TakeDataset element_spec=(TensorSpec(shape=(256,), dtype=tf.float64, name=None), TensorSpec(shape=(256,), dtype=tf.float64, name=None), TensorSpec(shape=(2,), dtype=tf.float64, name=None))>

In [16]:
def SentimentDatasetMapFunction(input_ids, attn_masks, labels):
    return {
        'input_ids': input_ids,
        'attention_mask': attn_masks
    }, labels

In [17]:
dataset = dataset.map(SentimentDatasetMapFunction) # converting to required format for tensorflow dataset 

In [18]:
dataset.take(1)

<_TakeDataset element_spec=({'input_ids': TensorSpec(shape=(256,), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(256,), dtype=tf.float64, name=None)}, TensorSpec(shape=(2,), dtype=tf.float64, name=None))>

In [19]:
dataset = dataset.shuffle(10000).batch(16, drop_remainder=True) # batch size, drop any left out tensor

In [20]:
dataset.take(1)

<_TakeDataset element_spec=({'input_ids': TensorSpec(shape=(16, 256), dtype=tf.float64, name=None), 'attention_mask': TensorSpec(shape=(16, 256), dtype=tf.float64, name=None)}, TensorSpec(shape=(16, 2), dtype=tf.float64, name=None))>

In [21]:
p = 0.9
train_size = int((len(df)//16)*p) # for each 16 batch of data we will have len(df)//16 samples, take 80% of that for train.

In [22]:
train_size

4555

In [23]:
train_dataset = dataset.take(train_size)
val_dataset = dataset.skip(train_size)

### **Model**

In [24]:
from transformers import TFBertModel

In [25]:
model = TFBertModel.from_pretrained('bert-base-cased') # bert base model with pretrained weights

Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFBertModel: ['cls.seq_relationship.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing TFBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertModel for predictions w

In [26]:
# defining 2 input layers for input_ids and attn_masks
input_ids = tf.keras.layers.Input(shape=(256,), name='input_ids', dtype='int32')
attn_masks = tf.keras.layers.Input(shape=(256,), name='attention_mask', dtype='int32')

bert_embds = model.bert(input_ids, attention_mask=attn_masks)[1] # 0 -> activation layer (3D), 1 -> pooled output layer (2D)
intermediate_layer = tf.keras.layers.Dense(512, activation='relu', name='intermediate_layer')(attn_masks)
output_layer = tf.keras.layers.Dense(2, activation='softmax', name='output_layer')(intermediate_layer) # softmax -> calcs probs of classes

sentiment_model = tf.keras.Model(inputs=[input_ids, attn_masks], outputs=output_layer)
sentiment_model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 attention_mask (InputLayer  [(None, 256)]                0         []                            
 )                                                                                                
                                                                                                  
 intermediate_layer (Dense)  (None, 512)                  131584    ['attention_mask[0][0]']      
                                                                                                  
 input_ids (InputLayer)      [(None, 256)]                0         []                            
                                                                                                  
 output_layer (Dense)        (None, 2)                    1026      ['intermediate_layer[0][0]

In [27]:
optim = tf.keras.optimizers.legacy.Adam(learning_rate=1e-5, decay=1e-6)
loss_func = tf.keras.losses.CategoricalCrossentropy()
acc = tf.keras.metrics.CategoricalAccuracy('accuracy')

In [28]:
sentiment_model.compile(optimizer=optim, loss=loss_func, metrics=[acc])

# Training The Model

In [29]:
hist = sentiment_model.fit(
    train_dataset,
    validation_data=val_dataset,
    epochs=15
)

Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


In [30]:
sentiment_model.save('sentiment_model')

INFO:tensorflow:Assets written to: sentiment_model\assets


INFO:tensorflow:Assets written to: sentiment_model\assets


# Ploting the Loss and Accuracy on the Epochs

### **Prediction**

In [31]:
sentiment_model = tf.keras.models.load_model('sentiment_model')

tokenizer = BertTokenizer.from_pretrained('bert-base-cased')

def prepare_data(input_text, tokenizer):
    token = tokenizer.encode_plus(
        input_text,
        max_length=256, 
        truncation=True, 
        padding='max_length', 
        add_special_tokens=True,
        return_tensors='tf'
    )
    return {
        'input_ids': tf.cast(token.input_ids, tf.float64),
        'attention_mask': tf.cast(token.attention_mask, tf.float64)
    }

def make_prediction(model, processed_data, classes=['Positive', 'Negative']):
    probs = model.predict(processed_data)[0]
    return classes[np.argmax(probs)]

In [32]:
input_text = input('Enter movie review here: ')
processed_data = prepare_data(input_text, tokenizer)

result = make_prediction(sentiment_model, processed_data=processed_data)
print(f"Predicted Sentiment: {result}")

Predicted Sentiment: Positive


# Save Responses

In [33]:
import pandas as pd

# Function to collect user input
def get_user_input():
    # user_id = input("Enter User ID: ")
    input_text = input("Enter user input: ")
    # privacy_issue = input("Enter privacy issues (if any): ")
    return input_text
# Prompt user for input
user_input_data = []
# while True:
input_text= get_user_input()
# Perform sentiment prediction here (you need to implement this)
processed_data = prepare_data(input_text, tokenizer)
result = make_prediction(sentiment_model, processed_data=processed_data)
# sentiment_prediction = predict_sentiment(user_input)
user_input_data.append({
    'InputText': input_text,
    'Sentiment': result
})

    # Check if the user wants to enter more data
    # more_data = input("Do you want to enter more data? (yes/no): ")
    # if more_data.lower() != 'yes':
    #     break
# Create DataFrame
# df = pd.DataFrame(user_input_data)
# Create DataFrame from new user input data
new_df = pd.DataFrame(user_input_data)

# Load existing data from Excel file (if exists)
try:
    existing_df = pd.read_excel('results.xlsx')
except FileNotFoundError:
    existing_df = pd.DataFrame()

# Concatenate new data with existing data
combined_df = pd.concat([existing_df, new_df], ignore_index=True)

# Save combined data to Excel file
combined_df.to_excel('results.xlsx', index=False)
# # Save DataFrame to Excel
# df.to_excel('results.xlsx', index=False)

print(result)


Positive


In [34]:
Privacy considerations are lacking, creating unease in my learning experience

SyntaxError: invalid syntax (1360632293.py, line 1)

In [None]:

# Create DataFrame from new user input data
new_df = pd.DataFrame(user_input_data)

# Load existing data from Excel file (if exists)
try:
    existing_df = pd.read_excel('sentiments.xlsx')
except FileNotFoundError:
    existing_df = pd.DataFrame()

# Concatenate new data with existing data
combined_df = pd.concat([existing_df, new_df], ignore_index=True)

# Save combined data to Excel file
combined_df.to_excel('sentiments.xlsx', index=False)

# Save responses

In [None]:
import pandas as pd


# Test the model with new data
user_inputs = [
    "I do not love using this product!",
    "I'm not satisfied with the customer service.",
    "This app is amazing and easy to use.",
    # Add more user inputs as needed
    ]

data = {"ID": [], "Sentiment": []}

for idx, input_text in enumerate(result, start=1):
    sentiment = predict_sentiment(input_text)
    data["ID"].append(idx)
    data["Sentiment"].append(sentiment)
    print(f"Input: {input_text}\nPredicted Sentiment: {sentiment}\n")

    # SAVE DATA AS CSV
df = pd.DataFrame(data)

# Save DataFrame to Excel
df.to_excel('predictions.xlsx', index=False)
print("Data appended to predictions.xlsx")


In [None]:

# # Test the model with new data
# user_inputs = [
#     "I do not love using this product!",
#     "I'm not satisfied with the customer service.",
#     "This app is amazing and easy to use.",
#     # Add more user inputs as needed
#     ]

# data = {"ID": [], "Sentiment": []}

# for idx, input_text in enumerate(user_inputs, start=1):
#     sentiment = predict_sentiment(input_text)
#     data["ID"].append(idx)
#     data["Sentiment"].append(sentiment)
#     print(f"Input: {input_text}\nPredicted Sentiment: {sentiment}\n")

#     # SAVE DATA AS CSV
# df = pd.DataFrame(data)

# # Save DataFrame to Excel
# df.to_excel('predictions.xlsx', index=False)
# print("Data appended to predictions.xlsx")
