*Huggingface*

In [19]:
#Get the models
from transformers import AutoModelForSequenceClassification
from transformers import TFAutoModelForSequenceClassification
from transformers import AutoTokenizer, AutoConfig
import numpy as np
import pandas as pd
from scipy.special import softmax
MODEL = f"cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(MODEL)
config = AutoConfig.from_pretrained(MODEL)
model = AutoModelForSequenceClassification.from_pretrained(MODEL)


input_data = pd.read_csv('test.csv', encoding='cp1252')
input_data = input_data.dropna()

def evaluate_text(text):
    encoded_input = tokenizer(text, return_tensors='pt')
    encoded_input = tokenizer(text, return_tensors='pt')
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    ranking = np.argsort(scores)
    ranking = ranking[::-1]
    label = ''
    score = 0
    for i in range(scores.shape[0]):
        s = scores[ranking[i]]
        if s > score:
            score = s
            label = config.id2label[ranking[i]]
        #print(f"{label} {np.round(float(score), 4)}")
    return label

def evaluate_text_cScore(text):
    encoded_input = tokenizer(text, return_tensors='pt')
    encoded_input = tokenizer(text, return_tensors='pt')
    output = model(**encoded_input)
    scores = output[0][0].detach().numpy()
    scores = softmax(scores)
    ranking = np.argsort(scores)
    ranking = ranking[::-1]
    label = ''
    score = 0
    for i in range(scores.shape[0]):
        s = scores[ranking[i]]
        if s > score:
            score = s
        #print(f"{label} {np.round(float(score), 4)}")
    return score



input_data['model_output'] = input_data['text'].apply(evaluate_text)
input_data['confidence_score'] = input_data['text'].apply(evaluate_text_cScore)

columns_to_keep = ['text', 'expected_sentiment', 'model_output', 'confidence_score']
input_data = input_data[columns_to_keep]

# Save the output to a CSV file
input_data.to_csv('output_sentiment_test.csv', index=False)




In [24]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(input_data['expected_sentiment'], input_data['model_output'])
print(f'Accuracy: {accuracy * 100:.2f}%')

Accuracy: 71.00%


*Tensorflow*

In [1]:
#Import Packages
import pandas as pd
import numpy as np
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, Embedding, concatenate
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.metrics import classification_report
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
#Load up data
train_data = pd.read_csv('train.csv',encoding='cp1252');
test_data = pd.read_csv('test.csv',encoding='cp1252');

In [None]:
#fields that contain na
train_data.isna().any()

In [None]:
test_data.isna().any()

In [5]:
train_data = train_data.dropna()

In [6]:
test_data = test_data.dropna()

In [7]:
tokenizer = Tokenizer(num_words=10000)
tokenizer.fit_on_texts(train_data['text'])
text_sequences = tokenizer.texts_to_sequences(train_data['text'])
text_data = pad_sequences(text_sequences, maxlen=50)

# Prepare target
target = pd.get_dummies(train_data['sentiment']).values

In [None]:
# Split the data
X_train_text, X_dev_text, y_train, y_dev = train_test_split(
    text_data, target, test_size=0.2, random_state=42
)

# Model
text_input = Input(shape=(50,))
text_embed = Embedding(input_dim=10000, output_dim=128)(text_input)
text_out = LSTM(64)(text_embed)


merged = concatenate([text_out])

dense = Dense(64, activation='relu')(merged)
output = Dense(3, activation='softmax')(dense)

model = Model(inputs=[text_input], outputs=output)
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Training
model.fit([X_train_text], y_train, epochs=20, batch_size=32)

In [None]:
model.evaluate([X_dev_text], y_dev)

In [43]:
# Function to preprocess and predict sentiment
def preprocess_and_predict(df, model, tokenizer):
    predictions = []
    indices = []

    for idx, row in df.iterrows():
        # Check for nulls in the columns needed for each input type
        if pd.isnull(row['text']) or pd.isna(row['text']):
            continue  # Skip this row if text data is missing

        # Prepare text data
        seq = tokenizer.texts_to_sequences([row['text']])
        text_data = pad_sequences(seq, maxlen=50)

        # Predict sentiment
        pred = model.predict([text_data],verbose=0)
        predicted_class = np.argmax(pred, axis=1)
        label_map = {0: 'negative', 1: 'neutral', 2: 'positive'}
        predicted_class = [label_map[i] for i in predicted_class]
        predicted_class = ' '.join(predicted_class)
        predictions.append(predicted_class)
        indices.append(idx)

    return predictions, indices

In [None]:
# Clean and preprocess test data
df_test = test_data

# Handle missing values and predict
predicted_labels, valid_indices = preprocess_and_predict(df_test, model, tokenizer)
# Extract actual labels for valid indices
actual_labels = df_test.loc[valid_indices, 'sentiment']
actual_labels = actual_labels.tolist()


In [None]:
from sklearn.metrics import accuracy_score
# Calculate accuracy
accuracy = accuracy_score(actual_labels, predicted_labels)
print(f'Accuracy: {accuracy}')

Pysentimiento BERTweet

In [12]:

from pysentimiento import create_analyzer
import torch
import gc
import pandas as pd
import numpy as np

BATCH_SIZE = 8
torch.cuda.empty_cache()

input_data = pd.read_csv('test.csv', encoding='cp1252')
input_data = input_data.dropna()
EXAMPLES = input_data['text'].astype(str).tolist()

def map_labels(a_prob):
    n_classes = len(a_prob)
    if n_classes == 2:
        class_to_label = {0: 'negative', 1: 'positive'}
    elif n_classes == 3:
        class_to_label = {0: 'negative', 1: 'neutral', 2: 'positive'}
    elif n_classes == 5:
        class_to_label = {0: 'negative', 1: 'negative', 2: 'neutral',
                          3: 'positive', 4: 'positive'}

    class_idx = np.argmax(a_prob)
    label = class_to_label[class_idx]
    return label

def load_pysentimiento():
    pipe = create_analyzer('sentiment', lang='en', batch_size=BATCH_SIZE)
    return pipe

def pysentimiento_output(pipe, texts):
    output = [x.probas for x in pipe.predict(texts)]
    return output

def pysentimiento_score(output):
    output_sorted = [sorted(x.items()) for x in output]
    #print(output_sorted)
    scores = []
    for x in output_sorted:
        scores.append([y[1] for y in x])
    
    labels = [map_labels(score) for score in scores]
    scores = [float(max(score)) for score in scores]
    return scores, labels

def pysentimiento_demo(digits):
    pipe = load_pysentimiento()
    output = pysentimiento_output(pipe, EXAMPLES)
    score, label = pysentimiento_score(output)
    print('')
    del pipe
    gc.collect()
    torch.cuda.empty_cache()
    return score, label

Critical, Label = pysentimiento_demo(4)
#print(Critical)
#print(Label)

from sklearn.metrics import accuracy_score
# Calculate accuracy
accuracy = accuracy_score(input_data['sentiment'], Label)
print(f'Accuracy: {accuracy}')


Map: 100%|██████████| 3534/3534 [00:00<00:00, 7825.00 examples/s]



Accuracy: 0.7059988681380871


tweetnlp


In [17]:
import tweetnlp
def load_tweetnlp(model_name):
    if model_name == 'roberta':
        pipe = tweetnlp.load_model('sentiment')
    elif model_name == 'robertaold':
        pipe = tweetnlp.load_model('sentiment', model_name='cardiffnlp/twitter-roberta-base-sentiment')
    elif model_name == 'robertaxlm':
        pipe = tweetnlp.load_model('sentiment', multilingual=True)

    return pipe

def tweetnlp_output(pipe, texts):
    output = pipe.sentiment(texts, return_probability=True, batch_size=BATCH_SIZE)
    return output

def tweetnlp_polarities(output):
    scores = [list(x['probability'].values()) for x in output]
    print(scores)
    labels = [map_labels(score) for score in scores]
    scores = [float(max(score)) for score in scores]
    return scores, labels

def tweetnlp_demo(model_name, digits):
    pipe = load_tweetnlp(model_name)
    output = tweetnlp_output(pipe, EXAMPLES)
    scores, labels = tweetnlp_polarities(output)
    del pipe
    gc.collect()
    torch.cuda.empty_cache()
    return scores, labels

Critical, Label = tweetnlp_demo('roberta', 5)

from sklearn.metrics import accuracy_score
# Calculate accuracy
accuracy = accuracy_score(input_data['sentiment'], Label)
print(f'Accuracy: {accuracy}')



[[0.007754302117973566, 0.89007568359375, 0.10217003524303436], [0.0019326336914673448, 0.01103623304516077, 0.9870311617851257], [0.9312363862991333, 0.06392994523048401, 0.004833678714931011], [0.00592415127903223, 0.014427561312913895, 0.9796482920646667], [0.003233869094401598, 0.015399637632071972, 0.9813665151596069], [0.003025213722139597, 0.008943077176809311, 0.9880316257476807], [0.9366518259048462, 0.054643601179122925, 0.008704603649675846], [0.8655992746353149, 0.12042974680662155, 0.013970975764095783], [0.09649248421192169, 0.7238731384277344, 0.1796344369649887], [0.007639620918780565, 0.19800175726413727, 0.7943586111068726], [0.9377643465995789, 0.056461866945028305, 0.005773738492280245], [0.4602898955345154, 0.5040782690048218, 0.035631824284791946], [0.7728260159492493, 0.2023460566997528, 0.024827979505062103], [0.8316537141799927, 0.14943410456180573, 0.01891222409904003], [0.86375492811203, 0.1230580061674118, 0.013187084347009659], [0.958426296710968, 0.0352782

HuggingFace DistilBERT

In [21]:
import transformers
from sklearn.metrics import accuracy_score

def load_huggingface(model_name):
    if model_name == 'distilbert':
        model = 'distilbert-base-uncased-finetuned-sst-2-english'
    elif model_name == 'bert':
        model = 'nlptown/bert-base-multilingual-uncased-sentiment'
    elif model_name == 'gpt2':
        model = 'michelecafagna26/gpt2-medium-finetuned-sst2-sentiment'

    if torch.cuda.is_available():
        device = 0
    else:
        device = 'cpu'

    pipe = transformers.pipeline(task='text-classification', model=model, device=device,
                                 batch_size=BATCH_SIZE, truncation=True, top_k=None)
    return pipe

In [25]:
def huggingface_output(pipe, texts):
    output = pipe(texts)
    return output

def huggingface_polarities(output):
    label_score_list = []
    for x in output:
        label_score = {}
        for y in x:
            label_score.update({y['label']: y['score']})
        label_score_sorted = sorted(label_score.items())
        label_score_list.append(label_score_sorted)

    scores = []
    for x in label_score_list:
        scores.append([y[1] for y in x])
    labels = [map_labels(score) for score in scores]
    scores = [float(max(score)) for score in scores]
    return scores, labels

def huggingface_demo(model_name, digits):
    pipe = load_huggingface(model_name)
    output = huggingface_output(pipe, EXAMPLES)
    scores, labels = huggingface_polarities(output)
    print('')
    del pipe
    gc.collect()
    torch.cuda.empty_cache()
    return scores, labels



Critical, Label = huggingface_demo('distilbert', 6)
accuracy = accuracy_score(input_data['expected_sentiment'], Label)
print(f'Accuracy: {accuracy}')
Critical, Label = huggingface_demo('bert', 5)
accuracy = accuracy_score(input_data['expected_sentiment'], Label)
print(f'Accuracy: {accuracy}')
Critical, Label = huggingface_demo('gpt2', 5)
accuracy = accuracy_score(input_data['expected_sentiment'], Label)
print(f'Accuracy: {accuracy}')




Accuracy: 0.5031126202603282


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development



Accuracy: 0.5325410299943407


To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development



Accuracy: 0.5065082059988681
