In [1]:
import pandas as pd
import numpy as np

In [2]:
file_path = '/Users/diana/Desktop/isear-train.xlsx'
custom_headers = ['Emotions', 'Text']
df = pd.read_excel(file_path, skiprows=1, header=None, names=custom_headers)

In [3]:
df.head ()

Unnamed: 0,Emotions,Text
0,sadness,Losing my girlfriend who made an end to our re...
1,disgust,[ No response.]
2,fear,Staying alone in a dark place.
3,shame,When I failed grade 7.
4,anger,I am a teacher in arts and crafts (boys from 1...


In [4]:
total_count = df['Emotions'].value_counts().sum()
#print (total_count)

In [5]:
df['Emotions'].value_counts()


Emotions
sadness    775
fear       770
guilt      767
shame      765
anger      764
joy        764
disgust    761
Name: count, dtype: int64

In [6]:
print (df['Emotions'])
print (df['Emotions'].shape)
print (type (df['Emotions']))
print (df['Emotions'].values)

0       sadness
1       disgust
2          fear
3         shame
4         anger
         ...   
5361       fear
5362      anger
5363    sadness
5364    disgust
5365       fear
Name: Emotions, Length: 5366, dtype: object
(5366,)
<class 'pandas.core.series.Series'>
['sadness' 'disgust' 'fear' ... 'sadness' 'disgust' 'fear']


In [7]:
emotion_labels = list (set (df['Emotions'].values))

In [8]:
from sklearn.preprocessing import LabelEncoder

y = df['Emotions'].values


label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y)

print("Encoded labels:", y_train_encoded)

Encoded labels: [5 1 2 ... 5 1 2]


In [9]:
def tokenize(text):
    translation_table = str.maketrans({c: f' {c} ' if not c.isalnum() else c for c in set(text)})  # creates translation
    # table(dictionary) with the built-in function maketrans, set(text) makes an unordered collection of unique
    # elements through set comprehension, a concise way to create sets
    tokenized_text = text.translate(translation_table)  # uses translation table to add whitespace around special
    # characters and punctuation
    return tokenized_text.strip().lower().split()  # split-tokens split on space, lower-tokens made all lowercase,
    # strip-and leading or trailing whitespaces are removed from string

import string 

#extract dict
text = ''.join(df['Text'].astype(str))
#print(vocab)

tokenized_text = []
for index, sentence in enumerate(df['Text']):
    emotion_label = df['Emotions'][index]
    sentence = tokenize(sentence)
    tokenized_text.append (sentence)
    #print(emotion_label, sentence)
#print (tokenized_text)


filtered_tokens = []

for tokens in tokenized_text:
    # Initialize a list to store tokens without punctuation
    clean_tokens = []
    
    # Iterate through each token in the list
    for token in tokens:
        # Check if the token is punctuation
        if all(char in string.punctuation for char in token):
            continue  # Skip punctuation tokens
        if token.isdigit():
            continue  # Skip punctuation tokens
        else:
            clean_tokens.append(token)  # Append non-punctuation tokens to clean_tokens list
    
    # Append the clean_tokens list to filtered_tokens, preserving sentence structure
    filtered_tokens.append(clean_tokens)

# Print the filtered tokens
#print(filtered_tokens)
tokenized_text = filtered_tokens

vocab = []
for sentence in tokenized_text:
    for token in sentence:
        vocab.append (token)
vocab = set (vocab)
#print (len(vocab))


In [11]:
from math import log

collection = [token for instance in tokenized_text for token in instance]

#tf-idf

def calculate_tfidf(token, document):
    #tf = np.log(document.count(token) / len(document)) if token in document else 0 # 1+log(#oftimestokenindoc/total#termsindoc)
    tf = document.count(token) / len(document)
    idf = log(1 + (len(tokenized_text) / (collection.count(token) + 1)))
    tfidf = tf * idf
    return tfidf

tfidf_scores_list = []
for sentence in tokenized_text:
    tfidf_scores = {token: calculate_tfidf(token, sentence) for token in set(sentence)}
    tfidf_scores_list.append(tfidf_scores)


In [12]:
from scipy.sparse import csr_matrix

def CSR(x_emotion):
    # Construct a vocabulary list from the tfidf_scores_list
    vocab = list(set(token for scores in x_emotion for token in scores.keys()))

    # Construct a mapping from term to column index
    term_to_col = {term: idx for idx, term in enumerate(vocab)}
    print (len(term_to_col))
    # Convert the list of TF-IDF scores into a CSR sparse matrix
    n_docs = len(x_emotion)
    n_terms = len(vocab)
    rows = []
    cols = []
    data = []
    for doc_idx, scores in enumerate(x_emotion):
        for term, tfidf in scores.items():
            col_idx = term_to_col[term]
            rows.append(doc_idx)
            cols.append(col_idx)
            data.append(tfidf)

    X_tfidf_sparse = csr_matrix((data, (rows, cols)), shape=(n_docs, n_terms))
    return X_tfidf_sparse

X_tfidf_sparse = CSR (tfidf_scores_list)

#print(X_tfidf_sparse[2])


7420


In [13]:
class LogisticRegressionMulticlass:
    def __init__(self, learning_rate=0.3, num_iterations=10000):
        self.learning_rate = learning_rate
        self.num_iterations = num_iterations
        self.weights = None
        self.bias = None
    
    
    def softmax(self, z):
        exp_z = np.exp(z - np.max(z, axis=1, keepdims=True))
        return exp_z / np.sum(exp_z, axis=1, keepdims=True)
    
    def xavier_init(self, shape):
        fan_in = shape[0]
        fan_out = shape[1]
        limit = np.sqrt(6 / (fan_in + fan_out))
        return np.random.uniform(-limit, limit, size=shape)
    
    def fit(self, X, y):
        num_samples, num_features = X.shape
        num_classes = len(np.unique(y))
        self.weights = self.xavier_init((num_features, num_classes))
        self.bias = np.zeros((1, num_classes))
        y_one_hot = np.eye(num_classes)[y]
        
        for _ in range(self.num_iterations):
            linear_model = X.dot(self.weights) + self.bias
            y_pred = self.softmax(linear_model)
            
            dw = (1 / num_samples) * X.T.dot(y_pred - y_one_hot)
            db = (1 / num_samples) * np.sum(y_pred - y_one_hot, axis=0, keepdims=True)
            
            self.weights -= self.learning_rate * dw
            self.bias -= self.learning_rate * db
    
            
    def predict(self, X):

        num_samples, num_features = X.shape
        _, num_classes = self.weights.shape
    
        if num_features != self.weights.shape[0]:
            self.weights = self.xavier_init((num_features, num_classes))
            
        linear_predictions = X.dot(self.weights) + self.bias
        y_pred = self.softmax(linear_predictions)
        print (self.weights.shape)
        class_pred = np.argmax(y_pred, axis=1)  # Choose the class with the highest probability
        return class_pred


In [14]:
model = LogisticRegressionMulticlass()
model.fit(X_tfidf_sparse, y_train_encoded)

In [18]:
pred = model.predict (X_tfidf_sparse)
#print (pred[0:300])
#print (y_train_encoded[0:300])

from sklearn.metrics import f1_score

micro_average_f1 = f1_score(pred, y_train_encoded, average='micro')
print("Micro-average F1-score:", micro_average_f1)

f1_external = f1_score(pred, y_train_encoded, average='weighted')
print("F1-score on training data:", f1_external)


(7420, 7)
Micro-average F1-score: 0.7590383898620947
F1-score on training data: 0.7594288487736144


In [16]:
#Perfomance on the test set

from sklearn.metrics import f1_score
from scipy.sparse import csr_matrix
from math import log

custom_headers_t = ['Emotion', 'Texts']
file = '/Users/diana/Desktop/isear-test.xlsx'
d = pd.read_excel(file, skiprows=1, header=None, names=custom_headers_t)

# Tokenize the test data

tokenized_text_test = []
for index, sentence in enumerate(d['Texts']):
    emotion_label = d['Emotion'][index]
    sentence = tokenize(sentence)
    tokenized_text_test.append (sentence)
   
filtered_tokens_test = []

import string 
for tokens in tokenized_text_test:
    # Initialize a list to store tokens without punctuation
    clean_tokens = []
    
    # Iterate through each token in the list
    for token in tokens:
        # Check if the token is punctuation
        if all(char in string.punctuation for char in token):
            continue  # Skip punctuation tokens
        if token.isdigit():
            continue  # Skip punctuation tokens
        else:
            clean_tokens.append(token)  # Append non-punctuation tokens to clean_tokens list
    
    # Append the clean_tokens list to filtered_tokens, preserving sentence structure
    filtered_tokens_test.append(clean_tokens)

# Print the filtered tokens
#print(filtered_tokens_test)

tokenized_text_test=filtered_tokens_test

collection_test  = [token for instance in tokenized_text_test for token in instance]

def calculate_tfidf(token, document):
    #tf = np.log(document.count(token) / len(document)) if token in document else 0 # 1+log(#oftimestokenindoc/total#termsindoc)
    tf = document.count(token) / len(document) 
    idf = log(1 + (len(tokenized_text) / (collection.count(token) + 1)))
    tfidf = tf * idf
    return tfidf

# Construct a list of TF-IDF scores for the test data
tfidf_scores_list_test = []
for sentence in tokenized_text_test:
    tfidf_scores_test = {token: calculate_tfidf(token, sentence) for token in set(sentence)}
    tfidf_scores_list_test.append(tfidf_scores_test)
#print (tfidf_scores_list_test )


# Construct a mapping from term to column index
term_to_col_test = {term: idx for idx, term in enumerate(vocab)}

# Convert the list of TF-IDF scores into a CSR sparse matrix using the same vocabulary and mapping from training
n_terms = len(vocab)
n_docs_test = len(tfidf_scores_list_test)
rows_test = []
cols_test = []
data_test = []
for doc_idx, scores_t in enumerate(tfidf_scores_list_test):
    for term, tfidf in scores_t.items():
        if term in term_to_col_test:  # Check if term is in the vocabulary
            col_idx = term_to_col_test[term]
            rows_test.append(doc_idx)
            cols_test.append(col_idx)
            data_test.append(tfidf)


X_tfidf_sparse_test = csr_matrix((data_test, (rows_test, cols_test)), shape=(n_docs_test, n_terms))
#print("Shape of X_tfidf_sparse_test:", X_tfidf_sparse_test)


In [19]:
# Predict labels using the trained logistic regression model

y_external_pred = model.predict(X_tfidf_sparse_test)


# Load true labels from the external test file
y_external_true = d['Emotion'].values
label_encoder = LabelEncoder()
y_external_true = label_encoder.fit_transform(y_external_true)
y_external_true_enc = np.array(y_external_true)


f1_external = f1_score(y_external_true, y_external_pred, average='weighted')
print("F1-score on external test data:", f1_external)
micro_average_f1 = f1_score(y_external_true, y_external_pred, average='micro')
print("Micro-average F1-score:", micro_average_f1)

#print (y_external_true[0:300],y_external_pred[0:300])

(7420, 7)
F1-score on external test data: 0.5802989747809041
Micro-average F1-score: 0.5826086956521739
