In [1]:
from collections import Counter

import numpy as np
import pandas as pd

In [2]:
# Read the training spreadsheet. The file's first row is skipped and the
# columns are given explicit names instead of being taken from the sheet.
custom_headers = ['Emotions', 'Text']
file_path = '/Users/diana/Desktop/isear-train.xlsx'
df = pd.read_excel(
    file_path,
    names=custom_headers,
    header=None,
    skiprows=1,
)

In [3]:
# Preview the first five rows of the training frame.
df.head(n=5)

Unnamed: 0,Emotions,Text
0,sadness,Losing my girlfriend who made an end to our re...
1,disgust,[ No response.]
2,fear,Staying alone in a dark place.
3,shame,When I failed grade 7.
4,anger,I am a teacher in arts and crafts (boys from 1...


In [4]:
# Count the rows per emotion once, print the overall total, then display the
# per-emotion breakdown as the cell result.
emotion_counts = df['Emotions'].value_counts()
total_count = emotion_counts.sum()
print (total_count)
emotion_counts


5366


Emotions
sadness    775
fear       770
guilt      767
shame      765
anger      764
joy        764
disgust    761
Name: count, dtype: int64

In [5]:
# Encode the training labels: each emotion name is mapped to a fixed integer id.

label_encoding = {
    'anger': 0,
    'disgust': 1,
    'fear': 2,
    'guilt': 3,
    'joy': 4,
    'sadness': 5,
    'shame': 6,
}
y = df['Emotions'].values
# A label that is missing from `label_encoding` raises KeyError here.
y_train_encoded = np.array(list(map(label_encoding.__getitem__, y)))
print("Encoded labels:", y_train_encoded)


Encoded labels: [5 1 2 ... 5 1 2]


In [6]:
#Tokenizing, cleaning the data

def tokenize(text):
    """Split `text` into lowercase tokens.

    Every character for which ``str.isalnum()`` is false (punctuation,
    symbols, whitespace) is surrounded by spaces, so it becomes a token of its
    own once the string is split on whitespace. Runs of alphanumeric
    characters stay together.

    Parameters:
        text: the string to tokenize.

    Returns:
        A list of lowercase tokens; an empty list for an empty string.
    """
    # Pad each non-alphanumeric character with a space on both sides.
    padded = ''.join(ch if ch.isalnum() else f' {ch} ' for ch in text)
    # Trim the ends, lowercase, then split on any run of whitespace.
    return padded.strip().lower().split()

import string 

# NOTE(review): `text` is not read again in the cells shown here; it is kept
# in case another cell relies on it.
text = ''.join(df['Text'].astype(str))

# Tokenize every training sentence and drop the tokens that consist solely of
# ASCII punctuation or solely of digits. The per-row emotion lookup that used
# to sit in this loop was never used and has been removed.
filtered_tokens = [
    [
        token
        for token in tokenize(sentence)
        if not all(char in string.punctuation for char in token)
        and not token.isdigit()
    ]
    for sentence in df['Text']
]

tokenized_text = filtered_tokens

# Training vocabulary: the distinct tokens that survived the filtering.
vocab = {token for sentence_tokens in tokenized_text for token in sentence_tokens}


In [7]:
#Creating tf-idf scores 
from math import log

collection = [token for instance in tokenized_text for token in instance]

# Number of occurrences of every token across the whole training collection.
# Built once so that each idf lookup is a dictionary access instead of a full
# scan of `collection` (which `collection.count(token)` performed per call).
collection_counts = Counter(collection)

#tf-idf

def calculate_tfidf(token, document):
    """Return the tf-idf weight of `token` within `document`.

    tf  = occurrences of `token` in `document` / number of tokens in `document`
    idf = log(1 + N / (cf + 1)), where N is ``len(tokenized_text)`` and cf is
          the number of occurrences of `token` in `collection`.

    Parameters:
        token: the term to score.
        document: list of tokens making up one document.

    Returns:
        The tf-idf weight as a float; 0.0 when `document` is empty.

    NOTE(review): cf counts every occurrence of the token in the collection,
    not the number of documents containing it. Left unchanged so the produced
    scores stay the same.
    """
    # An empty document has no term frequency; avoid dividing by zero.
    if not document:
        return 0.0
    tf = document.count(token) / len(document)
    # Counter returns 0 for tokens never seen in the training collection.
    idf = log(1 + (len(tokenized_text) / (collection_counts[token] + 1)))
    return tf * idf

# One {token: tf-idf} dictionary per training document, covering each distinct
# token of that document.
tfidf_scores_list = []
for doc_tokens in tokenized_text:
    distinct_tokens = set(doc_tokens)
    tfidf_scores_list.append(
        {tok: calculate_tfidf(tok, doc_tokens) for tok in distinct_tokens}
    )

In [None]:
#Creating a matrix mapped to the training vocabulary 

from scipy.sparse import csr_matrix

def CSR(x_emotion, vocab=None):
    """Convert per-document TF-IDF dictionaries into a CSR sparse matrix.

    Parameters:
        x_emotion: list of ``{term: tfidf}`` dictionaries, one per document.
        vocab: optional iterable of terms that defines the columns. When it is
            omitted, the vocabulary is collected from the keys of `x_emotion`.
            When it is given, terms of `x_emotion` that are not part of it are
            skipped, so documents can be projected onto a fixed vocabulary.

    Returns:
        A ``scipy.sparse.csr_matrix`` of shape (number of documents, number of
        vocabulary terms). Row i holds the scores of ``x_emotion[i]``; columns
        follow the sorted order of the vocabulary.
    """
    if vocab is None:
        terms = {term for scores in x_emotion for term in scores}
    else:
        terms = set(vocab)

    # Sorting makes the column assignment deterministic.
    term_to_col = {term: idx for idx, term in enumerate(sorted(terms))}

    rows = []
    cols = []
    data = []
    for doc_idx, scores in enumerate(x_emotion):
        for term, tfidf in scores.items():
            col_idx = term_to_col.get(term)
            if col_idx is None:
                # Only reachable with an explicit `vocab`: term is out of vocabulary.
                continue
            rows.append(doc_idx)
            cols.append(col_idx)
            data.append(tfidf)

    X_tfidf_sparse = csr_matrix(
        (data, (rows, cols)), shape=(len(x_emotion), len(term_to_col))
    )
    return X_tfidf_sparse


In [None]:
# Build the training TF-IDF matrix. Nothing else in the visible notebook
# assigns `X_tfidf_sparse`, yet it is referenced below and passed to %store,
# so without this line a fresh-kernel run fails with a NameError.
X_tfidf_sparse = CSR(tfidf_scores_list)

# The Excel export below is disabled: it is a bare string literal, not executed code.
"""data = {'Emotions': y, 'Text': X_tfidf_sparse}
df = pd.DataFrame(data)

# Save DataFrame to Excel
df.to_excel('tfidf_representations_training.xlsx', index=False)"""

"data = {'Emotions': y, 'Text': X_tfidf_sparse}\ndf = pd.DataFrame(data)\n\n# Save DataFrame to Excel\ndf.to_excel('tfidf_representations_training.xlsx', index=False)"

In [None]:
#Processing the testset

from scipy.sparse import csr_matrix
from math import log
import string

custom_headers_t = ['Emotion', 'Texts']
file = '/Users/diana/Desktop/isear-test.xlsx'
d = pd.read_excel(file, skiprows=1, header=None, names=custom_headers_t)

#Encoding testset label
y_test = d['Emotion']

# A label that is missing from `label_encoding` raises KeyError here.
y_test_encoded = np.array([label_encoding[label] for label in y_test])

# Tokenize the test sentences and drop the tokens that consist solely of ASCII
# punctuation or solely of digits (same filtering as for the training data).
# The per-row emotion lookup that used to sit in this loop was never used and
# has been removed.
filtered_tokens_test = [
    [
        token
        for token in tokenize(sentence)
        if not all(char in string.punctuation for char in token)
        and not token.isdigit()
    ]
    for sentence in d['Texts']
]

tokenized_text_test = filtered_tokens_test

# One {token: tf-idf} dictionary per test document. `calculate_tfidf` reads the
# training-side `tokenized_text` and `collection`, so the idf part is based on
# the training data.
tfidf_scores_list_test = [
    {token: calculate_tfidf(token, sentence) for token in set(sentence)}
    for sentence in tokenized_text_test
]

# Construct a mapping from term to the sorted training vocabulary
term_to_col_test = {term: idx for idx, term in enumerate(sorted(vocab))}

# Convert the list of TF-IDF scores into a CSR sparse matrix using the same vocabulary and mapping from training
n_terms = len(vocab)
n_docs_test = len(tfidf_scores_list_test)
rows_test = []
cols_test = []
data_test = []
for doc_idx, scores_t in enumerate(tfidf_scores_list_test):
    for term, tfidf in scores_t.items():
        col_idx = term_to_col_test.get(term)
        if col_idx is None:
            # Term does not occur in the training vocabulary: no column for it.
            continue
        rows_test.append(doc_idx)
        cols_test.append(col_idx)
        data_test.append(tfidf)

X_tfidf_sparse_test = csr_matrix((data_test, (rows_test, cols_test)), shape=(n_docs_test, n_terms))



In [None]:
# Persist the prepared objects with IPython's %store magic so that they can be
# restored elsewhere with `%store -r`.
# NOTE(review): the training labels are stored integer-encoded
# (`y_train_encoded`) while the test labels are stored in raw form (`y_test`);
# `y_test_encoded` is computed earlier but not stored — confirm this is intended.
%store X_tfidf_sparse
%store X_tfidf_sparse_test
%store y_train_encoded
%store vocab
%store collection
%store tokenized_text
%store y_test

Stored 'X_tfidf_sparse' (csr_matrix)
Stored 'X_tfidf_sparse_test' (csr_matrix)
Stored 'y_train_encoded' (ndarray)
Stored 'vocab' (set)
Stored 'collection' (list)
Stored 'tokenized_text' (list)
Stored 'y_test' (Series)
