In [1]:
from pathlib import Path

import numpy as np
import pandas as pd

In [2]:
# Locate the workbook relative to the current user's home directory instead of
# hardcoding one machine's absolute path, so the notebook runs for other users
# who keep the file in the same place. Change DATA_DIR if the data lives elsewhere.
DATA_DIR = Path.home() / 'Desktop'
file_path = DATA_DIR / 'isear-test.xlsx'

# The sheet is read without its own header (skiprows=1 drops the first row,
# header=None stops pandas from promoting a data row to column names) and the
# columns are labelled explicitly.
custom_headers = ['Emotions', 'Text']
df = pd.read_excel(file_path, skiprows=1, header=None, names=custom_headers)

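Prerequisite: `pd.read_excel` needs the `openpyxl` package to read `.xlsx` files. If it is missing, install it first with `conda install openpyxl`.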

In [3]:
# Preview the first five rows to confirm the columns were loaded as expected.
df.head(n=5)

Unnamed: 0,Emotions,Text
0,joy,My first feeling of falling in love. Coming a...
1,joy,When I saw that the mark I had obtained in an ...
2,joy,When I learnt that I had been admitted to the ...
3,anger,In a hurry to get to lectures and the car woul...
4,guilt,"Not attending school, when am not sick - but ..."


In [4]:
# Total number of labelled rows: the per-label frequencies added together
# (value_counts ignores missing labels by default).
label_frequencies = df['Emotions'].value_counts()
total_count = label_frequencies.sum()
print(total_count)

1150


In [5]:
# Class balance: how many texts carry each emotion label, most frequent first.
emotion_counts = df['Emotions'].value_counts()
emotion_counts


Emotions
shame      182
fear       168
anger      166
disgust    166
sadness    161
guilt      155
joy        152
Name: count, dtype: int64

In [6]:
# Inspect the label column three ways: as a Series, its shape, and the raw array.
emotions = df['Emotions']
for view in (emotions, emotions.shape, emotions.values):
    print(view)

0         joy
1         joy
2         joy
3       anger
4       guilt
        ...  
1145    anger
1146    anger
1147    shame
1148      joy
1149      joy
Name: Emotions, Length: 1150, dtype: object
(1150,)
['joy' 'joy' 'joy' ... 'shame' 'joy' 'joy']


In [7]:
# Distinct emotion labels in a stable, alphabetical order.
# A plain set of strings has no guaranteed order between kernel restarts, so any
# label -> index mapping built from it would not be reproducible; sorting fixes
# that. Missing labels are dropped so NaN never becomes a class.
emotion_labels = sorted(df['Emotions'].dropna().unique())

In [13]:


def tokenize(text):
    """Split ``text`` into lowercase tokens, keeping punctuation as separate tokens.

    Every non-alphanumeric character (punctuation, symbols, whitespace) is padded
    with a space on both sides, then the string is lowercased and split on
    whitespace. Alphanumeric runs therefore stay together while each special
    character becomes its own token.

    Parameters
    ----------
    text : str
        The raw string to tokenize.

    Returns
    -------
    list of str
        The tokens in order of appearance; an empty list for empty or
        whitespace-only input.
    """
    # Map each distinct character either to itself (letters/digits) or to the
    # same character surrounded by spaces (everything else).
    padding = {}
    for char in set(text):
        padding[char] = char if char.isalnum() else f' {char} '

    spaced = text.translate(str.maketrans(padding))

    # strip() removes the outer padding, lower() normalises case, and split()
    # collapses any run of whitespace into token boundaries.
    return spaced.strip().lower().split()


# Build the tokenized corpus and its vocabulary.

# Cast to str once so that empty cells (NaN) are handled the same way for the
# vocabulary and for the per-document token lists instead of crashing tokenize().
documents = df['Text'].astype(str)

# All texts as one string. They are joined with a space: joining with '' would
# glue the last word of one text to the first word of the next and create
# tokens that occur in no document.
text = ' '.join(documents)

# One token list per document, in the same order as the rows of df.
tokenized_text = [tokenize(sentence) for sentence in documents]

# The vocabulary is every distinct token that occurs in at least one document.
vocab = {token for sentence in tokenized_text for token in sentence}


In [14]:
from math import log

# Flatten the per-document token lists into one long list holding every token
# occurrence in the corpus (repetitions included).
collection = []
for instance in tokenized_text:
    collection.extend(instance)

#tf-idf

def calculate_tfidf(token, document, corpus=None):
    """Return the TF-IDF weight of ``token`` within ``document``.

    tf  = occurrences of the token in the document / number of tokens in the document
    idf = log(1 + number of documents / occurrences of the token in the whole collection)

    The term frequency is the plain relative frequency. Taking the logarithm of
    that ratio (a value in (0, 1]) would make every weight zero or negative and
    give a token that fills the whole document the same weight as an absent one.

    Parameters
    ----------
    token : str
        The token to score.
    document : list of str
        The tokenized document the token is scored in.
    corpus : list of list of str, optional
        Tokenized documents used for the idf statistics. When omitted, the
        module-level ``tokenized_text`` and its flattened ``collection`` are used.

    Returns
    -------
    float
        ``tf * idf``; ``0.0`` when the token does not occur in ``document``
        (which includes an empty document).
    """
    occurrences = document.count(token)
    if occurrences == 0:
        # Also covers the empty document, so the division below is always safe.
        return 0.0
    tf = occurrences / len(document)

    if corpus is None:
        n_documents = len(tokenized_text)
        collection_count = collection.count(token)
    else:
        n_documents = len(corpus)
        collection_count = sum(doc.count(token) for doc in corpus)

    # A document that is not part of the reference corpus may contain a token the
    # corpus has never seen; treat it as the rarest possible token (one
    # occurrence) instead of dividing by zero.
    collection_count = max(collection_count, 1)

    # NOTE: this follows the notebook's own definition and uses the total number of
    # occurrences in the collection, not the number of documents containing the token.
    idf = log(1 + n_documents / collection_count)
    return tf * idf

# One {token: tf-idf score} dictionary per document, in corpus order. Each
# distinct token of a document is scored exactly once.
tfidf_scores_list = [
    {token: calculate_tfidf(token, sentence) for token in set(sentence)}
    for sentence in tokenized_text
]
#print (tfidf_scores_list)



In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from scipy.sparse import csr_matrix
import numpy as np

# scikit-learn vectorizers expect one string per document, so re-join each
# token list with single spaces.
corpus = [' '.join(sentence) for sentence in tokenized_text]

# Build the document-term count matrix over the notebook's own vocabulary.
# - analyzer=str.split keeps the tokens produced by tokenize(). The default
#   analyzer re-tokenizes with a pattern that only keeps words of two or more
#   characters, so punctuation and one-letter tokens in `vocab` would be left
#   as all-zero columns.
# - sorted(vocab) fixes the column order explicitly.
count_vectorizer = CountVectorizer(vocabulary=sorted(vocab), analyzer=str.split)
X_count = count_vectorizer.fit_transform(corpus)

# Re-weight the raw counts into TF-IDF scores (scikit-learn's default settings).
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_count)

# Make sure downstream code receives CSR format.
if not isinstance(X_tfidf, csr_matrix):
    X_tfidf = X_tfidf.tocsr()

In [None]:
# Turn the sparse TF-IDF matrix into a regular documents x terms table.
# pandas does not unpack a SciPy sparse matrix passed straight to pd.DataFrame,
# so the matrix is densified first (fine at this corpus size) and every column
# is labelled with the token it represents.
tfidf_df = pd.DataFrame(
    X_tfidf.toarray(),
    columns=count_vectorizer.get_feature_names_out(),
)

# Persist the table in both formats; the row index is just the document
# position, so it is not written out.
tfidf_df.to_csv('tfidf_representations.csv', index=False)

tfidf_df.to_excel('tfidf_representations.xlsx', index=False)