In [1]:
import os

import numpy as np
import pandas as pd

In [2]:
# Location of the ISEAR training spreadsheet. The original author's path stays
# the default; set the ISEAR_TRAIN_PATH environment variable to run the
# notebook on another machine without editing this cell.
file_path = os.environ.get('ISEAR_TRAIN_PATH', '/Users/diana/Desktop/isear-train.xlsx')
custom_headers = ['Emotions', 'Text']

# skiprows=1 together with header=None discards the sheet's first row
# (presumably its own header) so the names in custom_headers are used instead.
df = pd.read_excel(file_path, skiprows=1, header=None, names=custom_headers)

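**Note:** `pd.read_excel` needs the `openpyxl` package to read `.xlsx` files. If it is missing, install it once from a terminal with `conda install openpyxl`.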

In [3]:
# Preview the first rows to confirm the two columns were read as expected.
df.head()

Unnamed: 0,Emotions,Text
0,sadness,Losing my girlfriend who made an end to our re...
1,disgust,[ No response.]
2,fear,Staying alone in a dark place.
3,shame,When I failed grade 7.
4,anger,I am a teacher in arts and crafts (boys from 1...


In [4]:
# Number of rows that carry a non-missing emotion label.
total_count = df['Emotions'].count()
print(total_count)

5366


In [5]:
# Class balance: rows per emotion label, most frequent label first.
df['Emotions'].value_counts(sort=True, ascending=False)


Emotions
sadness    775
fear       770
guilt      767
shame      765
anger      764
joy        764
disgust    761
Name: count, dtype: int64

In [6]:
# Show the label column three ways, one per line: the Series itself, its
# shape, and the underlying array of values.
print(df['Emotions'], df['Emotions'].shape, df['Emotions'].values, sep='\n')

0       sadness
1       disgust
2          fear
3         shame
4         anger
         ...   
5361       fear
5362      anger
5363    sadness
5364    disgust
5365       fear
Name: Emotions, Length: 5366, dtype: object
(5366,)
['sadness' 'disgust' 'fear' ... 'sadness' 'disgust' 'fear']


In [7]:
# Distinct emotion classes as a list in alphabetical order.
# list(set(...)) returned them in an order that depends on string hashing,
# which Python randomises per process, so the order (and any label -> index
# mapping built from it) could change every time the kernel restarted.
# Missing labels are dropped: they are not a class and cannot be sorted
# together with strings.
emotion_labels = sorted(df['Emotions'].dropna().unique())

In [8]:


def tokenize(text):
    """Split ``text`` into lowercase tokens, keeping punctuation as tokens.

    Every character for which ``str.isalnum()`` is false is padded with a
    space on both sides, so each punctuation mark or symbol becomes a token
    of its own. The padded string is then lowercased and split on
    whitespace.

    Args:
        text: The string to tokenize.

    Returns:
        A list of lowercase tokens; an empty list if ``text`` is empty or
        contains only whitespace.
    """
    padded = ''.join(ch if ch.isalnum() else f' {ch} ' for ch in text)
    # split() without arguments already ignores leading/trailing whitespace.
    return padded.lower().split()


# Whole corpus as a single string. Documents are joined with a space so the
# last word of one text cannot fuse with the first word of the next into a
# token that occurs in no document.
text = ' '.join(df['Text'].astype(str))

# One token list per document, in row order. astype(str) mirrors how `text`
# is built, so a non-string cell is tokenized as its string form instead of
# making tokenize() fail.
tokenized_text = [tokenize(sentence) for sentence in df['Text'].astype(str)]

# Vocabulary: every distinct token that occurs in at least one document.
vocab = {token for tokens in tokenized_text for token in tokens}


In [9]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from scipy.sparse import csr_matrix

# Re-join each token list into one whitespace-separated string per document.
corpus = [' '.join(sentence) for sentence in tokenized_text]

# Count how often each vocabulary token occurs in each document.
# analyzer=str.split makes the vectorizer take the tokens exactly as
# tokenize() produced them. With the default analyzer the corpus would be
# re-tokenized by the pattern r"(?u)\b\w\w+\b", which drops punctuation and
# one-character tokens and leaves their vocabulary columns permanently zero.
# sorted(vocab) states the (alphabetical) column order explicitly.
count_vectorizer = CountVectorizer(vocabulary=sorted(vocab), analyzer=str.split)
X_count = count_vectorizer.fit_transform(corpus)

# Re-weight the raw counts with TF-IDF.
tfidf_transformer = TfidfTransformer()
X_tfidf = tfidf_transformer.fit_transform(X_count)

# Guarantee CSR format for whatever consumes X_tfidf later.
if not isinstance(X_tfidf, csr_matrix):
    X_tfidf = X_tfidf.tocsr()

# Target vector: one emotion label per row, in the same row order as X_tfidf.
y = df['Emotions'].values


In [10]:
# Build a real documents x terms table: one row per text, one column per
# vocabulary token. Passing the SciPy matrix straight to pd.DataFrame() does
# not expand it into numeric columns; it yields a single object column whose
# cells are sparse row objects, so the exported files were unusable.
tfidf_df = pd.DataFrame.sparse.from_spmatrix(
    X_tfidf, columns=count_vectorizer.get_feature_names_out()
)

# NOTE: both exports write every cell, zeros included, so the files are large
# and the writes are slow.
tfidf_df.to_csv('tfidf_representations.csv', index=False)

# An Excel sheet holds at most 16,384 columns, so this export only works while
# the vocabulary is smaller than that.
tfidf_df.to_excel('tfidf_representations.xlsx', index=False)

In [12]:
# Persist the feature matrix and the label vector with IPython's %store magic
# so that another notebook or kernel session can load them with `%store -r`.
%store X_tfidf
%store y

Stored 'X_tfidf' (csr_matrix)
Stored 'y' (ndarray)
