# Text Representation

## Bag of Words

In [1]:
# Import of necessary libraries

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
# Load the dataset

df = pd.read_excel('Data_Group2.xlsx', index_col=0)
df

Unnamed: 0_level_0,ReceiverID,ActionType,NegoOutcome,Content
SenderID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
31,32,Offer,FinalAccept,"Hey Chris, Great that we are working together ..."
32,31,Counteroffer,FinalAccept,"Hey Alex, The pleasure is all mine. For starte..."
31,32,Counteroffer,FinalAccept,"Hey Chris, Thank you for your response. I am..."
32,31,Counteroffer,FinalAccept,"Hello Alex, I think we have a solid compromis ..."
31,32,Question,FinalAccept,"Dear Chris, I am glad to hear that you are wil..."
...,...,...,...,...
851,856,Counteroffer,FinalReject,"Dear Chis Meyer,\n \nthank you for your latest..."
856,851,Counteroffer,FinalReject,"Dear Alex Kramer,\nthank you very much for you..."
851,856,Counteroffer,FinalReject,"Dear Chris Meyer,\nthank you for your fast ans..."
856,851,Counteroffer,FinalReject,"Dear Alex Kramer,\nI am still very interested ..."


In [3]:
print(df.head())

          ReceiverID    ActionType  NegoOutcome  \
SenderID                                          
31                32         Offer  FinalAccept   
32                31  Counteroffer  FinalAccept   
31                32  Counteroffer  FinalAccept   
32                31  Counteroffer  FinalAccept   
31                32      Question  FinalAccept   

                                                    Content  
SenderID                                                     
31        Hey Chris, Great that we are working together ...  
32        Hey Alex, The pleasure is all mine. For starte...  
31        Hey Chris,  Thank you for your response.  I am...  
32        Hello Alex, I think we have a solid compromis ...  
31        Dear Chris, I am glad to hear that you are wil...  


In [4]:
text_data = df["Content"].dropna().tolist()

In [5]:
def preprocess_text(texts):
    processed_texts = []
    for text in texts:
        text = text.lower()  # Kleinbuchstaben
        text = ''.join([char for char in text if char.isalnum() or char.isspace()])
        processed_texts.append(text)
    return processed_texts

In [6]:
processed_text_data = preprocess_text(text_data)

In [7]:
count_vectorizer = CountVectorizer()
message_vector = count_vectorizer.fit_transform(df['Content'])
message_vector

<2254x9132 sparse matrix of type '<class 'numpy.int64'>'
	with 269192 stored elements in Compressed Sparse Row format>

In [8]:
message_array = message_vector.toarray()
df_countvectorizer = pd.DataFrame(data=message_array, columns=count_vectorizer.get_feature_names_out())
print(df_countvectorizer)

      00  000  02th  04  10  100  101  102  1040  106  ...  youthful  zahl  \
0      0    0     0   0   0    0    0    0     0    0  ...         0     0   
1      0    0     0   0   0    0    0    0     0    0  ...         0     0   
2      0    0     0   0   0    0    0    0     0    0  ...         0     0   
3      0    0     0   0   0    0    0    0     0    0  ...         0     0   
4      0    0     0   0   0    0    0    0     0    0  ...         0     0   
...   ..  ...   ...  ..  ..  ...  ...  ...   ...  ...  ...       ...   ...   
2249   0    0     0   0   0    0    0    0     0    0  ...         0     0   
2250   0    0     0   0   0    0    0    0     0    0  ...         0     0   
2251   0    0     0   0   0    0    0    0     0    0  ...         0     0   
2252   0    0     0   0   0    0    0    0     0    0  ...         0     0   
2253   0    0     0   0   0    0    0    0     0    0  ...         0     0   

      zealand  zero  zone  zurich  zurick  zvr  zürich  ánd  
0

In [9]:
df_countvectorizer.columns

Index(['00', '000', '02th', '04', '10', '100', '101', '102', '1040', '106',
       ...
       'youthful', 'zahl', 'zealand', 'zero', 'zone', 'zurich', 'zurick',
       'zvr', 'zürich', 'ánd'],
      dtype='object', length=9132)

## Term Frequency - Inverse Document Frequency (TF-IDF)

In [10]:
count_vectorizer_ngrams = CountVectorizer(ngram_range=(1,2), min_df=0.01, max_df=0.7, max_features=100)
message_vector_ngrams = count_vectorizer_ngrams.fit_transform(df['Content'])

In [11]:
message_ngrams_array = message_vector_ngrams.toarray()
df_countvectorizer_ngrams = pd.DataFrame(data=message_ngrams_array, columns=count_vectorizer_ngrams.get_feature_names_out())
print(df_countvectorizer)

      00  000  02th  04  10  100  101  102  1040  106  ...  youthful  zahl  \
0      0    0     0   0   0    0    0    0     0    0  ...         0     0   
1      0    0     0   0   0    0    0    0     0    0  ...         0     0   
2      0    0     0   0   0    0    0    0     0    0  ...         0     0   
3      0    0     0   0   0    0    0    0     0    0  ...         0     0   
4      0    0     0   0   0    0    0    0     0    0  ...         0     0   
...   ..  ...   ...  ..  ..  ...  ...  ...   ...  ...  ...       ...   ...   
2249   0    0     0   0   0    0    0    0     0    0  ...         0     0   
2250   0    0     0   0   0    0    0    0     0    0  ...         0     0   
2251   0    0     0   0   0    0    0    0     0    0  ...         0     0   
2252   0    0     0   0   0    0    0    0     0    0  ...         0     0   
2253   0    0     0   0   0    0    0    0     0    0  ...         0     0   

      zealand  zero  zone  zurich  zurick  zvr  zürich  ánd  
0

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [13]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_vectorizer.fit(df['Content'])
tfidf_message_vector = tfidf_vectorizer.transform(df['Content'])

In [14]:
tfidf_message_vector.shape

(2254, 9132)

In [15]:
message_tfidf_array = tfidf_message_vector.toarray()
df_tfidf = pd.DataFrame(data=message_tfidf_array, columns = tfidf_vectorizer.get_feature_names_out())
print(df_tfidf)

       00  000  02th   04   10  100  101  102  1040  106  ...  youthful  zahl  \
0     0.0  0.0   0.0  0.0  0.0  0.0  0.0  0.0   0.0  0.0  ...       0.0   0.0   
1     0.0  0.0   0.0  0.0  0.0  0.0  0.0  0.0   0.0  0.0  ...       0.0   0.0   
2     0.0  0.0   0.0  0.0  0.0  0.0  0.0  0.0   0.0  0.0  ...       0.0   0.0   
3     0.0  0.0   0.0  0.0  0.0  0.0  0.0  0.0   0.0  0.0  ...       0.0   0.0   
4     0.0  0.0   0.0  0.0  0.0  0.0  0.0  0.0   0.0  0.0  ...       0.0   0.0   
...   ...  ...   ...  ...  ...  ...  ...  ...   ...  ...  ...       ...   ...   
2249  0.0  0.0   0.0  0.0  0.0  0.0  0.0  0.0   0.0  0.0  ...       0.0   0.0   
2250  0.0  0.0   0.0  0.0  0.0  0.0  0.0  0.0   0.0  0.0  ...       0.0   0.0   
2251  0.0  0.0   0.0  0.0  0.0  0.0  0.0  0.0   0.0  0.0  ...       0.0   0.0   
2252  0.0  0.0   0.0  0.0  0.0  0.0  0.0  0.0   0.0  0.0  ...       0.0   0.0   
2253  0.0  0.0   0.0  0.0  0.0  0.0  0.0  0.0   0.0  0.0  ...       0.0   0.0   

      zealand  zero  zone  

In [16]:
df_tfidf.to_csv('df_tfidf.csv', index=False)