## Text Representation Techniques

### Bag of Words (BoW)

In [1]:
import pandas as pd

# Load the CSV file 'df.csv' into a DataFrame; later cells read its 'Content' column.
df = pd.read_csv('df.csv')

In [2]:
# Import CountVectorizer for converting text data into a bag-of-words representation.
from sklearn.feature_extraction.text import CountVectorizer

In [3]:
# Replace missing values in the 'Content' column with empty strings FIRST.
# The string conversion below would otherwise turn NaN into the literal text 'nan',
# which a later fillna() can no longer detect and which would be counted as a token.
df['Content'] = df['Content'].fillna('')

# Ensure all entries in the 'Content' column are strings (lists are joined with spaces).
df['Content'] = df['Content'].apply(lambda x: ' '.join(x) if isinstance(x, list) else str(x))

# Initialize the CountVectorizer for bag-of-words representation.
count_vectorizer = CountVectorizer()

# Learn the vocabulary and transform the 'Content' column into a sparse matrix of token counts.
message_vector = count_vectorizer.fit_transform(df['Content'])

# Print results for verification.
print("Feature Names:", count_vectorizer.get_feature_names_out())  # Vocabulary of the vectorizer.
print("Message Vector Shape:", message_vector.shape)  # Shape of the resulting sparse matrix.
print("Message Vector Array:\n", message_vector.toarray())  # Dense array representation of the matrix.

Feature Names: ['aand' 'abandon' 'abandoning' ... 'zvrzahl' 'zürich' 'ánd']
Message Vector Shape: (2254, 11443)
Message Vector Array:
 [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [4]:
# Densify the token-count matrix so it can back a regular DataFrame.
message_array = message_vector.toarray()

# Wrap the dense counts in a DataFrame whose column headers are the vocabulary terms.
df_countvectorizer = pd.DataFrame(
    message_array,
    columns=count_vectorizer.get_feature_names_out(),
)

print(df_countvectorizer)

      aand  abandon  abandoning  abe  aber  abilities  ability  abillity  \
0        0        0           0    0     0          0        0         0   
1        0        0           0    0     0          0        0         0   
2        0        0           0    0     0          0        0         0   
3        0        0           0    0     0          0        0         0   
4        0        0           0    0     0          0        0         0   
...    ...      ...         ...  ...   ...        ...      ...       ...   
2249     0        0           0    0     0          0        0         0   
2250     0        0           0    0     0          0        0         0   
2251     0        0           0    0     0          0        0         0   
2252     0        0           0    0     0          0        0         0   
2253     0        0           0    0     0          0        0         0   

      abilty  able  ...  youyour  youyours  zealand  zero  zone  zurich  \
0          0

In [5]:
# Display the column index of the bag-of-words DataFrame, i.e. the vocabulary terms used as headers.
df_countvectorizer.columns

Index(['aand', 'abandon', 'abandoning', 'abe', 'aber', 'abilities', 'ability',
       'abillity', 'abilty', 'able',
       ...
       'youyour', 'youyours', 'zealand', 'zero', 'zone', 'zurich', 'zurick',
       'zvrzahl', 'zürich', 'ánd'],
      dtype='object', length=11443)

### Term Frequency-Inverse Document Frequency (TF-IDF)

In [6]:
# Import TfidfVectorizer for converting text data into a TF-IDF representation.
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
# Coerce every 'Content' entry to a string: lists are joined with spaces,
# anything else goes through str().
df['Content'] = df['Content'].apply(
    lambda entry: str(entry) if not isinstance(entry, list) else ' '.join(entry)
)

# Helper that drops the leading `n` whitespace-separated words from a text.
def remove_first_words(text, n=3):
    """Return `text` without its first `n` words.

    The text is split on runs of whitespace and the words that remain after
    skipping the first `n` are re-joined with single spaces. When nothing
    remains, the result is an empty string.
    """
    remaining_words = text.split()[n:]
    return " ".join(remaining_words)

# Store each message minus its first 3 words in a new 'Filtered_Content' column.
df['Filtered_Content'] = df['Content'].apply(remove_first_words, n=3)

# Initialize the TfidfVectorizer for TF-IDF representation.
tfidf_vectorizer = TfidfVectorizer()

# Fit on 'Filtered_Content' and build its TF-IDF matrix in a single step.
tfidf_message_vector = tfidf_vectorizer.fit_transform(df['Filtered_Content'])

In [8]:
# Display the shape of the TF-IDF matrix as (number of documents, number of unique terms).
tfidf_message_vector.shape

(2254, 11299)

In [9]:
# Densify the TF-IDF matrix so it can back a regular DataFrame.
message_tfidf_array = tfidf_message_vector.toarray()

# Wrap the dense weights in a DataFrame whose column headers are the vocabulary terms.
df_tfidf = pd.DataFrame(
    message_tfidf_array,
    columns=tfidf_vectorizer.get_feature_names_out(),
)

print(df_tfidf)

      aand  abandon  abandoning  abe  aber  abilities  ability  abillity  \
0      0.0      0.0         0.0  0.0   0.0        0.0      0.0       0.0   
1      0.0      0.0         0.0  0.0   0.0        0.0      0.0       0.0   
2      0.0      0.0         0.0  0.0   0.0        0.0      0.0       0.0   
3      0.0      0.0         0.0  0.0   0.0        0.0      0.0       0.0   
4      0.0      0.0         0.0  0.0   0.0        0.0      0.0       0.0   
...    ...      ...         ...  ...   ...        ...      ...       ...   
2249   0.0      0.0         0.0  0.0   0.0        0.0      0.0       0.0   
2250   0.0      0.0         0.0  0.0   0.0        0.0      0.0       0.0   
2251   0.0      0.0         0.0  0.0   0.0        0.0      0.0       0.0   
2252   0.0      0.0         0.0  0.0   0.0        0.0      0.0       0.0   
2253   0.0      0.0         0.0  0.0   0.0        0.0      0.0       0.0   

      abilty      able  ...  youyour  youyours  zealand  zero  zone  zurich  \
0       

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.decomposition import PCA

# Load the labels from the CSV file.
labels = pd.read_csv('df.csv')  # Assumes 'NegoOutcomeLabel' exists in the file.

# Add the target variable to the TF-IDF DataFrame (rows are aligned on the index).
df_tfidf['NegoOutcomeLabel'] = labels['NegoOutcomeLabel']

# Split the data into features (X) and target (y).
X = df_tfidf.drop(columns=['NegoOutcomeLabel'])  # Features (TF-IDF matrix).
y = df_tfidf['NegoOutcomeLabel']  # Target variable.

# Split the data into training and testing sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Optional: Apply PCA to reduce dimensionality (e.g., for model efficiency).
# random_state is fixed because, for a matrix this wide, scikit-learn's default
# solver selection can use a randomized SVD; without a seed the components
# (and every score computed from them) could differ between runs.
pca = PCA(n_components=100, random_state=42)  # Reduce to 100 dimensions.
X_train_pca = pca.fit_transform(X_train)  # Fit on the training split only.
X_test_pca = pca.transform(X_test)  # Reuse the training projection for the test split.

# Train a RandomForestClassifier on the PCA-transformed data.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train_pca, y_train)

# Make predictions on the test data.
y_pred = clf.predict(X_test_pca)

# Evaluate the model.
print("Model Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# **Feature Analysis Without PCA**
# Fit a second forest directly on the original TF-IDF columns so that every
# importance score can be paired with the term it belongs to.
clf_feature_importance = RandomForestClassifier(random_state=42).fit(X_train, y_train)

# Importance scores and the TF-IDF column names they correspond to.
feature_importances = clf_feature_importance.feature_importances_
feature_names = X.columns

# Pair each score with its term as (importance, term).
important_features = list(zip(feature_importances, feature_names))

# Order the pairs from most to least important.
important_features_sorted = sorted(
    important_features,
    key=lambda pair: pair[0],
    reverse=True,
)

# Keep the 50 highest-ranked pairs.
top_50_features = important_features_sorted[:50]

# Report the top 50 terms.
print("Top 50 words in order of importance:")
for importance, term in top_50_features:
    print(f"Word: {term}, Importance: {importance:.5f}")

Model Accuracy: 0.7871396895787139

Classification Report:
               precision    recall  f1-score   support

           0       1.00      0.08      0.14       104
           1       0.78      1.00      0.88       347

    accuracy                           0.79       451
   macro avg       0.89      0.54      0.51       451
weighted avg       0.83      0.79      0.71       451

Top 50 words in order of importance:
Word: kramer, Importance: 0.01150
Word: folklore, Importance: 0.00998
Word: russian, Importance: 0.00822
Word: swiss, Importance: 0.00820
Word: reject, Importance: 0.00638
Word: constance, Importance: 0.00605
Word: price, Importance: 0.00527
Word: sorry, Importance: 0.00498
Word: week, Importance: 0.00448
Word: delivery, Importance: 0.00406
Word: ltd, Importance: 0.00382
Word: already, Importance: 0.00373
Word: best, Importance: 0.00365
Word: traditional, Importance: 0.00364
Word: final, Importance: 0.00360
Word: offer, Importance: 0.00359
Word: willing, Importance: 0.0

### Word Embedding

In [11]:
# Import necessary libraries for Word2Vec and text preprocessing.
from gensim.models import Word2Vec
from gensim.utils import simple_preprocess  # For preprocessing text into tokens.
from nltk.corpus import stopwords  # For accessing stopwords.
import nltk  # For additional NLP utilities.

In [12]:
# Download the NLTK stopwords dataset, then build a set of the English stopwords.
# A set is used so that the membership test in preprocess_text() is a fast lookup.
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lila9\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [13]:
# Helper that tokenizes a text and drops stopwords.
def preprocess_text(text):
    """Tokenize `text` with simple_preprocess and return the tokens not in `stop_words`."""
    return [token for token in simple_preprocess(text) if token not in stop_words]

In [None]:
# Load the combined dataset used to train the word embeddings.
df_combined = pd.read_csv('df_combined.csv')

# Missing messages are read from CSV as NaN (a float), which the tokenizer
# cannot process; map them to '' and coerce every other entry to str first.
corpus = df_combined['Content'].fillna('').astype(str)

# Tokenize every document and strip stopwords; the result is one token list per document.
preprocessed_corpus = [preprocess_text(doc) for doc in corpus]

In [None]:
# Train a Word2Vec model on the preprocessed sentences.
#   sg=1            -> skip-gram model (0 would select CBOW)
#   vector_size=100 -> size of the word vectors
#   window=5        -> context window size
#   min_count=2     -> minimum word frequency
#   workers=4       -> number of threads
model = Word2Vec(
    preprocessed_corpus,
    sg=1,
    vector_size=100,
    window=5,
    min_count=2,
    workers=4,
)

In [None]:
# Save the trained model to disk so it can be reused later.
model_path = "word2vec_model.model"  # Relative path, resolved against the current working directory.
model.save(model_path)

In [None]:
# Example: look up one word's vector and its 10 nearest neighbours.
word = "accept"

if word in model.wv:
    # The word is in the vocabulary: show its vector, then its most similar words.
    print(f"Vector for '{word}': {model.wv[word]}")
    similar_words = model.wv.most_similar(word, topn=10)
    print(f"Words similar to '{word}': {similar_words}")
else:
    print(f"Word '{word}' not in vocabulary.")

In [None]:
# Example: look up one word's vector and its 10 nearest neighbours.
word = "reject"

if word in model.wv:
    # The word is in the vocabulary: show its vector, then its most similar words.
    print(f"Vector for '{word}': {model.wv[word]}")
    similar_words = model.wv.most_similar(word, topn=10)
    print(f"Words similar to '{word}': {similar_words}")
else:
    print(f"Word '{word}' not in vocabulary.")

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer

# Load the CSV data.
data = pd.read_csv('df.csv')

# First look at the data.
print(data.head())

# Compute new features.
# Missing messages are treated as empty text. A plain str() would turn NaN into
# the literal word 'nan' and report 1 word / 3 characters for such a message.
message_text = data['Content'].fillna('').astype(str)
message_words = message_text.apply(lambda text: text.split())

# Number of words in the message.
data['Word_Count'] = message_words.apply(len)

# Number of characters in the message.
data['Char_Length'] = message_text.apply(len)

# Average word length. Messages without any words get 0.0, because np.mean([])
# would emit a RuntimeWarning and put NaN into the feature matrix.
data['Avg_Word_Length'] = message_words.apply(
    lambda words: np.mean([len(word) for word in words]) if words else 0.0
)

# Success column (target variable).
y = data['NegoOutcomeLabel']

# Features for the analysis.
X = data[['Word_Count', 'Char_Length', 'Avg_Word_Length']]

# Split the data into training and test sets.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

# Random forest model to detect relationships between the features and the target.
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

# Predictions and evaluation.
y_pred = rf.predict(X_test)
print("Classification Report:")
print(classification_report(y_test, y_pred))
print("Accuracy Score:", accuracy_score(y_test, y_pred))

# Feature importance.
importances = rf.feature_importances_
features = X.columns
plt.bar(features, importances)
plt.title('Feature Importances')
plt.show()

# Analyse the word content with CountVectorizer, limited to 50 features
# with English stopwords removed.
vectorizer = CountVectorizer(stop_words='english', max_features=50)
term_counts_sparse = vectorizer.fit_transform(data['Content'].fillna(''))
X_vectorized = term_counts_sparse.toarray()
feature_names = vectorizer.get_feature_names_out()

# Add the most frequent words as features: one column of counts per word.
for word, word_counts in zip(feature_names, X_vectorized.T):
    data[word] = word_counts

# Updated feature set for the second analysis: length features plus word counts.
length_feature_columns = ['Word_Count', 'Char_Length', 'Avg_Word_Length']
X_updated = data[length_feature_columns + list(feature_names)]

# Split into training and test sets again.
X_train_updated, X_test_updated, y_train_updated, y_test_updated = train_test_split(
    X_updated,
    y,
    test_size=0.3,
    random_state=42,
)

# New model with the extended features.
rf_updated = RandomForestClassifier(random_state=42)
rf_updated.fit(X_train_updated, y_train_updated)

# New predictions and evaluation.
y_pred_updated = rf_updated.predict(X_test_updated)
print("Updated Classification Report:")
print(classification_report(y_test_updated, y_pred_updated))
print("Updated Accuracy Score:", accuracy_score(y_test_updated, y_pred_updated))


# New feature-importance visualisation.
updated_importances = rf_updated.feature_importances_
updated_features = X_updated.columns

# Show only the 10 most important features (argsort is ascending, so take the tail).
sorted_indices = np.argsort(updated_importances)[-10:]
top_feature_labels = np.array(updated_features)[sorted_indices]
top_feature_scores = updated_importances[sorted_indices]
plt.bar(top_feature_labels, top_feature_scores)
plt.title('Top 10 Updated Feature Importances')
plt.xticks(rotation=45)
plt.show()