# Import libraries

In [None]:
import pandas as pd
import numpy as np
from tqdm import tqdm
import string
import re
from nltk.corpus import stopwords
import seaborn as sns
import statsmodels.api as sm
from wordcloud import WordCloud
import matplotlib.pyplot as plt

from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay, f1_score, classification_report, balanced_accuracy_score, roc_auc_score, roc_curve, auc, recall_score, precision_score
from sklearn.model_selection import GridSearchCV, train_test_split, RandomizedSearchCV
from scipy.stats import randint
import xgboost as xgb

In [None]:
# Read the two input tables from the local data folder: the model output
# and the cleaned posts.
df_output, df_politics = (
    pd.read_csv(csv_path)
    for csv_path in ("data/output.csv", "data/cleaned_data.csv")
)

Merge the politeness features

In [None]:
# Read the per-post politeness features.
df_politeness = pd.read_csv(filepath_or_buffer="data/politeness_features.csv")

In [None]:
# Inner-join the politeness features onto the posts. No `on=` is given, so
# pandas joins on every column name the two frames have in common.
df_politics = pd.merge(df_politics, df_politeness)

In [None]:
# Show the frame (the notebook renders the last expression of a cell).
df_politics

# Representation check


In [None]:
# Percentage of rows per political leaning; serves as the majority-class
# baseline for the classifiers further down.
baseline = (
    df_politics['political_leaning']
    .value_counts()
    .div(len(df_politics))
    .mul(100)
)
print(baseline)


# Applying standard linear regression

In [None]:
# Bar chart of the number of posts per political leaning.
df_politics.value_counts('political_leaning').plot.bar(
    fontsize=10,
    xlabel="political_leaning",
    ylabel="Number of posts",
);

In [None]:
# Encode the leaning labels as integers (left -> 0, center -> 1, right -> 2)
# because the models below need a numeric target. Labels that are not in the
# mapping become NaN.
leaning_codes = {'left': 0, 'center': 1, 'right': 2}
df_politics['political_leaning_id'] = df_politics['political_leaning'].map(leaning_codes)

In [None]:
# Pearson correlation between the encoded leaning and the curse-word count,
# both as a single number and as a 2x2 matrix.
df_new = df_politics[['political_leaning_id', 'amount_of_cursewords']].copy()
correlation = df_new['political_leaning_id'].corr(df_new['amount_of_cursewords'])
correlation_matrix = df_new.corr()
print(correlation_matrix)

In [None]:
# Everything except identifiers, raw text and the target itself is a feature.
non_feature_columns = [
    'Unnamed: 0',
    'username',
    'post',
    'political_leaning',
    'cleaned_post',
    'political_leaning_id',
]
X = df_politics.drop(columns=non_feature_columns)
Y = df_politics['political_leaning_id']

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# statsmodels does not add an intercept on its own, so prepend a constant.
X_train = sm.add_constant(X_train)

# Ordinary least squares of the encoded leaning on all features.
ols_model = sm.OLS(y_train, X_train)
results = ols_model.fit()

print(results.summary())

In [None]:
# Hand-picked feature columns; `super_significant` is the stricter subset
# that the models further down are trained on.
significant = [
    'amount_of_cursewords',
    'Hedges',
    'Impersonal.Pronoun',
    'Swearing',
    'Negation',
    'Filler.Pause',
    'Informal.Title',
    'Goodbye',
    'For.Me',
    'Reasoning',
    'Reassurance',
    'Ask.Agency',
    'Give.Agency',
    'First.Person.Plural',
    'First.Person.Single',
    'Second.Person',
    'Third.Person',
    'Positive.Emotion',
    'Negative.Emotion',
    'Questions',
    'Apology',
    'Truth.Intensifier',
    'Conjunction.Start',
]
super_significant = [
    'amount_of_cursewords',
    'Hedges',
    'Impersonal.Pronoun',
    'Swearing',
    'Negation',
    'Filler.Pause',
    'Informal.Title',
    'Reasoning',
    'First.Person.Plural',
    'First.Person.Single',
    'Second.Person',
    'Questions',
    'Apology',
    'Truth.Intensifier',
]

## Taking a closer look at the data

In [None]:
# Curse-word counts below which 68%, 95% and 99.7% of the posts fall, i.e.
# the coverage of 1, 2 and 3 standard deviations under a normal distribution.
cursewords = df_politics['amount_of_cursewords']
onesigma = cursewords.quantile(0.68)
twosigma = cursewords.quantile(0.95)
threesigma = cursewords.quantile(0.997)
print(threesigma, twosigma, onesigma)

In [None]:
# Posts with at least 52 curse words.
# NOTE(review): 52 is presumably the 99.7% value printed above -- confirm.
df_politics.loc[df_politics['amount_of_cursewords'].ge(52)]

In [None]:
# Keep only the posts with at most 52 curse words.
# NOTE(review): 52 is presumably the 99.7% value printed above -- confirm.
df_politics_cleaned = df_politics.loc[df_politics['amount_of_cursewords'].le(52)]

In [None]:
# (Disabled cell) Word cloud and a pretty picture: builds one text per
# political leaning and renders the word cloud of the right-leaning posts.
# NOTE: len(<text>) below counts characters, not words.

# df_right = df_politics_cleaned[df_politics_cleaned['political_leaning'] == 'right']
# df_left = df_politics_cleaned[df_politics_cleaned['political_leaning'] == 'left']
# df_centered = df_politics_cleaned[df_politics_cleaned['political_leaning'] == 'center']
# right_text = " ".join(word for word in df_right.post)
# print("There are {} words in the combination of all review.".format(len(right_text)))
# left_text = " ".join(word for word in df_left.post)
# print("There are {} words in the combination of all review.".format(len(left_text)))
# centered_text = " ".join(word for word in df_centered.post)
# print("There are {} words in the combination of all review.".format(len(centered_text)))
# # Create and generate a word cloud image:
# wordcloud = WordCloud().generate(right_text)

# # Display the generated image:
# plt.imshow(wordcloud, interpolation='bilinear')
# plt.axis("off")
# plt.show()

In [None]:
# (Disabled cell) Word cloud of the left-leaning posts; needs `left_text`
# from the disabled word-cloud cell before it.
# wordcloud = WordCloud().generate(left_text)

# # Display the generated image:
# plt.imshow(wordcloud, interpolation='bilinear')
# plt.axis("off")
# plt.show()

In [None]:
# Distribution of the number of curse words per post, with the 68%, 95% and
# 99.7% percentile cut-offs overlaid.
#
# The counts are sorted by curse-word value and drawn with matplotlib's own
# bar() so that the x axis is in data units. value_counts() orders by
# frequency and a pandas bar plot places bars at ordinal positions 0..n-1,
# which would put the ticks and the vertical percentile lines on a different
# scale than the bars.
curse_counts = (
    df_politics_cleaned['amount_of_cursewords'].value_counts().sort_index()
)

fig, ax = plt.subplots()
ax.bar(curse_counts.index, curse_counts.values)
ax.set(
    xlabel="Number of curse words",
    ylabel="Number of posts",
    title="Number of curse words per post",
)
ax.set_xticks(np.arange(0, 61, 10))
ax.tick_params(labelsize=10)

# Draw and annotate the percentile lines. The label's x is in data
# coordinates and its y is a fraction of the axes height, so it stays
# visible whatever the tallest bar is.
percentile_lines = ((onesigma, '68%'), (twosigma, '95%'), (threesigma, '99.7%'))
for cutoff, label in percentile_lines:
    ax.axvline(x=cutoff, color='r', linestyle='-')
    ax.text(cutoff, 0.6, label, rotation=90, transform=ax.get_xaxis_transform())

In [None]:
# For every selected feature: find the 99.7% percentile over all posts,
# report how many posts reach it, and plot the feature's distribution with
# that cut-off marked. Then build `df_politics_super_cleaned`, which keeps
# only the posts that are at or below the cut-off for every feature.
sigmas = {}
for word in super_significant:
    # A loop-local name is used on purpose: reusing `threesigma` here would
    # overwrite the curse-word cut-off that the earlier plot cell relies on.
    cutoff = df_politics[word].quantile(0.997)
    extreme_count = int((df_politics[word] >= cutoff).sum())
    print(word, cutoff, extreme_count)

    # Sort by feature value and plot in data units so that the vertical
    # cut-off line lines up with the bars; let matplotlib choose ticks that
    # fit each feature's own range.
    # NOTE(review): assumes the features are integer-like counts -- confirm.
    counts = df_politics_cleaned[word].value_counts().sort_index()
    fig, ax = plt.subplots()
    ax.bar(counts.index, counts.values)
    ax.set(xlabel=word, ylabel="Number of posts", title=word)
    ax.tick_params(labelsize=10)
    ax.axvline(x=cutoff, color='r', linestyle='-')
    plt.show()

    sigmas[word] = cutoff

# A row survives only if it is within the cut-off for all features
# (rows with NaN in any of these features are dropped, as comparisons with
# NaN are False).
within_cutoffs = (
    df_politics[list(sigmas)].le(pd.Series(sigmas), axis='columns').all(axis=1)
)
df_politics_super_cleaned = df_politics.loc[within_cutoffs].copy()

In [None]:
# Violin plot of the curse-word count (y) for each political leaning (x),
# drawn from the outlier-filtered posts.
sns.violinplot(
    data=df_politics_super_cleaned,
    x="political_leaning",
    y="amount_of_cursewords",
)

### Look at the correlation within 3σ of the data

In [None]:
# Same correlation check as before, now on the outlier-filtered posts.
df_new = df_politics_super_cleaned[['political_leaning_id', 'amount_of_cursewords']].copy()
correlation = df_new['political_leaning_id'].corr(df_new['amount_of_cursewords'])
correlation_matrix = df_new.corr()
print(correlation_matrix)

In [None]:
# Target and the selected feature columns from the outlier-filtered frame.
Y = df_politics_super_cleaned['political_leaning_id']
X = df_politics_super_cleaned[super_significant]

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Prepend the intercept column, which statsmodels does not add by itself.
X_train = sm.add_constant(X_train)

# Fit ordinary least squares and show the coefficient table.
ols_model = sm.OLS(y_train, X_train)
results = ols_model.fit()

print(results.summary())

# Random Forest Classifier

In [None]:
# Random forest on the selected features: an untuned baseline first, then a
# randomized hyperparameter search whose best model is evaluated in detail.
# All random components are seeded so that re-running the cell reproduces
# the same numbers (the split was already seeded).

# Split the data into features (X) and target (Y)
X = df_politics_super_cleaned[super_significant]
Y = df_politics_super_cleaned['political_leaning_id']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# --- Baseline with default hyperparameters ---
rf = RandomForestClassifier(random_state=42)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)
f1 = f1_score(y_test, y_pred, average='weighted')
print("F1 score:", f1)

# --- Hyperparameter Tuning ---
# scipy's randint excludes the upper bound, so e.g. max_features is drawn
# from 1..13.
param_dist = {'n_estimators': randint(50, 500),
              'max_depth': randint(1, 20),
              'max_features': randint(1, 14)}

# for grid search
# forest_params = [{
#     'n_estimators': [50, 100, 200, 300, 400, 500],
#     'max_depth': list(range(5, 15)),
#     'max_features': list(range(6, 14))
#     }]

# Create a random forest classifier
rf = RandomForestClassifier(random_state=42)

# Use random search (10 candidates, 5-fold CV) to find the best hyperparameters
rand_search = RandomizedSearchCV(rf,
                                 param_distributions=param_dist,
                                 n_iter=10,
                                 cv=5,
                                 random_state=42)

# Fit the random search object to the data
rand_search.fit(X_train, y_train)

# Create a variable for the best model (refit on the full training set)
best_rf = rand_search.best_estimator_

# Print the best hyperparameters
print('Best hyperparameters:', rand_search.best_params_)

# --- end of hyperparameter tuning ---

# Generate predictions with the best model
y_pred = best_rf.predict(X_test)

print('\n------------------ Confusion Matrix -----------------\n')
ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_test, y_pred)).plot();
plt.show()
print('\n-------------------- Key Metrics --------------------')
print('\nAccuracy: {:.2f}'.format(accuracy_score(y_test, y_pred)))
print('Balanced Accuracy: {:.2f}\n'.format(balanced_accuracy_score(y_test, y_pred)))

# Precision / recall / F1 under each multi-class averaging scheme.
for label, average in (('Micro', 'micro'), ('Macro', 'macro'), ('Weighted', 'weighted')):
    print('{} Precision: {:.2f}'.format(label, precision_score(y_test, y_pred, average=average)))
    print('{} Recall: {:.2f}'.format(label, recall_score(y_test, y_pred, average=average)))
    print('{} F1-score: {:.2f}\n'.format(label, f1_score(y_test, y_pred, average=average)))

print('\n--------------- Classification Report ---------------\n')
print(classification_report(y_test, y_pred))

# Logistic

In [None]:
# Multinomial logistic regression on the selected features, followed by the
# standard evaluation report.

# Target and feature matrix from the outlier-filtered frame.
Y = df_politics_super_cleaned['political_leaning_id']
X = df_politics_super_cleaned[super_significant]

# 80/20 hold-out split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Fit on the training part and predict the held-out part.
logreg = LogisticRegression(max_iter=1000, random_state=42)
y_pred = logreg.fit(X_train, y_train).predict(X_test)

# Compute every score first, then print the report.
averages = ('micro', 'macro', 'weighted')
precision_by_avg = {avg: precision_score(y_test, y_pred, average=avg) for avg in averages}
recall_by_avg = {avg: recall_score(y_test, y_pred, average=avg) for avg in averages}
f1_by_avg = {avg: f1_score(y_test, y_pred, average=avg) for avg in averages}

print('\n------------------ Confusion Matrix -----------------\n')
cm = confusion_matrix(y_test, y_pred)
ConfusionMatrixDisplay(confusion_matrix=cm).plot();
plt.show()

print('\n-------------------- Key Metrics --------------------')
print('\nAccuracy: {:.2f}'.format(accuracy_score(y_test, y_pred)))
print('Balanced Accuracy: {:.2f}\n'.format(balanced_accuracy_score(y_test, y_pred)))

print('Micro Precision: {:.2f}'.format(precision_by_avg['micro']))
print('Micro Recall: {:.2f}'.format(recall_by_avg['micro']))
print('Micro F1-score: {:.2f}\n'.format(f1_by_avg['micro']))

print('Macro Precision: {:.2f}'.format(precision_by_avg['macro']))
print('Macro Recall: {:.2f}'.format(recall_by_avg['macro']))
print('Macro F1-score: {:.2f}\n'.format(f1_by_avg['macro']))

print('Weighted Precision: {:.2f}'.format(precision_by_avg['weighted']))
print('Weighted Recall: {:.2f}'.format(recall_by_avg['weighted']))
print('Weighted F1-score: {:.2f}\n'.format(f1_by_avg['weighted']))

print('\n--------------- Classification Report ---------------\n')
print(classification_report(y_test, y_pred))

# Naive Bayes

In [None]:
# Gaussian naive Bayes on the selected features, followed by the standard
# evaluation report.

# Split the data into features (X) and target (Y)
X = df_politics_super_cleaned[super_significant]
Y = df_politics_super_cleaned['political_leaning_id']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

gnb = GaussianNB()

# Fit and predict once; these predictions are reused for the whole report.
y_pred = gnb.fit(X_train, y_train).predict(X_test)
print("Number of mislabeled points out of a total %d points : %d"
      % (X_test.shape[0], (y_test != y_pred).sum()))

print('\n------------------ Confusion Matrix -----------------\n')
ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_test, y_pred)).plot();
plt.show()
print('\n-------------------- Key Metrics --------------------')
print('\nAccuracy: {:.2f}'.format(accuracy_score(y_test, y_pred)))
print('Balanced Accuracy: {:.2f}\n'.format(balanced_accuracy_score(y_test, y_pred)))

# Precision / recall / F1 under each multi-class averaging scheme.
for label, average in (('Micro', 'micro'), ('Macro', 'macro'), ('Weighted', 'weighted')):
    print('{} Precision: {:.2f}'.format(label, precision_score(y_test, y_pred, average=average)))
    print('{} Recall: {:.2f}'.format(label, recall_score(y_test, y_pred, average=average)))
    print('{} F1-score: {:.2f}\n'.format(label, f1_score(y_test, y_pred, average=average)))

print('\n--------------- Classification Report ---------------\n')
print(classification_report(y_test, y_pred))

# Bare Decision Tree

In [None]:
# Single, untuned decision tree on the selected features, followed by the
# standard evaluation report.

# Split the data into features (X) and target (Y)
X = df_politics_super_cleaned[super_significant]
Y = df_politics_super_cleaned['political_leaning_id']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Seeded like the split and the logistic regression: the tree builder
# permutes features at each split, so an unseeded tree can differ from one
# run to the next.
clf = tree.DecisionTreeClassifier(random_state=42)

clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

print('\n------------------ Confusion Matrix -----------------\n')
ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_test, y_pred)).plot();
plt.show()
print('\n-------------------- Key Metrics --------------------')
print('\nAccuracy: {:.2f}'.format(accuracy_score(y_test, y_pred)))
print('Balanced Accuracy: {:.2f}\n'.format(balanced_accuracy_score(y_test, y_pred)))

# Precision / recall / F1 under each multi-class averaging scheme.
for label, average in (('Micro', 'micro'), ('Macro', 'macro'), ('Weighted', 'weighted')):
    print('{} Precision: {:.2f}'.format(label, precision_score(y_test, y_pred, average=average)))
    print('{} Recall: {:.2f}'.format(label, recall_score(y_test, y_pred, average=average)))
    print('{} F1-score: {:.2f}\n'.format(label, f1_score(y_test, y_pred, average=average)))

print('\n--------------- Classification Report ---------------\n')
print(classification_report(y_test, y_pred))

# XGBoost

based on https://www.kaggle.com/code/emmanuelfwerr/xgboost-multi-class-classification

In [None]:
# XGBoost multi-class classifier with default hyperparameters and early
# stopping, followed by training curves and the standard evaluation report.

# Split the data into features (X) and target (Y).
# Uses the same outlier-filtered frame as the other classifiers so that the
# reported scores are comparable across models.
X = df_politics_super_cleaned[super_significant]
Y = df_politics_super_cleaned['political_leaning_id']

# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

# Carve a validation set out of the training data for early stopping.
# Monitoring the test set instead would let it pick the number of boosting
# rounds and make the final test scores optimistically biased.
X_fit, X_val, y_fit, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

## ---------- XGBoost model v1 ----------
## base run of model with default hyperparameters

xgb_clf = xgb.XGBClassifier(objective='multi:softmax',
                            num_class=3,
                            early_stopping_rounds=10,
                            eval_metric=['merror', 'auc', 'mlogloss'],
                            seed=42)
xgb_clf.fit(X_fit,
            y_fit,
            verbose=0, # set to 1 to see xgb training round intermediate results
            eval_set=[(X_fit, y_fit), (X_val, y_val)])

# preparing evaluation metric plots
# ('validation_0' / 'validation_1' follow the order of `eval_set` above)
results = xgb_clf.evals_result()
epochs = len(results['validation_0']['mlogloss'])
x_axis = range(0, epochs)

# one learning-curve figure per metric: 'mlogloss' and 'merror'
for metric in ('mlogloss', 'merror'):
    fig, ax = plt.subplots(figsize=(9, 5))
    ax.plot(x_axis, results['validation_0'][metric], label='Train')
    ax.plot(x_axis, results['validation_1'][metric], label='Validation')
    ax.legend()
    ax.set_ylabel(metric)
    ax.set_title('XGBoost {}'.format(metric))
    plt.show()

## ---------- Model Classification Report ----------
## get predictions on the untouched test set and create model quality report

y_pred = xgb_clf.predict(X_test)

print('\n------------------ Confusion Matrix -----------------\n')
ConfusionMatrixDisplay(confusion_matrix=confusion_matrix(y_test, y_pred)).plot();
plt.show()
print('\n-------------------- Key Metrics --------------------')
print('\nAccuracy: {:.2f}'.format(accuracy_score(y_test, y_pred)))
print('Balanced Accuracy: {:.2f}\n'.format(balanced_accuracy_score(y_test, y_pred)))

# Precision / recall / F1 under each multi-class averaging scheme.
for label, average in (('Micro', 'micro'), ('Macro', 'macro'), ('Weighted', 'weighted')):
    print('{} Precision: {:.2f}'.format(label, precision_score(y_test, y_pred, average=average)))
    print('{} Recall: {:.2f}'.format(label, recall_score(y_test, y_pred, average=average)))
    print('{} F1-score: {:.2f}\n'.format(label, f1_score(y_test, y_pred, average=average)))

print('\n--------------- Classification Report ---------------\n')
print(classification_report(y_test, y_pred))
print('---------------------- XGBoost ----------------------') # unnecessary fancy styling