In [1]:
# INSTALL IF NEEDED:

!pip install emoji optuna

Collecting emoji
  Downloading emoji-2.9.0-py2.py3-none-any.whl (397 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m397.5/397.5 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m[36m0:00:01[0m
Installing collected packages: emoji
Successfully installed emoji-2.9.0


In [2]:
# CONNECT TO COLAB IF NEEDED:

from google.colab import drive
import os

drive.mount('/content/drive')
os.chdir('./drive/MyDrive/data/tue_lai')

ModuleNotFoundError: No module named 'google'

## Load the data

In [8]:
import pandas as pd
import os
os.chdir('../../lai-data')
data_path = 'political_leaning.csv'  # gender / feeling_thinking
data = pd.read_csv(data_path)
data.head()

Unnamed: 0,auhtor_ID,post,political_leaning
0,t2_7ramzeng,"You can ""buy"" the show and stream it through t...",right
1,t2_7ramzeng,"me want to play Q*bert Holy shit, based Alex J...",right
2,t2_7ramzeng,Shouldn't rely on any external services or per...,right
3,t2_7ramzeng,PR to a specific person. Usually that just mea...,right
4,t2_7ramzeng,This article's intention is clear that they wa...,right


In [9]:
TARGET_COL = 'political_leaning'  # change for other datasets
CLASS_SIZE = 500
data_samples = []

for val in data[TARGET_COL].unique():
    data_samples.append(data[data[TARGET_COL] == val].sample(CLASS_SIZE))
data = pd.concat(data_samples)

In [10]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

INDEPENDENT_COL = 'post'


def label_encode(df, col_name):
    label_encoder = LabelEncoder()
    df[col_name] = label_encoder.fit_transform(df[col_name])
    return df, label_encoder


df, le = label_encode(data, TARGET_COL)
X_train, X_test, y_train, y_test = train_test_split(df[INDEPENDENT_COL], df[TARGET_COL])

## Data Pipelines

In [11]:
import nltk
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.tag import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.base import BaseEstimator, TransformerMixin, ClassifierMixin
from sklearn.svm import LinearSVC

import string
import emoji
import numpy as np

nltk.download("punkt")
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /home/dbalm/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/dbalm/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [12]:
class StylometryFeatureExtractor(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        stylometry_features = []
        for text in X:
            # Tokenize sentences and words
            sentences = [word_tokenize(sentence) for sentence in sent_tokenize(text)]
            words = [word for sentence in sentences for word in sentence]

            # Basic stylometry features
            features = {
                'sentence_count': len(sentences),
                'word_count': len(words),
                'avg_sentence_length': len(words) / len(sentences) if len(sentences) > 0 else 0,
                'avg_word_length': sum(len(word) for word in words) / len(words) if len(words) > 0 else 0,
                'num_punctuation': sum(1 for char in text if char in string.punctuation),
                'num_uppercase': sum(1 for char in text if char.isupper()),
                'num_digits': sum(1 for char in text if char.isdigit()),
                'num_emojis': len([char for char in text if char in emoji.EMOJI_DATA])
            }

            stylometry_features.append(features)

        return pd.DataFrame(stylometry_features)

In [13]:
def get_single_pipeline_data(transformer):
    output = dict()
    X_processed = transformer.fit_transform(X_train)
    split = train_test_split(X_processed, y_train, test_size=0.25,random_state=42)
    output['X_train'], output['X_val'], output['y_train'], output['y_val'] = split
    output['X_test'] = transformer.transform(X_test)
    return output

def get_pipeline_data():
    result = dict()

    bow_transformer = Pipeline([
    ('bow', CountVectorizer(stop_words='english')),
    ('scaler', StandardScaler(with_mean=False))
    ])
    result['bow'] = get_single_pipeline_data(CountVectorizer(stop_words='english'))

    style_bow_transformer = Pipeline([
    ('features', FeatureUnion([
        ('text', CountVectorizer(stop_words='english')),
        ('stylometry', StylometryFeatureExtractor())
    ])),
    ('scaler', StandardScaler(with_mean=False))
    ])
    result['style_bow'] = get_single_pipeline_data(style_bow_transformer)

    return result

In [14]:
pipelines_data = get_pipeline_data()

## Utils

In [15]:
from sklearn.metrics import confusion_matrix, classification_report, f1_score
import matplotlib.pyplot as plt
import numpy as np

def evaluate_clf(clf, X, y_true, classes, normalize=True, cmap=plt.cm.Blues):
    y_pred = clf.predict(X)

    print(y_pred.shape, y_true.shape)
    print(classification_report(y_true, y_pred))

    cm = confusion_matrix(y_true, y_pred)

    if normalize:
        cm = cm.astype('float') / cm.sum(axis=1)[:, np.newaxis]
        title = 'Normalized Confusion Matrix'
    else:
        title = 'Confusion Matrix'

    fig, ax = plt.subplots()
    im = ax.imshow(cm, interpolation='nearest', cmap=cmap)
    ax.figure.colorbar(im, ax=ax)
    ax.set(xticks=np.arange(cm.shape[1]),
           yticks=np.arange(cm.shape[0]),
           xticklabels=classes, yticklabels=classes,
           title=title,
           ylabel='True label',
           xlabel='Predicted label')

    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

    fmt = '.2f' if normalize else 'd'
    thresh = cm.max() / 2.
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            ax.text(j, i, format(cm[i, j], fmt),
                    ha="center", va="center",
                    color="white" if cm[i, j] > thresh else "black")

    fig.tight_layout()
    plt.show()


In [16]:
import optuna
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
from xgboost import XGBClassifier
import xgboost as xgb
from functools import partial


optuna.logging.set_verbosity(optuna.logging.WARNING)
xgb.set_config(verbosity=1)
SEED = 0


def xgb_objective(trial, data_type, use_gpu=True):
    params = {
        'objective': 'multi:softmax',
        'num_class': 3,
        'max_depth': trial.suggest_int('max_depth', 2, 20),
        'n_estimators': trial.suggest_int('n_estimators', 10, 1000),
        'n_jobs': -1 if not use_gpu else None,
        'device': "cuda" if use_gpu else None
    }

    model = XGBClassifier(**params, random_state=42)
    model.fit(pipelines_data[data_type]['X_train'], pipelines_data[data_type]['y_train'])
    y_pred = model.predict(pipelines_data[data_type]['X_val'])
    accuracy = accuracy_score(pipelines_data[data_type]['y_val'], y_pred)
    return accuracy


def rf_objective(trial, data_type):
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 10, 1000),
        'max_depth': trial.suggest_int('max_depth', 2, 20, log=True)
    }

    model = RandomForestClassifier(**params, random_state=42)
    model.fit(pipelines_data[data_type]['X_train'], pipelines_data[data_type]['y_train'])
    y_pred = model.predict(pipelines_data[data_type]['X_val'])
    accuracy = accuracy_score(pipelines_data[data_type]['y_val'], y_pred)
    return accuracy

In [33]:
def rf_finetuning(data_type='bow'):
    partial_objective_rf = partial(rf_objective, data_type=data_type)
    study = optuna.create_study(direction='maximize')
    study.optimize(partial_objective_rf, n_trials=100)

    best_model = RandomForestClassifier(random_state=SEED, **study.best_params)
    best_model.fit(pipelines_data[data_type]['X_train'], pipelines_data[data_type]['y_train'])
    return best_model

def xgb_finetuning(data_type='bow'):
    partial_objective_xgb = partial(xgb_objective, data_type=data_type)
    study = optuna.create_study(direction='maximize')
    study.optimize(partial_objective_xgb, n_trials=10)

    best_model = XGBClassifier(random_state=SEED, **study.best_params)
    best_model.fit(pipelines_data[data_type]['X_train'], pipelines_data[data_type]['y_train'])
    return best_model

## Training & evaluating

### No Style SVM

In [None]:
data_type = 'bow'
clf_base = LinearSVC(max_iter=10 ** 5)
clf_base.fit(pipelines_data[data_type]['X_train'], pipelines_data[data_type]['y_train'])
evaluate_clf(clf_base, pipelines_data[data_type]['X_test'], y_test, classes=le.classes_)

### No Style RF/XGB

In [None]:
data_type = 'bow'
bow_rf = rf_finetuning(data_type)
evaluate_clf(bow_rf, pipelines_data[data_type]['X_test'], y_test, classes=le.classes_)

In [None]:
data_type = 'bow'
bow_xgb = xgb_finetuning(data_type)
evaluate_clf(bow_xgb, pipelines_data[data_type]['X_test'], y_test, classes=le.classes_)

### Style SVM

In [None]:
data_type = 'style_bow'
clf_base = LinearSVC(max_iter=10 ** 5)
clf_base.fit(pipelines_data[data_type]['X_train'], pipelines_data[data_type]['y_train'])
evaluate_clf(clf_base, pipelines_data[data_type]['X_test'], y_test, classes=le.classes_)

### Style + RF/XGB

In [None]:
data_type = 'style_bow'
style_rf = rf_finetuning(data_type)
evaluate_clf(style_rf, pipelines_data[data_type]['X_test'], y_test, classes=le.classes_)

In [None]:
data_type = 'style_bow'
style_xgb = xgb_finetuning(data_type)
evaluate_clf(style_xgb, pipelines_data[data_type]['X_test'], y_test, classes=le.classes_)

## Cross validated comparison

In [31]:
from typing import List, Callable
from sklearn.model_selection import KFold, StratifiedKFold, GridSearchCV, RandomizedSearchCV, cross_val_score

class EstimatorOpt:
    def __init__(self, estimator, hyper_param_cv_callable, pipe_train_callable_X, pipe_train_callable_y):
        self.hyper_cv = hyper_param_cv_callable
        self.estimator = estimator
        self.get_X = pipe_train_callable_X
        self.get_y = pipe_train_callable_y
        
    def inner(self, inner_cv):
        return self.hyper_cv(estimator=self.estimator, cv=inner_cv)
    
    def nested(self, data_pipe, inner_cv, outer_cv):
        X = self.get_X(data_pipe)
        y = self.get_y(data_pipe)
        clf = self.inner(inner_cv)
        score = cross_val_score(clf, X=X, y=y, cv=outer_cv)
        return self.estimator, score

def k_fold_nested_model_comparison(estimator_opts: List[EstimatorOpt], split_strategy: Callable, data_pipe):
    inner_cv = split_strategy()
    outer_cv = split_strategy()
    results = [estimator_opt.nested(data_pipe, outer_cv, inner_cv) for estimator_opt in estimator_opts]
    return results

def optuna_search_callable(param_distr, trials):
    return lambda estimator, cv: optuna.integration.OptunaSearchCV(estimator=estimator, 
                                                                   param_distributions=param_distr, 
                                                                   cv=cv,
                                                                   n_trials=trials,
                                                                   random_state=42
                                                                  )

def stratified_k_fold_strategy(n_splits, shuffle, random_state):
    return lambda: StratifiedKFold(n_splits=n_splits, shuffle=shuffle, random_state=random_state)

def obtain_x(data_type):
    return lambda data: data[data_type]['X_train']

def obtain_y(data_type):
    return lambda data: data[data_type]['y_train']

In [None]:
rf_params = {
    'max_depth': optuna.distributions.IntDistribution(2, 20, log=True),
    'n_estimators': optuna.distributions.IntDistribution(10, 1000)
}

rf_bow = RandomForestClassifier()
bow_search_rf =  optuna_search_callable(rf_params, trials=100)
rf_bow_opt = EstimatorOpt(rf_bow, bow_search_rf, obtain_x('bow'), obtain_y('bow'))

stratified_kf = stratified_k_fold_strategy(n_splits=5, shuffle=True, random_state=42)

estimator_opts = [
    rf_bow_opt
]

k_fold_nested_model_comparison(estimator_opts, stratified_kf, pipelines_data)

  return lambda estimator, cv: optuna.integration.OptunaSearchCV(estimator=estimator,
  new_object = klass(**new_object_params)
