In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from nltk.stem import SnowballStemmer
from nltk.tokenize import word_tokenize
from bs4 import BeautifulSoup
from nltk import pos_tag
from nltk.corpus import wordnet
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, roc_auc_score, confusion_matrix#sadness (0), 
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV

In [2]:
reddit1 = pd.read_csv(filepath_or_buffer='Reddit_Combi.csv', sep=';', usecols=['Body_Title', 'label'])
reddit2 = pd.read_csv(filepath_or_buffer='Reddit_Title.csv', sep=';', usecols=['title', 'label'])
twitter1 = pd.read_csv(filepath_or_buffer='Twitter_ Non-Advert-Tabelle 1.csv', sep=';', usecols=['text', 'label'])
twitter2 = pd.read_csv(filepath_or_buffer='Twitter_Full.csv', sep=';', usecols=['text', 'labels'])

In [3]:
# Rename columns to ensure consistency
reddit1.rename(columns={'Body_Title': 'text'}, inplace=True)
reddit2.rename(columns={'title': 'text'}, inplace=True)
#twitter1.rename(columns={'label': 'labels'}, inplace=True)  # Rename to match column name in twitter2
twitter2.rename(columns={'labels': 'label'}, inplace=True)  # Rename to match column name in twitter1


In [4]:
reddit1.shape

(3123, 2)

In [5]:
reddit2.shape

(5556, 2)

In [6]:
twitter1.shape

(2051, 2)

In [7]:
twitter2.shape

(8900, 2)

In [8]:
# Concatenate the dataframes
merged_df = pd.concat([reddit1[['text', 'label']], reddit2[['text', 'label']], twitter1[['text', 'label']], twitter2[['text', 'label']]], ignore_index=True)


In [9]:
merged_df.shape

(19630, 2)

In [10]:
merged_df['label'].value_counts()

1    11292
0     8338
Name: label, dtype: int64

In [11]:
merged_df.isna().sum()

text     3
label    0
dtype: int64

In [12]:
merged_df = merged_df.dropna()

In [13]:
merged_df.isna().sum()

text     0
label    0
dtype: int64

In [14]:
print(merged_df['text'].dtype)

object


###### Seperating text from label

In [15]:
X = merged_df.drop(columns = 'label', axis = 1)
Y = merged_df['label']

In [16]:
Y.shape

(19627,)

In [17]:
print(X.dtypes)

text    object
dtype: object


###### Data Preprocessing

In [18]:
contractions = {
    "ain't": "is not",
    "aren't": "are not",
    "can't": "cannot",
    "could've": "could have",
    "couldn't": "could not",
    "didn't": "did not",
    "doesn't": "does not",
    "don't": "do not",
    "hadn't": "had not",
    "hasn't": "has not",
    "haven't": "have not",
    "he'd": "he would",
    "he'll": "he will",
    "he's": "he is",
    "how'd": "how did",
    "how'll": "how will",
    "how's": "how is",
    "I'd": "I would",
    "I'll": "I will",
    "I'm": "I am",
    "I've": "I have",
    "isn't": "is not",
    "it'd": "it would",
    "it'll": "it will",
    "it's": "it is",
    "let's": "let us",
    "ma'am": "madam",
    "might've": "might have",
    "mightn't": "might not",
    "must've": "must have",
    "mustn't": "must not",
    "needn't": "need not",
    "shan't": "shall not",
    "she'd": "she would",
    "she'll": "she will",
    "she's": "she is",
    "should've": "should have",
    "shouldn't": "should not",
    "that'd": "that would",
    "that's": "that is",
    "there'd": "there had",
    "there's": "there is",
    "they'd": "they would",
    "they'll": "they will",
    "they're": "they are",
    "they've": "they have",
    "wasn't": "was not",
    "we'd": "we would",
    "we'll": "we will",
    "we're": "we are",
    "we've": "we have",
    "weren't": "were not",
    "what'll": "what will",
    "what're": "what are",
    "what's": "what is",
    "what've": "what have",
    "when's": "when is",
    "where'd": "where did",
    "where's": "where is",
    "who'll": "who will",
    "who's": "who is",
    "won't": "will not",
    "wouldn't": "would not",
    "you'd": "you would",
    "you'll": "you will",
    "you're": "you are",
    "you've": "you have"
}

def expand_contractions(text):
    for contraction, expansion in contractions.items():
        text = text.replace(contraction, expansion)
    return text

In [19]:

def preprocessing(text):
    # Remove HTML tags
    text = BeautifulSoup(text, "html.parser").get_text()

    # Expand contractions (if needed)
    text = expand_contractions(text)

    # Remove special characters
    text = re.sub(r'[^a-zA-Z\s]', '', text)

    # Normalize text to lowercase
    text = text.lower()

    # Remove URLs
    text = re.sub(r'http\S+', '', text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Remove extra whitespaces
    text = re.sub(r'\s+', ' ', text).strip()

    return text

def stemming(text):
    text = preprocessing(text)

    snowball_stem = SnowballStemmer(language='english')
    stop_words = set(stopwords.words('english'))
    
    # Tokenize content
    tokens = word_tokenize(text)
    
    # Perform POS tagging
    tagged_tokens = pos_tag(tokens)
    
    # Filter tokens to include only nouns and adjectives
    filtered_tokens = [word for word, pos in tagged_tokens if pos.startswith('NN') or pos.startswith('JJ')]
    
    # Perform stemming
    stemmed_text = [snowball_stem.stem(word) for word in filtered_tokens if word not in stop_words]
    stemmed_text = ' '.join(stemmed_text)

    return stemmed_text

In [20]:
X['text'] = X['text'].apply(stemming)



In [21]:
print(X.dtypes)

text    object
dtype: object


In [22]:
X = X.values.tolist()
Y = Y

In [23]:
X

[['envi im developingcountri indonesia temporari work year contract hard labor job stress next year contract finish stay job social life depress wors develop countri countri good anyth equal currenc exchang year big compani jakartaour capit citi equal amount money year yes right money money im rich person young age societi sick gap un equal beetwen vs countri poor vs rich big right evil person thisi wish war world everyon chanc equal poor chanc better life world order stay way'],
 ['nothin ordinari paradis job stress life stress f bomb call hand skin tone point um hello mani im sure today im misanthrop kinda iron caus support other way construct specialist asskiss im everyon one im better everyon theyr lot better loos screw man im much compani vehicl hard hard im im charg lot peopl project worki way trade actual work busi im sure imadd nobodi fault peopl crab bucket painter old guy park lot ive state month new state licens forward much car need repair motorcycl ride im live hous mine i

In [28]:
for item in X:
    print(type(item))

<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'list'>
<class 'li

###### Vectorizing data

In [22]:
vectorizer = TfidfVectorizer()
vectorizer.fit(X)
X = vectorizer.transform(X)

In [23]:
X.shape

(19627, 21921)

In [24]:
Y.shape

(19627,)

###### Train Test split

In [25]:
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = .2, stratify = Y)
print(X.shape, X_train.shape, X_test.shape, Y_train.shape, Y_test.shape)

(19627, 21921) (15701, 21921) (3926, 21921) (15701,) (3926,)


###### Class weights to handle class imbalance

In [24]:
from sklearn.utils.class_weight import compute_class_weight

# Compute class weights
class_weights = compute_class_weight(class_weight = 'balanced', classes = np.unique(Y), y = Y)
for class_label, weight in zip(np.unique(Y), class_weights):
    print(f"Class {class_label}: Weight {weight}")

Class 0: Weight 1.1771020750869616
Class 1: Weight 0.8692205491585474


###### Random Forest Classifier - computationally expensive

In [19]:
'''# Define the parameter grid for hyperparameter tuning
param_grid = {
    'n_estimators': [300, 500, 1000],  # Number of trees in the forest
    'max_features': ['auto', 'sqrt'],  # Number of features to consider at every split
    'max_depth': [10, 20, 30, 40, 50, None],  # Maximum number of levels in tree
    'min_samples_split': [2, 5, 10],  # Minimum number of samples required to split a node
    'min_samples_leaf': [1, 2, 4],  # Minimum number of samples required at each leaf node
    'bootstrap': [True, False]  # Method of selecting samples for training each tree
}

# Create a RandomForestClassifier object
rf_classifier = RandomForestClassifier(class_weight='balanced')

# Perform RandomizedSearchCV
random_search = RandomizedSearchCV(estimator=rf_classifier, param_distributions=param_grid, n_iter=30, cv=5, verbose=2, n_jobs=-1)

# Fit the random search model
random_search.fit(X_train, Y_train)

# Print the best parameters found
print("Best Parameters:", random_search.best_params_)

# Get the best model
best_rf_classifier = random_search.best_estimator_

# Perform GridSearchCV for fine tuning
grid_search = GridSearchCV(estimator=best_rf_classifier, param_grid=param_grid, cv=5, n_jobs=-1, verbose=2)

# Fit the grid search model
grid_search.fit(X_train, Y_train)

# Print the best parameters found
print("Best Parameters after GridSearchCV:", grid_search.best_params_)

# Get the best model after grid search
best_rf_classifier = grid_search.best_estimator_

# Evaluate the best model on the test set
test_accuracy = best_rf_classifier.score(X_test, Y_test)
print("Test Accuracy:", test_accuracy)'''

'# Define the parameter grid for hyperparameter tuning\nparam_grid = {\n    \'n_estimators\': [300, 500, 1000],  # Number of trees in the forest\n    \'max_features\': [\'auto\', \'sqrt\'],  # Number of features to consider at every split\n    \'max_depth\': [10, 20, 30, 40, 50, None],  # Maximum number of levels in tree\n    \'min_samples_split\': [2, 5, 10],  # Minimum number of samples required to split a node\n    \'min_samples_leaf\': [1, 2, 4],  # Minimum number of samples required at each leaf node\n    \'bootstrap\': [True, False]  # Method of selecting samples for training each tree\n}\n\n# Create a RandomForestClassifier object\nrf_classifier = RandomForestClassifier(class_weight=\'balanced\')\n\n# Perform RandomizedSearchCV\nrandom_search = RandomizedSearchCV(estimator=rf_classifier, param_distributions=param_grid, n_iter=30, cv=5, verbose=2, n_jobs=-1)\n\n# Fit the random search model\nrandom_search.fit(X_train, Y_train)\n\n# Print the best parameters found\nprint("Best Par

###### Model Eval on random forest

In [47]:
y_pred = rf_classifier.predict(X_test)

# Calculate evaluation metrics
accuracy = accuracy_score(Y_test, y_pred)
precision = precision_score(Y_test, y_pred)
recall = recall_score(Y_test, y_pred)
f1 = f1_score(Y_test, y_pred)
roc_auc = roc_auc_score(Y_test, rf_classifier.predict_proba(X_test)[:, 1])
conf_matrix = confusion_matrix(Y_test, y_pred)

# Print the results
print("Accuracy:", accuracy)
print("Precision:", precision)
print("Recall:", recall)
print("F1 Score:", f1)
print("ROC AUC Score:", roc_auc)
print("Confusion Matrix:")
print(conf_matrix)

Accuracy: 0.8706062149770759
Precision: 0.8600823045267489
Recall: 0.9255978742249779
F1 Score: 0.8916382252559726
ROC AUC Score: 0.9364298640803921
Confusion Matrix:
[[1328  340]
 [ 168 2090]]


###### XGBoost, Logistic Regression, SVM

In [49]:
from sklearn.utils.class_weight import compute_class_weight
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.linear_model import LogisticRegression
from xgboost import XGBClassifier
from sklearn import svm

# Compute class weights
class_weights = compute_class_weight(class_weight='balanced', classes=np.unique(Y_train), y=Y_train)

# Define parameter grids for randomized search (coarse search)
logistic_param_grid_coarse = {
    'penalty': ['l1', 'l2'],
    'C': [0.01, 0.1, 1, 10, 100],
    'solver': ['liblinear', 'saga']
}

xgboost_param_grid_coarse = {
    'learning_rate': [0.01, 0.1, 0.2],
    'n_estimators': [50, 100, 200],
    'max_depth': [3, 5, 7],
    'min_child_weight': [1, 3, 5],
    'subsample': [0.5, 0.8, 1.0],
    'colsample_bytree': [0.5, 0.8, 1.0]
}

svm_param_grid_coarse = {
    'C': [0.1, 1, 10],
    'gamma': [0.01, 0.1, 1],
    'kernel': ['linear', 'rbf']
}

# RandomizedSearchCV for each model (coarse search)
logistic_random_search_coarse = RandomizedSearchCV(LogisticRegression(class_weight=dict(enumerate(class_weights))), logistic_param_grid_coarse, n_iter=30, cv=5, n_jobs=-1)
xgboost_random_search_coarse = RandomizedSearchCV(XGBClassifier(objective='binary:logistic', scale_pos_weight=np.sqrt(class_weights[0] / class_weights[1])), xgboost_param_grid_coarse, n_iter=30, cv=5, n_jobs=-1)
svm_random_search_coarse = RandomizedSearchCV(svm.SVC(class_weight=dict(enumerate(class_weights))), svm_param_grid_coarse, n_iter=30, cv=5, n_jobs=-1)

# Fit models using RandomizedSearchCV (coarse search)
logistic_random_search_coarse.fit(X_train, Y_train)
xgboost_random_search_coarse.fit(X_train, Y_train)
svm_random_search_coarse.fit(X_train, Y_train)

# Get best hyperparameters from RandomizedSearchCV (coarse search)
best_logistic_params_coarse = logistic_random_search_coarse.best_params_
best_xgboost_params_coarse = xgboost_random_search_coarse.best_params_
best_svm_params_coarse = svm_random_search_coarse.best_params_

# Define parameter grids for GridSearchCV (fine search)
logistic_param_grid_fine = {
    'penalty': [best_logistic_params_coarse['penalty']],
    'C': [best_logistic_params_coarse['C'] * i for i in [0.1, 1, 10]],
    'solver': [best_logistic_params_coarse['solver']]
}

xgboost_param_grid_fine = {
    'learning_rate': [best_xgboost_params_coarse['learning_rate'] * i for i in [0.5, 1, 2]],
    'n_estimators': [best_xgboost_params_coarse['n_estimators']],
    'max_depth': [best_xgboost_params_coarse['max_depth']],
    'min_child_weight': [best_xgboost_params_coarse['min_child_weight']],
    'subsample': [best_xgboost_params_coarse['subsample']],
    'colsample_bytree': [best_xgboost_params_coarse['colsample_bytree']]
}

svm_param_grid_fine = {
    'C': [best_svm_params_coarse['C'] * i for i in [0.1, 1, 10]],
    'gamma': [best_svm_params_coarse['gamma'] * i for i in [0.1, 1, 10]],
    'kernel': [best_svm_params_coarse['kernel']]
}

# GridSearchCV for each model (fine search)
logistic_grid_search_fine = GridSearchCV(LogisticRegression(class_weight=dict(enumerate(class_weights))), param_grid=logistic_param_grid_fine, cv=5, n_jobs=-1)
xgboost_grid_search_fine = GridSearchCV(XGBClassifier(objective='binary:logistic', scale_pos_weight=np.sqrt(class_weights[0] / class_weights[1])), param_grid=xgboost_param_grid_fine, cv=5, n_jobs=-1)
svm_grid_search_fine = GridSearchCV(svm.SVC(class_weight=dict(enumerate(class_weights))), param_grid=svm_param_grid_fine, cv=5, n_jobs=-1)

# Fit models using GridSearchCV (fine search)
logistic_grid_search_fine.fit(X_train, Y_train)
xgboost_grid_search_fine.fit(X_train, Y_train)
svm_grid_search_fine.fit(X_train, Y_train)

# Print best hyperparameters from GridSearchCV (fine search)
print("Logistic Regression Best Parameters (Fine Search):", logistic_grid_search_fine.best_params_)
print("XGBoost Best Parameters (Fine Search):", xgboost_grid_search_fine.best_params_)
print("SVM Best Parameters (Fine Search):", svm_grid_search_fine.best_params_)

# Compare cross-validated scores of each model
logistic_cv_score_fine = logistic_grid_search_fine.best_score_
xgboost_cv_score_fine = xgboost_grid_search_fine.best_score_
svm_cv_score_fine = svm_grid_search_fine.best_score_

# Select the best model based on cross-validated scores
best_model_fine = None
if logistic_cv_score_fine >= xgboost_cv_score_fine and logistic_cv_score_fine >= svm_cv_score_fine:
    best_model_fine = logistic_grid_search_fine.best_estimator_
elif xgboost_cv_score_fine >= logistic_cv_score_fine and xgboost_cv_score_fine >= svm_cv_score_fine:
    best_model_fine = xgboost_grid_search_fine.best_estimator_
else:
    best_model_fine = svm_grid_search_fine.best_estimator_

# Evaluate the best model on the test set
train_accuracy_fine = best_model_fine.score(X_train, Y_train)
print("Best Model Train Accuracy (Fine Search):", train_accuracy_fine)
test_accuracy_fine = best_model_fine.score(X_test, Y_test)
print("Best Model Test Accuracy (Fine Search):", test_accuracy_fine)




Logistic Regression Best Parameters (Fine Search): {'C': 10, 'penalty': 'l2', 'solver': 'saga'}
XGBoost Best Parameters (Fine Search): {'colsample_bytree': 0.8, 'learning_rate': 0.4, 'max_depth': 7, 'min_child_weight': 1, 'n_estimators': 100, 'subsample': 0.8}
SVM Best Parameters (Fine Search): {'C': 10, 'gamma': 1, 'kernel': 'rbf'}
Best Model Train Accuracy (Fine Search): 0.9978345328323037
Best Model Test Accuracy (Fine Search): 0.8830871115639327


###### BERT

In [18]:
import tensorflow as tf
from transformers import BertTokenizer, TFBertForSequenceClassification
from sklearn.model_selection import train_test_split

# Split the dataset into input texts (X) and labels (Y)
X = merged_df['text']
Y = merged_df['label']



print("Data type of labels (Y):", type(Y))
print("Shape of labels (Y):", Y.shape)
print("Labels (Y):", Y)

# Train-test split
train_texts, val_texts, train_labels, val_labels = train_test_split(X, Y, test_size=0.2, random_state=42)



# Load pre-trained BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


# Tokenize input texts
train_encodings = tokenizer(train_texts.tolist(), padding=True, truncation=True, max_length=128, return_tensors='tf')
val_encodings = tokenizer(val_texts.tolist(), padding=True, truncation=True, max_length=128, return_tensors='tf')

# Load pre-trained BERT model
model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased')

# Compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=3e-5)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
metric = tf.keras.metrics.SparseCategoricalAccuracy('accuracy')
model.compile(optimizer=optimizer, loss=loss, metrics=[metric])

# Train the model
history = model.fit(
    [train_encodings.input_ids, train_encodings.attention_mask], train_labels,
    validation_data=([val_encodings.input_ids, val_encodings.attention_mask], val_labels),
    epochs=3, batch_size=32)

# Evaluate the model
test_loss, test_accuracy = model.evaluate([val_encodings.input_ids, val_encodings.attention_mask], val_labels)
print("Test Loss:", test_loss)
print("Test Accuracy:", test_accuracy)

Data type of labels (Y): <class 'pandas.core.series.Series'>
Shape of labels (Y): (19627,)
Labels (Y): 0        1
1        1
2        1
3        0
4        1
        ..
19625    0
19626    1
19627    1
19628    1
19629    1
Name: label, Length: 19627, dtype: int64


model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]




To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Epoch 1/3
Cause: for/else statement not yet supported
Cause: for/else statement not yet supported


Epoch 2/3
Epoch 3/3
Test Loss: 0.2601657509803772
Test Accuracy: 0.9258787631988525
