In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [2]:
# Load the pre-computed tweet feature table and discard the raw text/date
# columns, producing the frame in a single chained expression.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, presumably an
# index written by an earlier to_csv call — confirm whether it should be read
# with index_col=0 instead of being carried along.
df = (
    pd.read_csv("./data/tweets_features.csv", encoding='latin')
    .drop(columns=['is_after_certain_day', 'Date', 'User', 'Text', 'Time', 'Full_date'])
)

In [3]:
# Show the first five rows as a quick sanity check of the loaded frame.
df.head(n=5)

Unnamed: 0,Target,Weekday,Length,Hashtags,HasHashtags,Mentions,HasMentions,ExclamationMarks,HasExclamationMarks,Emoticons,...,w2v_90,w2v_91,w2v_92,w2v_93,w2v_94,w2v_95,w2v_96,w2v_97,w2v_98,w2v_99
0,0,3,67,0,False,1,True,0,False,[],...,0.426355,0.1056,0.0999,0.141384,0.683017,0.307975,0.141498,-0.470852,-0.14605,0.013814
1,1,3,93,0,False,1,True,0,False,[],...,0.8931,0.385402,-0.03595,-0.465414,1.386158,0.869907,0.718067,-0.696911,-0.041971,-0.19538
2,1,0,27,0,False,0,False,1,True,[],...,0.590157,0.242431,-0.339337,0.507457,1.383398,1.200003,0.258323,-0.451583,0.037492,0.057117
3,0,3,100,0,False,0,False,0,False,[],...,0.741138,0.014045,-0.037039,-0.30093,1.460556,0.542765,0.764829,-0.621368,-0.208227,-0.209104
4,0,4,52,0,False,0,False,1,True,[],...,0.621242,0.201937,-0.061574,0.177444,1.245472,0.71241,0.349455,-0.450614,0.096795,0.05602


In [4]:
# Feature subset fed to every model below.
# NOTE(review): the preview shows names such as 'HasMentions' and 'w2v_*';
# 'has_mentions' and the 'embedding_*' columns are presumably among the
# truncated columns — confirm they exist under exactly these names.
selected_columns = [
    'Compound_VADER', 'skewed_hour_dist', 'Negative_VADER',
    'Weekday', 'Polarity_TB', 'skewed_week_dist',
    'Neutral_VADER', 'embedding_166', 'embedding_22',
    'embedding_91', 'embedding_299', 'embedding_340',
    'embedding_34', 'embedding_93', 'embedding_231',
    'embedding_156', 'has_mentions', 'embedding_144',
    'w2v_2', 'embedding_189', 'embedding_33',
]

# Keep only the chosen features plus the label column.
df = df[[*selected_columns, 'Target']]

In [5]:
from sklearn.model_selection import cross_val_score


def evaluate_classifiers(X, y, classifiers, cv_folds):
    """Cross-validate every classifier and report a one-line summary for each.

    Parameters
    ----------
    X : feature matrix, forwarded unchanged to ``cross_val_score``.
    y : target labels, forwarded unchanged to ``cross_val_score``.
    classifiers : mapping of display name -> estimator.
    cv_folds : value forwarded as the ``cv`` argument of ``cross_val_score``.

    Returns
    -------
    dict
        Display name -> the per-fold scores returned by ``cross_val_score``,
        in the iteration order of ``classifiers``.

    Side effects
    ------------
    Prints one line per classifier with the mean score and twice the standard
    deviation of the fold scores, both rounded to two decimals.
    """
    fold_scores_by_name = {}
    for name in classifiers:
        scores = cross_val_score(classifiers[name], X, y, cv=cv_folds)
        print(f"{name} Accuracy: {np.mean(scores):.2f} (+/- {np.std(scores) * 2:.2f})")
        fold_scores_by_name[name] = scores

    return fold_scores_by_name

In [6]:
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler

# Split the selected frame into the feature matrix and the label vector.
X = df.drop(columns=['Target'])
y = df['Target']

# Keep boolean columns as well as numeric ones: select_dtypes(include=['number'])
# on its own silently discards bool features. Casting to float hands the scaler
# a homogeneous numeric frame.
# NOTE(review): presumably 'has_mentions' is boolean like 'HasMentions' in the
# preview — confirm. If every selected column is already numeric, this line
# yields the same features as before.
X = X.select_dtypes(include=['number', 'bool']).astype(float)

# Standardise the features; X becomes a NumPy array from here on.
# NOTE(review): the scaler is fit on all rows before cross-validation, so
# held-out fold statistics leak into training. Wrapping scaler + model in a
# Pipeline would avoid this, but later cells rely on the pre-scaled X.
scaler = StandardScaler()
X = scaler.fit_transform(X)

# Baseline models keyed by the display name used in the printed report.
# The stochastic estimators get a fixed random_state so that a
# Restart & Run All reproduces the same scores.
classifiers = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Random Forest": RandomForestClassifier(n_estimators=100, random_state=1),
    "SVM": SVC(),
    "k-Nearest Neighbors": KNeighborsClassifier(n_neighbors=3),
    "Decision Tree": DecisionTreeClassifier(random_state=1),
    "Gradient Boosting": GradientBoostingClassifier(random_state=1),
    "Naive Bayes": GaussianNB()
}


In [7]:
# Score the baseline models with 5-fold cross-validation; the per-fold scores
# are kept for later use, keyed by model name.
basic_classifiers_res = evaluate_classifiers(
    X=X,
    y=y,
    classifiers=classifiers,
    cv_folds=5,
)

Logistic Regression Accuracy: 0.82 (+/- 0.00)
Random Forest Accuracy: 0.83 (+/- 0.00)
SVM Accuracy: 0.82 (+/- 0.00)
k-Nearest Neighbors Accuracy: 0.78 (+/- 0.00)
Decision Tree Accuracy: 0.76 (+/- 0.00)
Gradient Boosting Accuracy: 0.82 (+/- 0.00)
Naive Bayes Accuracy: 0.78 (+/- 0.00)


In [8]:
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier, AdaBoostClassifier, BaggingClassifier, \
    StackingClassifier, VotingClassifier

# Base learners shared by the voting and stacking ensembles.
# Every stochastic estimator in this cell gets a fixed random_state so that
# re-running the notebook reproduces the same cross-validation scores.
estimators = [
    ('DecisionTree', DecisionTreeClassifier(random_state=1)),
    ('KNN', KNeighborsClassifier()),
    ('RandomForest', RandomForestClassifier(random_state=1))
]

# Ensemble models keyed by the display name used in the printed report.
ensemble_classifiers = {
    "Voting_Soft": VotingClassifier(estimators=estimators, voting='soft'),
    "AdaBoost": AdaBoostClassifier(n_estimators=100, random_state=1),
    "Bagging (with Logistic Regression)": BaggingClassifier(
        LogisticRegression(max_iter=1000), n_estimators=10, random_state=1
    ),
    "Stacking": StackingClassifier(estimators=estimators, final_estimator=LogisticRegression()),
    "Gradient Boosting": GradientBoostingClassifier(random_state=1)
}

# Same 5-fold protocol as the baseline models, so the two reports are comparable.
ensemble_classifiers_res = evaluate_classifiers(X, y, ensemble_classifiers, cv_folds=5)

Voting_Soft Accuracy: 0.80 (+/- 0.00)




AdaBoost Accuracy: 0.82 (+/- 0.00)
Bagging (with Decision Tree) Accuracy: 0.82 (+/- 0.00)
Stacking Accuracy: 0.83 (+/- 0.00)
Gradient Boosting Accuracy: 0.82 (+/- 0.00)


In [9]:
from xgboost import XGBClassifier
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    test_size=0.2,
)

# Depth-4 gradient-boosted trees with a small learning rate, fit on the
# training portion only.
model = XGBClassifier(
    booster='gbtree',
    max_depth=4,
    learning_rate=0.01,
    random_state=1,
).fit(X_train, y_train)

# Last expression of the cell: the model's score on the held-out rows.
model.score(X_test, y_test)

0.79505

In [10]:
import tensorflow as tf

# Seed Python, NumPy and TensorFlow so weight initialisation and batch
# shuffling are repeatable across runs.
tf.keras.utils.set_random_seed(1)

# Small fully connected network: two 64-unit ReLU layers and a 2-way softmax.
# tf.keras.Input(shape=...) replaces InputLayer(input_shape=...), whose
# `input_shape` argument is deprecated in Keras 3.
# NOTE: this rebinds `model`, replacing the XGBoost classifier fitted in the
# previous cell.
model = tf.keras.models.Sequential([
    tf.keras.Input(shape=(X.shape[1],)),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(64, activation='relu'),
    tf.keras.layers.Dense(2, activation='softmax')
])

# Sparse categorical cross-entropy takes integer class labels directly,
# which matches the 0/1 values shown for Target in the preview.
model.compile(optimizer='adam',
              loss='sparse_categorical_crossentropy',
              metrics=['accuracy'])

# NOTE(review): this trains on the full X/y rather than the X_train/y_train
# split created earlier, and validation_split presumably takes the last 20% of
# rows without shuffling — confirm the rows are not ordered by class or time.
history = model.fit(X, y, epochs=50, validation_split=0.2)

2024-04-19 15:04:28.532973: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


Epoch 1/50




[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 726us/step - accuracy: 0.8092 - loss: 0.4059 - val_accuracy: 0.8232 - val_loss: 0.3837
Epoch 2/50
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 684us/step - accuracy: 0.8201 - loss: 0.3813 - val_accuracy: 0.8235 - val_loss: 0.3819
Epoch 3/50
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 656us/step - accuracy: 0.8233 - loss: 0.3755 - val_accuracy: 0.8256 - val_loss: 0.3802
Epoch 4/50
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 657us/step - accuracy: 0.8259 - loss: 0.3678 - val_accuracy: 0.8283 - val_loss: 0.3759
Epoch 5/50
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 648us/step - accuracy: 0.8298 - loss: 0.3652 - val_accuracy: 0.8264 - val_loss: 0.3758
Epoch 6/50
[1m2500/2500[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 662us/step - accuracy: 0.8287 - loss: 0.3610 - val_accuracy: 0.8252 - val_loss: 0.3771
Epoch 7/50
[1m