In [14]:
import nltk
import pandas as pd
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
# from xgboost import XGBRFClassifier

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score

# nltk.download("stopwords")

In [2]:
df = pd.read_csv('../Dataset/Preprocessed_Data.csv')

In [3]:
X_train, X_test, y_train, y_test = train_test_split(df['Tweet'], df['stance'], test_size=0.2, random_state=42)

In [4]:
# Convert tweets to vectors using TF-IDF
stop_words = stopwords.words("english")
vectorizer = TfidfVectorizer(stop_words = stop_words, max_features = 2000)

In [5]:
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

In [6]:
print(X_train_vectorized.shape, X_test_vectorized.shape)

(23239, 2000) (5810, 2000)


In [7]:
params = {'C' : [1, 10], 
          'kernel' : ['linear', 'poly', 'rbf'], 
          'gamma' : ['scale', 'auto'], 
          }


grid_search = GridSearchCV(SVC(), param_grid = params, cv=3, n_jobs = -1, verbose = 3, refit = True)

In [8]:
grid_search.fit(X_train_vectorized, y_train)

Fitting 3 folds for each of 12 candidates, totalling 36 fits
[CV 1/3] END ...C=1, gamma=scale, kernel=linear;, score=0.671 total time=  19.2s
[CV 2/3] END ...C=1, gamma=scale, kernel=linear;, score=0.676 total time=  19.2s
[CV 3/3] END ...C=1, gamma=scale, kernel=linear;, score=0.677 total time=  19.2s
[CV 1/3] END .....C=1, gamma=scale, kernel=poly;, score=0.663 total time=  53.7s
[CV 2/3] END .....C=1, gamma=scale, kernel=poly;, score=0.670 total time=  53.8s
[CV 3/3] END .....C=1, gamma=scale, kernel=poly;, score=0.675 total time=  52.4s
[CV 1/3] END ......C=1, gamma=scale, kernel=rbf;, score=0.688 total time=  26.4s
[CV 2/3] END ......C=1, gamma=scale, kernel=rbf;, score=0.695 total time=  26.0s
[CV 3/3] END ......C=1, gamma=scale, kernel=rbf;, score=0.693 total time=  26.1s
[CV 1/3] END ....C=1, gamma=auto, kernel=linear;, score=0.671 total time=  19.4s
[CV 2/3] END ....C=1, gamma=auto, kernel=linear;, score=0.676 total time=  19.4s
[CV 3/3] END ....C=1, gamma=auto, kernel=linear;

GridSearchCV(cv=3, estimator=SVC(),
             param_grid={'C': [1, 10], 'gamma': ['scale', 'auto'],
                         'kernel': ['linear', 'poly', 'rbf']},
             verbose=3)

In [9]:
svm_cf = grid_search.best_estimator_
y_pred = svm_cf.predict(X_test_vectorized)

# Accuracy score
print("Best hyperparameters:", grid_search.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred))

Best hyperparameters: {'C': 1, 'gamma': 'scale', 'kernel': 'rbf'}
Accuracy: 0.7074010327022375


In [11]:
param_grid = {
    'C': [1, 5, 10],
    'penalty': ['l1', 'l2']
}

logistic_Clf = LogisticRegression(max_iter=1000000, solver='liblinear')
grid_search_lrclf = GridSearchCV(logistic_Clf, param_grid, cv=5, n_jobs=-1, verbose = 3, refit = True)

In [12]:
grid_search_lrclf.fit(X_train_vectorized, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


GridSearchCV(cv=5,
             estimator=LogisticRegression(max_iter=1000000, solver='liblinear'),
             n_jobs=-1, param_grid={'C': [1, 5, 10], 'penalty': ['l1', 'l2']},
             verbose=3)

In [13]:
lrcf = grid_search_lrclf.best_estimator_
y_pred = lrcf.predict(X_test_vectorized)

# Accuracy score
print("Best hyperparameters:", grid_search_lrclf.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred))

Best hyperparameters: {'C': 1, 'penalty': 'l1'}
Accuracy: 0.6920826161790017


In [15]:
param_grid = {
    'n_estimators': [300, 400],
    'max_depth': [None, 20, 30, 40]
}

rf_clf = RandomForestClassifier(random_state=42)
grid_search_rfclf = GridSearchCV(rf_clf, param_grid, cv=5, n_jobs = -1, verbose = 3, refit = True)

In [16]:
grid_search_rfclf.fit(X_train_vectorized, y_train)

Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 2/5] END ...................C=1, penalty=l2;, score=0.688 total time=   0.2s
[CV 1/5] END ...................C=5, penalty=l1;, score=0.665 total time=   0.8s
[CV 4/5] END ..................C=10, penalty=l1;, score=0.679 total time=   0.8s
[CV 3/5] END ..max_depth=None, n_estimators=300;, score=0.640 total time=  53.2s




[CV 5/5] END ...................C=1, penalty=l1;, score=0.695 total time=   0.4s
[CV 5/5] END ...................C=5, penalty=l1;, score=0.695 total time=   0.8s
[CV 3/5] END ..................C=10, penalty=l2;, score=0.678 total time=   0.3s
[CV 1/5] END ..max_depth=None, n_estimators=300;, score=0.642 total time=  53.5s
[CV 3/5] END ...................C=1, penalty=l2;, score=0.685 total time=   0.2s
[CV 1/5] END ...................C=5, penalty=l2;, score=0.669 total time=   0.3s
[CV 4/5] END ...................C=5, penalty=l2;, score=0.684 total time=   0.2s
[CV 2/5] END ..................C=10, penalty=l1;, score=0.680 total time=   0.9s
[CV 5/5] END ..max_depth=None, n_estimators=300;, score=0.657 total time=  53.6s
[CV 4/5] END ...................C=1, penalty=l1;, score=0.687 total time=   0.4s
[CV 5/5] END ...................C=1, penalty=l2;, score=0.694 total time=   0.2s
[CV 3/5] END ...................C=5, penalty=l2;, score=0.682 total time=   0.3s
[CV 1/5] END ...............

GridSearchCV(cv=5, estimator=RandomForestClassifier(random_state=42), n_jobs=-1,
             param_grid={'max_depth': [None, 20, 30, 40],
                         'n_estimators': [300, 400]},
             verbose=3)

In [17]:
rfcf = grid_search_rfclf.best_estimator_
y_pred = rfcf.predict(X_test_vectorized)

# Accuracy score
print("Best hyperparameters:", grid_search_rfclf.best_params_)
print("Accuracy:", accuracy_score(y_test, y_pred))

Best hyperparameters: {'max_depth': None, 'n_estimators': 400}
Accuracy: 0.6641996557659208
[CV 2/5] END ....max_depth=30, n_estimators=300;, score=0.624 total time=  10.9s
[CV 3/5] END ....max_depth=30, n_estimators=400;, score=0.619 total time=  14.6s
[CV 5/5] END ....max_depth=40, n_estimators=300;, score=0.647 total time=  15.9s
[CV 2/5] END ....max_depth=20, n_estimators=300;, score=0.612 total time=   5.7s
[CV 5/5] END ....max_depth=20, n_estimators=300;, score=0.630 total time=   6.7s
[CV 3/5] END ....max_depth=20, n_estimators=400;, score=0.616 total time=   8.2s
[CV 3/5] END ....max_depth=30, n_estimators=300;, score=0.618 total time=  11.1s
[CV 5/5] END ....max_depth=30, n_estimators=400;, score=0.638 total time=  14.5s
[CV 1/5] END ....max_depth=40, n_estimators=400;, score=0.638 total time=  20.5s
[CV 3/5] END ....max_depth=20, n_estimators=300;, score=0.613 total time=   5.8s
[CV 1/5] END ....max_depth=20, n_estimators=400;, score=0.632 total time=   8.8s
[CV 4/5] END ....

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras import losses

In [None]:
max_features = 10000
sequence_length = 250

vectorize_layer = layers.TextVectorization(
    max_tokens=max_features,
    output_mode='int',
    output_sequence_length=sequence_length)

vectorize_layer.adapt(X_train)

In [None]:
embedding_dim = 16

model = tf.keras.Sequential([
  layers.Embedding(max_features + 1, embedding_dim),
  layers.Dropout(0.2),
  layers.GlobalAveragePooling1D(),
  layers.Dropout(0.2),
  layers.Dense(3)])

model.summary()

In [None]:
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

In [None]:
history = model.fit(
    X_train,
    y_train,
    batch_size=32,
    validation_split=0.1,
    epochs=10)

In [None]:
loss, accuracy = model.evaluate(X_test, y_test)
print("Loss:", loss)
print("Accuracy:", accuracy)

In [None]:
# Tokenize the text data
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X_train)
X_train = tokenizer.texts_to_sequences(X_train)
X_test = tokenizer.texts_to_sequences(X_test)

# Pad the sequences to a fixed length
max_len = 100
X_train = pad_sequences(X_train, maxlen=max_len)
X_test = pad_sequences(X_test, maxlen=max_len)

# Define the LSTM model
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index)+1, output_dim=128, input_length=max_len))
model.add(LSTM(units=64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(units=3, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

# Train the model
model.fit(X_train, y_train, batch_size=32, epochs=10, validation_split=0.1)

# Evaluate the model on testing data
loss, accuracy = model.evaluate(X_test, y_test)
print("Loss:", loss)
print("Accuracy:", accuracy)
