In [3]:
import pandas as pd
import re

# regular expression to detect emojis
emoji_pattern = re.compile(
    "["  
    "\U0001F600-\U0001F64F"  # emoticons
    "\U0001F300-\U0001F5FF"  # symbols & pictographs
    "\U0001F680-\U0001F6FF"  # transport & map symbols
    "\U0001F700-\U0001F77F"  # alchemical symbols
    "\U0001F780-\U0001F7FF"  # geometric shapes
    "\U0001F800-\U0001F8FF"  # supplemental arrows
    "\U0001F900-\U0001F9FF"  # supplemental symbols and pictographs
    "\U0001FA00-\U0001FA6F"  # chess symbols
    "\U0001FA70-\U0001FAFF"  # symbols and pictographs
    "\U00002702-\U000027B0"  # dingbats
    "\U000024C2-\U0001F251"  # enclosed characters
    "]", flags=re.UNICODE)


# load the CSV data into a pandas DataFrame for the training set
df = pd.read_csv('training.csv')

# function to clean tweets: replace mentions, URLs, hashtags, remove multiple spaces, and lowercase
def remove_clean_tweet(tweet):
    tweet = tweet.lower()  # convert tweet to lowercase
    # remove hashtags 
    tweet = re.sub(r'#\S+', ' ', tweet)
    # remove URLs
    tweet = re.sub(r'http[s]?://\S+', ' ', tweet)  # for URLs starting with http:// or https://
    tweet = re.sub(r'www\S+', ' ', tweet)  # for URLs starting with www
    # remove mentions
    tweet = re.sub(r'@\S+', ' ', tweet)  # for mentions like @username
    # remove emojis
    tweet = re.sub(emoji_pattern, ' ', tweet)
    # remove retweets
    tweet = re.sub('rt', ' ', tweet)
    # remove extra spaces and leading/trailing spaces
    tweet = re.sub(r'\s+', ' ', tweet).strip()  # replace multiple spaces with a single space
    return tweet

# apply the cleaning function to the 'tweet' column
df['tweet'] = df['tweet'].apply(remove_clean_tweet)

# display the cleaned dataframe
print(df.head())

# save the cleaned DataFrame back to a new CSV file
df.to_csv('removed_training.csv', index=False)

# load the CSV data into a pandas DataFrame for the test set
df_test = pd.read_csv('test.csv')

# apply the cleaning function to the 'tweet' column
df_test['tweet'] = df_test['tweet'].apply(remove_clean_tweet)

# display the cleaned dataframe
print(df_test.head())

# save the cleaned DataFrame back to a new CSV file
df_test.to_csv('removed_test.csv', index=False)


                                               tweet  label  gender
0  alex is too nice for love island :( teenager c...  human  female
1  the crypto finance ecosystem by women are not ...  human  female
2  check out these awesome cooking t-shi s &amp; ...    bot     bot
3  yewwinfo tiny nanopa icles to treat a huge pro...    bot     bot
4  sr. project manager water / wastewater enginee...    bot     bot
                                               tweet  label  gender
0  her attitude was: if you're on her mom. this t...    bot     bot
1  book launch today for in cmb at 4pm followed b...  human  female
2  on thursday at 10.30 am in edinburgh, roundtab...  human  female
3  new italian law sho ens prison sentences by 3 ...  human  female
4  shit bags had ht ft 35/1 brian kerr or mark la...  human    male


In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# load data
df = pd.read_csv('removed_training.csv')

# check for and replace missing values in the 'tweet' column
df['tweet'] = df['tweet'].fillna('')  # replace NaN with empty string

# extract tweet text and labels
tweets = df['tweet']
labels = df['label']

# convert tweets to TF-IDF features
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')  # limit to top 1000 features for efficiency
X = vectorizer.fit_transform(tweets)
y = labels

# split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=39, stratify=y)

# train Gradient Boosting Classifier
clf = GradientBoostingClassifier(random_state=39)
clf.fit(X_train, y_train)

# evaluate on validation data
y_pred = clf.predict(X_val)
print("Validation Results:")
print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred))
print("Classification Report:\n", classification_report(y_val, y_pred))
print("Accuracy Score:", round(accuracy_score(y_val, y_pred), 2))

# load test set
df_test = pd.read_csv('removed_test.csv')

# check for and replace missing values in the 'tweet' column of test set
df_test['tweet'] = df_test['tweet'].fillna('')  # replace NaN with empty string

tweets_test = df_test['tweet']
labels_test = df_test['label']

# convert test tweets to TF-IDF features
X_test = vectorizer.transform(tweets_test)

# predict on test set
y_test_pred = clf.predict(X_test)

# evaluate on test data
print("\nTest Results:")
print("Confusion Matrix:\n", confusion_matrix(labels_test, y_test_pred))
print("Classification Report:\n", classification_report(labels_test, y_test_pred))
print("Accuracy Score:", round(accuracy_score(labels_test, y_test_pred), 2))

# add predictions to the test DataFrame and save
df_test['predicted_label'] = y_test_pred
df_test.to_csv('removed_prediction_test.csv', index=False)

# feature importance analysis
feature_names = vectorizer.get_feature_names_out()
feature_importances = clf.feature_importances_

# create a DataFrame for feature importances
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

# display top 10 features by importance
print("\nTop 10 Important Features:")
print(importance_df.head(10))

# save top features to a CSV file
importance_df.to_csv('top_features_removed.csv', index=False)


Validation Results:
Confusion Matrix:
 [[373  39]
 [ 15 397]]
Classification Report:
               precision    recall  f1-score   support

         bot       0.96      0.91      0.93       412
       human       0.91      0.96      0.94       412

    accuracy                           0.93       824
   macro avg       0.94      0.93      0.93       824
weighted avg       0.94      0.93      0.93       824

Accuracy Score: 0.93

Test Results:
Confusion Matrix:
 [[1111  209]
 [  85 1235]]
Classification Report:
               precision    recall  f1-score   support

         bot       0.93      0.84      0.88      1320
       human       0.86      0.94      0.89      1320

    accuracy                           0.89      2640
   macro avg       0.89      0.89      0.89      2640
weighted avg       0.89      0.89      0.89      2640

Accuracy Score: 0.89

Top 10 Important Features:
          Feature  Importance
887         today    0.185196
873        thanks    0.149339
467          ju

In [2]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# load data
df = pd.read_csv('removed_training.csv')

# check for and replace missing values in the 'tweet' column
df['tweet'] = df['tweet'].fillna('')  # replace NaN with empty string

# extract tweet text and labels
tweets = df['tweet']
labels = df['label']

# convert tweets to TF-IDF features
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')  # limit to top 1000 features for efficiency
X = vectorizer.fit_transform(tweets)
y = labels

# split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=39, stratify=y)

# train Random Forest Classifier
clf = RandomForestClassifier(random_state=39, n_estimators=100)  # use 100 trees
clf.fit(X_train, y_train)

# evaluate on validation data
y_pred = clf.predict(X_val)
print("Validation Results:")
print("Confusion Matrix:\n", confusion_matrix(y_val, y_pred))
print("Classification Report:\n", classification_report(y_val, y_pred))
print("Accuracy Score:", round(accuracy_score(y_val, y_pred), 2))

# save validation predictions
df_validation = pd.DataFrame({
    'Actual': y_val,
    'Predicted': y_pred
})
df_validation.to_csv('validation_predictions_RF.csv', index=False)

# load test set
df_test = pd.read_csv('removed_test.csv')

# check for and replace missing values in the 'tweet' column of test set
df_test['tweet'] = df_test['tweet'].fillna('')  # replace NaN with empty string

tweets_test = df_test['tweet']
labels_test = df_test['label']

# convert test tweets to TF-IDF features
X_test = vectorizer.transform(tweets_test)

# predict on test set
y_test_pred = clf.predict(X_test)

# evaluate on test data
print("\nTest Results:")
print("Confusion Matrix:\n", confusion_matrix(labels_test, y_test_pred))
print("Classification Report:\n", classification_report(labels_test, y_test_pred))
print("Accuracy Score:", round(accuracy_score(labels_test, y_test_pred), 2))

# add predictions to the test DataFrame and save
df_test['predicted_label'] = y_test_pred
df_test.to_csv('removed_prediction_test_RF.csv', index=False)

# feature Importance Analysis
feature_names = vectorizer.get_feature_names_out()
feature_importances = clf.feature_importances_

# create a DataFrame for feature importances
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': feature_importances
}).sort_values(by='Importance', ascending=False)

# display top 10 features by importance
print("\nTop 10 Important Features:")
print(importance_df.head(10))

# save top features to a CSV file
importance_df.to_csv('top_features_removed_RF.csv', index=False)


Validation Results:
Confusion Matrix:
 [[364  48]
 [ 11 401]]
Classification Report:
               precision    recall  f1-score   support

         bot       0.97      0.88      0.93       412
       human       0.89      0.97      0.93       412

    accuracy                           0.93       824
   macro avg       0.93      0.93      0.93       824
weighted avg       0.93      0.93      0.93       824

Accuracy Score: 0.93

Test Results:
Confusion Matrix:
 [[1074  246]
 [  59 1261]]
Classification Report:
               precision    recall  f1-score   support

         bot       0.95      0.81      0.88      1320
       human       0.84      0.96      0.89      1320

    accuracy                           0.88      2640
   macro avg       0.89      0.88      0.88      2640
weighted avg       0.89      0.88      0.88      2640

Accuracy Score: 0.88

Top 10 Important Features:
     Feature  Importance
467     just    0.033561
959     week    0.030327
873   thanks    0.026300
890  

In [3]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# load training data
df = pd.read_csv('removed_training.csv')

# check for and replace missing values in the 'tweet' column
df['tweet'] = df['tweet'].fillna('')  # replace NaN with empty string

# extract tweet text and labels
tweets = df['tweet']
labels = df['label']

# convert tweets to TF-IDF features
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')  # limit to top 1000 features for efficiency
X = vectorizer.fit_transform(tweets)
y = labels

# split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=39, stratify=y)

# train SVM Classifier
print("Training SVM...")
svm_clf = SVC(kernel='linear', random_state=39)  # linear kernel for interpretability
svm_clf.fit(X_train, y_train)

# evaluate on validation data
y_val_pred = svm_clf.predict(X_val)
print("Validation Results:")
print("Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))
print("Classification Report:\n", classification_report(y_val, y_val_pred))
print("Accuracy Score:", round(accuracy_score(y_val, y_val_pred), 2))

# save validation predictions
df_validation = pd.DataFrame({
    'Actual': y_val,
    'Predicted': y_val_pred
})
df_validation.to_csv('validation_predictions_SVM.csv', index=False)

# load test set
df_test = pd.read_csv('removed_test.csv')

# check for and replace missing values in the 'tweet' column of test set
df_test['tweet'] = df_test['tweet'].fillna('')  # replace NaN with empty string

tweets_test = df_test['tweet']
labels_test = df_test['label']

# convert test tweets to TF-IDF features
X_test = vectorizer.transform(tweets_test)

# predict on test set
y_test_pred = svm_clf.predict(X_test)

# evaluate on test data
print("\nTest Results:")
print("Confusion Matrix:\n", confusion_matrix(labels_test, y_test_pred))
print("Classification Report:\n", classification_report(labels_test, y_test_pred))
print("Accuracy Score:", round(accuracy_score(labels_test, y_test_pred), 2))

# add predictions to the test DataFrame and save
df_test['predicted_label'] = y_test_pred
df_test.to_csv('removed_prediction_test_SVM.csv', index=False)

# feature importance analysis
svm_weights = np.abs(svm_clf.coef_.toarray()[0])  # extract absolute weights for SVM
feature_names = vectorizer.get_feature_names_out()
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': svm_weights
}).sort_values(by='Importance', ascending=False)

# display top 10 features by importance
print("\nTop 10 Important Features:")
print(importance_df.head(10))

# save top features to a CSV file
importance_df.to_csv('top_features_removed_SVM.csv', index=False)


Training SVM...
Validation Results:
Confusion Matrix:
 [[389  23]
 [  8 404]]
Classification Report:
               precision    recall  f1-score   support

         bot       0.98      0.94      0.96       412
       human       0.95      0.98      0.96       412

    accuracy                           0.96       824
   macro avg       0.96      0.96      0.96       824
weighted avg       0.96      0.96      0.96       824

Accuracy Score: 0.96

Test Results:
Confusion Matrix:
 [[1116  204]
 [  78 1242]]
Classification Report:
               precision    recall  f1-score   support

         bot       0.93      0.85      0.89      1320
       human       0.86      0.94      0.90      1320

    accuracy                           0.89      2640
   macro avg       0.90      0.89      0.89      2640
weighted avg       0.90      0.89      0.89      2640

Accuracy Score: 0.89

Top 10 Important Features:
    Feature  Importance
451     iot    2.730901
799   small    2.339289
887   today    2.

In [4]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# load training data
df = pd.read_csv('removed_training.csv')

# check for and replace missing values in the 'tweet' column
df['tweet'] = df['tweet'].fillna('')  # replace NaN with empty string

# extract tweet text and labels
tweets = df['tweet']
labels = df['label']

# convert tweets to TF-IDF features
vectorizer = TfidfVectorizer(max_features=1000, stop_words='english')  # limit to top 1000 features for efficiency
X = vectorizer.fit_transform(tweets)
y = labels

# split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=39, stratify=y)

# train Logistic Regression Classifier
print("Training Logistic Regression...")
lr_clf = LogisticRegression(random_state=39, max_iter=1000)  # increased iterations for convergence
lr_clf.fit(X_train, y_train)

# evaluate on validation data
y_val_pred = lr_clf.predict(X_val)
print("Validation Results:")
print("Confusion Matrix:\n", confusion_matrix(y_val, y_val_pred))
print("Classification Report:\n", classification_report(y_val, y_val_pred))
print("Accuracy Score:", round(accuracy_score(y_val, y_val_pred), 2))

# save validation predictions
df_validation = pd.DataFrame({
    'Actual': y_val,
    'Predicted': y_val_pred
})
df_validation.to_csv('validation_predictions_LR.csv', index=False)

# load test set
df_test = pd.read_csv('removed_test.csv')

# check for and replace missing values in the 'tweet' column of test set
df_test['tweet'] = df_test['tweet'].fillna('')  # replace NaN with empty string

tweets_test = df_test['tweet']
labels_test = df_test['label']

# convert test tweets to TF-IDF features
X_test = vectorizer.transform(tweets_test)

# predict on test set
y_test_pred = lr_clf.predict(X_test)

# evaluate on test data
print("\nTest Results:")
print("Confusion Matrix:\n", confusion_matrix(labels_test, y_test_pred))
print("Classification Report:\n", classification_report(labels_test, y_test_pred))
print("Accuracy Score:", round(accuracy_score(labels_test, y_test_pred), 2))

# add predictions to the test DataFrame and save
df_test['predicted_label'] = y_test_pred
df_test.to_csv('removed_prediction_test_LR.csv', index=False)

# feature importance analysis
feature_names = vectorizer.get_feature_names_out()
lr_weights = abs(lr_clf.coef_[0])  # extract absolute weights for Logistic Regression
importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': lr_weights
}).sort_values(by='Importance', ascending=False)

# display top 10 features by importance
print("\nTop 10 Important Features:")
print(importance_df.head(10))

# save top features to a CSV file
importance_df.to_csv('top_features_removed_LR.csv', index=False)


Training Logistic Regression...
Validation Results:
Confusion Matrix:
 [[382  30]
 [ 11 401]]
Classification Report:
               precision    recall  f1-score   support

         bot       0.97      0.93      0.95       412
       human       0.93      0.97      0.95       412

    accuracy                           0.95       824
   macro avg       0.95      0.95      0.95       824
weighted avg       0.95      0.95      0.95       824

Accuracy Score: 0.95

Test Results:
Confusion Matrix:
 [[1127  193]
 [  81 1239]]
Classification Report:
               precision    recall  f1-score   support

         bot       0.93      0.85      0.89      1320
       human       0.87      0.94      0.90      1320

    accuracy                           0.90      2640
   macro avg       0.90      0.90      0.90      2640
weighted avg       0.90      0.90      0.90      2640

Accuracy Score: 0.9

Top 10 Important Features:
          Feature  Importance
887         today    3.264989
873        tha