In [1]:
import pandas as pd

# Columns loaded from the raw data set. 'user__nr_of_retweets' and
# 'user__friends_count' are only needed to derive the ratio feature below.
columns_to_use = [
    'user__name',
    'tweet__retweet_count',
    'user__tweets_per_day',
    'tweet__fake',
    'user__nr_of_retweets',
    'user__friends_count',
    'tweet__nr_of_punctuations',
]

# One pipeline: load -> derive feature -> de-duplicate -> drop incomplete rows.
df = (
    # Only the selected columns are read; malformed lines are skipped.
    pd.read_csv(
        'data_set_tweet_user_features.csv',
        usecols=columns_to_use,
        delimiter=';',
        on_bad_lines='skip',
    )
    # Retweets per friend; the +1 in the denominator avoids dividing by zero
    # for users without friends.
    .assign(
        avg_retweets_per_friend=lambda frame: (
            frame['user__nr_of_retweets'] / (frame['user__friends_count'] + 1)
        )
    )
    .drop_duplicates()
    .dropna()
)

# Persist the cleaned frame (without the index) for the modelling step.
df.to_csv('processed_data.csv', index=False)

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, roc_curve, auc
from scipy.sparse import hstack

# Feature/target column names, defined once so the train and test matrices
# are always assembled from exactly the same columns.
TEXT_FEATURE = 'user__name'
NUMERIC_FEATURES = [
    'tweet__retweet_count',
    'user__tweets_per_day',
    'avg_retweets_per_friend',
    'tweet__nr_of_punctuations',
]
TARGET = 'tweet__fake'

# Single seed shared by every stochastic step so a re-run reproduces the results.
SEED = 42

# Reads the data from the processed file.
# The username is read as str explicitly: without it, numeric-looking names can
# be parsed as numbers, which TfidfVectorizer cannot tokenize.
df = pd.read_csv('processed_data.csv', dtype={TEXT_FEATURE: str})

# Split into feature matrix X and target vector y (is tweet fake).
# We use the numeric features and the username for text vectorization.
X = df[[TEXT_FEATURE] + NUMERIC_FEATURES]
y = df[TARGET]

# Divide into training and test sets. stratify=y keeps the fake/real ratio the
# same in both parts, which matters when the classes are imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=SEED, stratify=y
)

# Vectorize the username text using TF-IDF. The vocabulary is fitted on the
# training split only and then re-used for the test split.
vectorizer = TfidfVectorizer(max_features=5000)
X_train_text = vectorizer.fit_transform(X_train[TEXT_FEATURE])
X_test_text = vectorizer.transform(X_test[TEXT_FEATURE])

# Extract the numeric features as arrays
X_train_numeric = X_train[NUMERIC_FEATURES].values
X_test_numeric = X_test[NUMERIC_FEATURES].values

# Combine the text vectors and numeric features into one sparse matrix.
# hstack returns COO format, which does not support row indexing; CSR does,
# and row indexing is what the cross-validation splits rely on.
X_train_combined = hstack([X_train_text, X_train_numeric]).tocsr()
X_test_combined = hstack([X_test_text, X_test_numeric]).tocsr()

# Logistic Regression wrapped in a 5-fold grid search.
# NOTE: the grid holds a single candidate, so no tuning happens yet; add more
# values for 'C' to search over them.
lr = LogisticRegression(max_iter=2000, class_weight='balanced')
params_lr = {'C': [1]}
grid_lr = GridSearchCV(lr, params_lr, cv=5)
grid_lr.fit(X_train_combined, y_train)
y_pred_lr = grid_lr.predict(X_test_combined)

# Random Forest wrapped in a 5-fold grid search (also a single candidate).
# random_state fixes the bootstrap samples and feature sub-sampling so the
# reported metrics do not change from run to run.
rf = RandomForestClassifier(class_weight='balanced', random_state=SEED)
params_rf = {'n_estimators': [100], 'max_depth': [None]}
grid_rf = GridSearchCV(rf, params_rf, cv=5)
grid_rf.fit(X_train_combined, y_train)
y_pred_rf = grid_rf.predict(X_test_combined)

# Evaluate and print classification reports. zero_division=0 is passed to both
# so a class that is never predicted is reported as 0 instead of a warning.
print("Logistic Regression:")
print(classification_report(y_test, y_pred_lr, zero_division=0))
print("Random Forest:")
print(classification_report(y_test, y_pred_rf, zero_division=0))

# Helper that adds one model's ROC curve to the current figure
def plot_roc(model, X_test_vec, y_test, label):
    """Plot the ROC curve of a fitted classifier on the current matplotlib axes.

    Parameters:
        model: fitted estimator exposing ``predict_proba``.
        X_test_vec: feature matrix passed to ``model.predict_proba``.
        y_test: true labels matching the rows of ``X_test_vec``.
        label: legend prefix; the AUC (two decimals) is appended to it.

    Returns:
        None. The curve is drawn as a side effect via ``plt.plot``.
    """
    # Column 1 of predict_proba is used as the score for the ROC curve.
    positive_scores = model.predict_proba(X_test_vec)[:, 1]
    false_pos_rate, true_pos_rate, _thresholds = roc_curve(y_test, positive_scores)
    area_under_curve = auc(false_pos_rate, true_pos_rate)
    legend_text = f'{label} (AUC = {area_under_curve:.2f})'
    plt.plot(false_pos_rate, true_pos_rate, label=legend_text)

# Draw the ROC curve of both fitted models on one figure
plt.figure()
for fitted_search, legend_name in (
    (grid_lr, 'Logistic Regression'),
    (grid_rf, 'Random Forest'),
):
    plot_roc(fitted_search, X_test_combined, y_test, legend_name)

# Dashed black diagonal from (0, 0) to (1, 1) as a visual reference line
plt.plot([0, 1], [0, 1], 'k--')
plt.title('ROC Curve')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()
plt.show()

# Print how many rows fall in each target class, to see the fake vs real ratio
class_counts = df['tweet__fake'].value_counts()
print(class_counts)