## Evaluated on fake news corpus

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score

# Step 1: Read the three pre-split CSV files (train / validation / test).
# low_memory=False makes pandas parse each file in one pass instead of in chunks,
# which avoids mixed-type inference within a column.
train_df, val_df, test_df = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in ('train_set.csv', 'val_set.csv', 'test_set.csv')
)

# Step 2: Print the column index of every split as a quick schema check
for caption, frame in (
    ("Columns in train_set:", train_df),
    ("Columns in val_set:", val_df),
    ("Columns in test_set:", test_df),
):
    print(caption, frame.columns)

# Step 3: Create binary labels from the 'type' column: 1 for 'reliable', 0 for
# every other value (missing values compare unequal, so they also become 0).
# A vectorized comparison replaces the per-row Python lambda.
train_df['label'] = train_df['type'].eq('reliable').astype('int64')
val_df['label'] = val_df['type'].eq('reliable').astype('int64')
test_df['label'] = test_df['type'].eq('reliable').astype('int64')

# Step 4: Replace missing 'content' values with an empty string so that every
# row handed to the vectorizer is a string
train_df['content'] = train_df['content'].fillna('')
val_df['content'] = val_df['content'].fillna('')
test_df['content'] = test_df['content'].fillna('')

# Step 5: TF-IDF features over unigrams and bigrams, English stop words removed,
# vocabulary capped at 10,000 entries
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    max_features=10000,
)

# The vectorizer is fitted on the training text only; the validation and test
# text is then transformed with that fitted vectorizer
X_train = tfidf_vectorizer.fit_transform(train_df['content'])
X_val, X_test = (
    tfidf_vectorizer.transform(split['content']) for split in (val_df, test_df)
)

# Target vectors, one per split
y_train, y_val, y_test = (split['label'] for split in (train_df, val_df, test_df))

# Step 6: Train an SVM classifier
# LinearSVC is efficient for sparse data like text. random_state fixes the data
# shuffling used by the dual solver so that repeated runs produce the same model
# (it has no effect when the primal solver is used).
svm_model = LinearSVC(C=1.0, max_iter=1000, random_state=42)
svm_model.fit(X_train, y_train)

# Step 7: Predict on both held-out feature matrices with the fitted SVM
y_val_pred = svm_model.predict(X_val)
y_test_pred = svm_model.predict(X_test)

# Validation F1 (f1_score defaults: binary averaging, positive class = 1)
f1_val = f1_score(y_true=y_val, y_pred=y_val_pred)
print(f"F1 Score on Validation Set (SVM + TF-IDF): {f1_val:.2f}")

# Step 8: Test F1, computed the same way
f1_test = f1_score(y_true=y_test, y_pred=y_test_pred)
print(f"F1 Score on Test Set (SVM + TF-IDF): {f1_test:.2f}")

Columns in train_set: Index(['Unnamed: 0', 'id', 'domain', 'type', 'url', 'content', 'scraped_at',
       'inserted_at', 'updated_at', 'title', 'authors', 'keywords',
       'meta_keywords', 'meta_description', 'tags', 'summary', 'source'],
      dtype='object')
Columns in val_set: Index(['Unnamed: 0', 'id', 'domain', 'type', 'url', 'content', 'scraped_at',
       'inserted_at', 'updated_at', 'title', 'authors', 'keywords',
       'meta_keywords', 'meta_description', 'tags', 'summary', 'source'],
      dtype='object')
Columns in test_set: Index(['Unnamed: 0', 'id', 'domain', 'type', 'url', 'content', 'scraped_at',
       'inserted_at', 'updated_at', 'title', 'authors', 'keywords',
       'meta_keywords', 'meta_description', 'tags', 'summary', 'source'],
      dtype='object')
F1 Score on Validation Set (SVM + TF-IDF): 0.89
F1 Score on Test Set (SVM + TF-IDF): 0.89


## Evaluated on fake news corpus + 800 scraped articles

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score

# Read the train and validation splits plus joint_contents.csv, which serves as
# the test set in this cell. low_memory=False makes pandas parse each file in
# one pass instead of in chunks.
train_df, val_df, test_df = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in ('train_set.csv', 'val_set.csv', 'joint_contents.csv')
)

# Print the column index of every frame as a quick schema check
for caption, frame in (
    ("Columns in train_set:", train_df),
    ("Columns in val_set:", val_df),
    ("Columns in test_set:", test_df),
):
    print(caption, frame.columns)

# Create binary labels from the 'type' column: 1 for 'reliable', 0 for every
# other value (missing values compare unequal, so they also become 0).
# A vectorized comparison replaces the per-row Python lambda.
# No label is derived for test_df here: joint_contents.csv is loaded with its
# own 'label' column.
train_df['label'] = train_df['type'].eq('reliable').astype('int64')
val_df['label'] = val_df['type'].eq('reliable').astype('int64')

# Replace missing 'content' values with an empty string so that every row
# handed to the vectorizer is a string
train_df['content'] = train_df['content'].fillna('')
val_df['content'] = val_df['content'].fillna('')
test_df['content'] = test_df['content'].fillna('')

# TF-IDF features over unigrams and bigrams, English stop words removed,
# vocabulary capped at 10,000 entries
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    max_features=10000,
)

# The vectorizer is fitted on the training text only; the validation and test
# text is then transformed with that fitted vectorizer
X_train = tfidf_vectorizer.fit_transform(train_df['content'])
X_val, X_test = (
    tfidf_vectorizer.transform(split['content']) for split in (val_df, test_df)
)

# Target vectors, one per split
y_train, y_val, y_test = (split['label'] for split in (train_df, val_df, test_df))

# Train an SVM classifier
# LinearSVC is efficient for sparse data like text. random_state fixes the data
# shuffling used by the dual solver so that repeated runs produce the same model
# (it has no effect when the primal solver is used).
svm_model = LinearSVC(C=1.0, max_iter=1000, random_state=42)
svm_model.fit(X_train, y_train)

# Predict on both held-out feature matrices with the fitted SVM
y_val_pred = svm_model.predict(X_val)
y_test_pred = svm_model.predict(X_test)

# Validation F1 (f1_score defaults: binary averaging, positive class = 1)
f1_val = f1_score(y_true=y_val, y_pred=y_val_pred)
print(f"F1 Score on Validation Set (SVM + TF-IDF): {f1_val:.2f}")

# Test F1, computed the same way
f1_test = f1_score(y_true=y_test, y_pred=y_test_pred)
print(f"F1 Score on Test Set (SVM + TF-IDF): {f1_test:.2f}")

Columns in train_set: Index(['Unnamed: 0', 'id', 'domain', 'type', 'url', 'content', 'scraped_at',
       'inserted_at', 'updated_at', 'title', 'authors', 'keywords',
       'meta_keywords', 'meta_description', 'tags', 'summary', 'source'],
      dtype='object')
Columns in val_set: Index(['Unnamed: 0', 'id', 'domain', 'type', 'url', 'content', 'scraped_at',
       'inserted_at', 'updated_at', 'title', 'authors', 'keywords',
       'meta_keywords', 'meta_description', 'tags', 'summary', 'source'],
      dtype='object')
Columns in test_set: Index(['content', 'label'], dtype='object')
F1 Score on Validation Set (SVM + TF-IDF): 0.89
F1 Score on Test Set (SVM + TF-IDF): 0.90


## Evaluated on LIAR dataset

In [8]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.metrics import f1_score

# Read the train and validation splits; low_memory=False makes pandas parse
# each file in one pass instead of in chunks
train_df, val_df = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in ('train_set.csv', 'val_set.csv')
)
# The test file is parsed as comma-separated despite its .tsv extension.
# header=None tells pandas not to use the first row as column names, so the
# columns get integer names (0, 1, 2, ...).
test_df = pd.read_csv('test_cleaned.tsv', header=None, sep=',')


# Build the binary label and the text column from test_cleaned.tsv
# (column 1 = label, column 2 = statement).
# Ratings listed here become class 1; every other rating becomes class 0.
truthful_ratings = ['true', 'mostly-true', 'half-true']
# Strip whitespace and lower-case the ratings before matching, so that values
# such as ' true' or 'TRUE' are not silently mapped to 0. Missing ratings turn
# into the string 'nan' and therefore still map to 0.
normalized_ratings = test_df[1].astype(str).str.strip().str.lower()
test_df['label'] = normalized_ratings.isin(truthful_ratings).astype('int64')
test_df['content'] = test_df[2]


# Print the column index of every frame as a quick schema check
for caption, frame in (
    ("Columns in train_set:", train_df),
    ("Columns in val_set:", val_df),
    ("Columns in test_set:", test_df),
):
    print(caption, frame.columns)

# Create binary labels from the 'type' column: 1 for 'reliable', 0 for every
# other value (missing values compare unequal, so they also become 0).
# A vectorized comparison replaces the per-row Python lambda.
# No label is derived for test_df here: it was already built from column 1 of
# test_cleaned.tsv earlier in this cell.
train_df['label'] = train_df['type'].eq('reliable').astype('int64')
val_df['label'] = val_df['type'].eq('reliable').astype('int64')

# Replace missing 'content' values with an empty string so that every row
# handed to the vectorizer is a string
train_df['content'] = train_df['content'].fillna('')
val_df['content'] = val_df['content'].fillna('')
test_df['content'] = test_df['content'].fillna('')

# TF-IDF features over unigrams and bigrams, English stop words removed,
# vocabulary capped at 10,000 entries
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    max_features=10000,
)

# The vectorizer is fitted on the training text only; the validation and test
# text is then transformed with that fitted vectorizer
X_train = tfidf_vectorizer.fit_transform(train_df['content'])
X_val, X_test = (
    tfidf_vectorizer.transform(split['content']) for split in (val_df, test_df)
)

# Target vectors, one per split
y_train, y_val, y_test = (split['label'] for split in (train_df, val_df, test_df))

# Train an SVM classifier
# LinearSVC is efficient for sparse data like text. random_state fixes the data
# shuffling used by the dual solver so that repeated runs produce the same model
# (it has no effect when the primal solver is used).
svm_model = LinearSVC(C=1.0, max_iter=1000, random_state=42)
svm_model.fit(X_train, y_train)

# Predict on both held-out feature matrices with the fitted SVM
y_val_pred = svm_model.predict(X_val)
y_test_pred = svm_model.predict(X_test)

# Validation F1 (f1_score defaults: binary averaging, positive class = 1)
f1_val = f1_score(y_true=y_val, y_pred=y_val_pred)
print(f"F1 Score on Validation Set (SVM + TF-IDF): {f1_val:.2f}")

# Test F1, computed the same way.
# NOTE(review): the output saved with this notebook shows a test F1 of 0.03 for
# this cell, against 0.89 on validation. Presumably the statements in
# test_cleaned.tsv differ strongly from the training articles and/or the label
# mapping does not line up — confirm before interpreting this score.
f1_test = f1_score(y_true=y_test, y_pred=y_test_pred)
print(f"F1 Score on Test Set (SVM + TF-IDF): {f1_test:.2f}")

Columns in train_set: Index(['Unnamed: 0', 'id', 'domain', 'type', 'url', 'content', 'scraped_at',
       'inserted_at', 'updated_at', 'title', 'authors', 'keywords',
       'meta_keywords', 'meta_description', 'tags', 'summary', 'source'],
      dtype='object')
Columns in val_set: Index(['Unnamed: 0', 'id', 'domain', 'type', 'url', 'content', 'scraped_at',
       'inserted_at', 'updated_at', 'title', 'authors', 'keywords',
       'meta_keywords', 'meta_description', 'tags', 'summary', 'source'],
      dtype='object')
Columns in test_set: Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 'label', 'content'], dtype='object')
F1 Score on Validation Set (SVM + TF-IDF): 0.89
F1 Score on Test Set (SVM + TF-IDF): 0.03
