In [1]:
#this cell trains a logistic regression model on the train set only, then evaluates it on the validation and test sets

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

#read the three splits used to train, validate and test the model
train_df, val_df, test_df = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in ('train_set.csv', 'val_set.csv', 'test_set.csv')
)

#show the column index of every split so the structure can be inspected
for heading, frame in (
    ("Columns in train_set:", train_df),
    ("Columns in val_set:", val_df),
    ("Columns in test_set:", test_df),
):
    print(heading, frame.columns)

#prepare each split in turn:
# - 'label' is 1 when 'type' is exactly 'reliable' and 0 for anything else (including missing values)
# - missing 'content' becomes an empty string so the vectorizer always receives text
for frame in (train_df, val_df, test_df):
    frame['label'] = frame['type'].eq('reliable').astype('int64')
    frame['content'] = frame['content'].fillna('')

#bag-of-words features limited to the 10,000 most frequent words
vectorizer = CountVectorizer(max_features=10000)

#the vocabulary is learned from the training text only; validation and test text are just transformed
X_train = vectorizer.fit_transform(train_df['content'])
X_val, X_test = (
    vectorizer.transform(frame['content']) for frame in (val_df, test_df)
)

#binary targets for each split
y_train, y_val, y_test = (
    frame['label'] for frame in (train_df, val_df, test_df)
)

#fit the logistic regression model on the train_set features (fit returns the estimator itself)
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)

#score the model on the validation set
y_val_pred = log_reg.predict(X_val)
f1_val = f1_score(y_val, y_val_pred)

print(f"F1 Score on Validation Set: {f1_val:.2f}")

#score the model on the test set
y_test_pred = log_reg.predict(X_test)
f1_test = f1_score(y_test, y_test_pred)

print(f"F1 Score on Test Set: {f1_test:.2f}")

Columns in train_set: Index(['Unnamed: 0', 'id', 'domain', 'type', 'url', 'content', 'scraped_at',
       'inserted_at', 'updated_at', 'title', 'authors', 'keywords',
       'meta_keywords', 'meta_description', 'tags', 'summary', 'source'],
      dtype='object')
Columns in val_set: Index(['Unnamed: 0', 'id', 'domain', 'type', 'url', 'content', 'scraped_at',
       'inserted_at', 'updated_at', 'title', 'authors', 'keywords',
       'meta_keywords', 'meta_description', 'tags', 'summary', 'source'],
      dtype='object')
Columns in test_set: Index(['Unnamed: 0', 'id', 'domain', 'type', 'url', 'content', 'scraped_at',
       'inserted_at', 'updated_at', 'title', 'authors', 'keywords',
       'meta_keywords', 'meta_description', 'tags', 'summary', 'source'],
      dtype='object')
F1 Score on Validation Set: 0.86
F1 Score on Test Set: 0.86


In [5]:
#this cell is the logistic regression model fitted to the joint contents csv; the validation and test sets are the same files as in the first model

import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# read the data: joint_contents.csv is the training data,
# the validation and test sets come from their own csv files
joint_df = pd.read_csv('joint_contents.csv', low_memory=False)
val_df, test_df = (
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in ('val_set.csv', 'test_set.csv')
)

# show the column index of every dataset to confirm its structure
for heading, frame in (
    ("Columns in train_set (joint_contents.csv):", joint_df),
    ("Columns in val_set:", val_df),
    ("Columns in test_set:", test_df),
):
    print(heading, frame.columns)

# the validation and test sets only get a 'label' column when they do not already have one;
# it is derived from 'type': 1 for 'reliable', 0 for everything else
for eval_frame in (val_df, test_df):
    if 'label' not in eval_frame.columns:
        eval_frame['label'] = eval_frame['type'].eq('reliable').astype('int64')

# replace missing 'content' with an empty string in every dataset
for frame in (joint_df, val_df, test_df):
    frame['content'] = frame['content'].fillna('')

# bag-of-words features limited to the 10,000 most frequent words
vectorizer = CountVectorizer(max_features=10000)

# the vocabulary is learned from the joint training text only; the other sets are just transformed
X_train = vectorizer.fit_transform(joint_df['content'])
X_val, X_test = (
    vectorizer.transform(frame['content']) for frame in (val_df, test_df)
)

# binary targets for each dataset
y_train, y_val, y_test = (
    frame['label'] for frame in (joint_df, val_df, test_df)
)

# fit the logistic regression model on the joint_set features (fit returns the estimator itself)
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# score the model on the validation set
y_val_pred = log_reg.predict(X_val)
f1_val = f1_score(y_val, y_val_pred)

print(f"F1 Score on Validation Set: {f1_val:.2f}")

# score the model on the test set
y_test_pred = log_reg.predict(X_test)
f1_test = f1_score(y_test, y_test_pred)

print(f"F1 Score on Test Set: {f1_test:.2f}")

Columns in train_set (joint_contents.csv): Index(['content', 'label'], dtype='object')
Columns in val_set: Index(['Unnamed: 0', 'id', 'domain', 'type', 'url', 'content', 'scraped_at',
       'inserted_at', 'updated_at', 'title', 'authors', 'keywords',
       'meta_keywords', 'meta_description', 'tags', 'summary', 'source'],
      dtype='object')
Columns in test_set: Index(['Unnamed: 0', 'id', 'domain', 'type', 'url', 'content', 'scraped_at',
       'inserted_at', 'updated_at', 'title', 'authors', 'keywords',
       'meta_keywords', 'meta_description', 'tags', 'summary', 'source'],
      dtype='object')
F1 Score on Validation Set: 0.86
F1 Score on Test Set: 0.86


In [7]:
#logistic regression analysis on the LIAR dataset. This is trained on the normal train set used for the first model.
#the test and validation sets are cleaned versions of the LIAR dataset.

import pandas as pd  # For loading and manipulating CSV/TSV datasets
from sklearn.feature_extraction.text import TfidfVectorizer  # Converts text to TF-IDF features
from sklearn.linear_model import LogisticRegression  # Logistic regression classifier
from sklearn.metrics import f1_score  # Metric to evaluate model performance

# Read the training articles and the two LIAR evaluation files.
# The LIAR files are read with header=None, so pandas numbers their columns 0, 1, 2, ...
# Note that the test file is parsed as comma-separated and the validation file as tab-separated.
train_df = pd.read_csv('train_set.csv', low_memory=False)
val_df = pd.read_csv('valid.tsv', sep='\t', header=None)
test_df = pd.read_csv('test_cleaned.tsv', sep=',', header=None)

# Ratings in column 1 that are mapped to the positive class (1); every other value becomes 0
truthful_ratings = ['true', 'mostly-true', 'half-true']

# Give both LIAR frames the same 'label' / 'content' columns the training frame uses.
# 'content' is a copy of column 2 -- presumably the statement text; verify against the LIAR file layout.
for liar_frame in (val_df, test_df):
    liar_frame['label'] = liar_frame[1].isin(truthful_ratings).astype('int64')
    liar_frame['content'] = liar_frame[2]

# Show the column index of every dataset to confirm the structure of the input files
# (printed before the training frame receives its 'label' column)
for heading, frame in (
    ("Columns in train_set:", train_df),
    ("Columns in val_set:", val_df),
    ("Columns in test_set:", test_df),
):
    print(heading, frame.columns)

# Binary training labels from the 'type' column: 1 = 'reliable', 0 = anything else
train_df['label'] = train_df['type'].eq('reliable').astype('int64')

# Replace missing 'content' with an empty string in every dataset
for frame in (train_df, val_df, test_df):
    frame['content'] = frame['content'].fillna('')

# TF-IDF features over unigrams and bigrams with English stop words removed;
# the vocabulary is capped at 10,000 terms
tfidf_vectorizer = TfidfVectorizer(
    max_features=10000,
    stop_words='english',
    ngram_range=(1, 2),
)

# The vocabulary and IDF weights are learned from the training text only;
# the LIAR text is just transformed with them
X_train = tfidf_vectorizer.fit_transform(train_df['content'])
X_val, X_test = (
    tfidf_vectorizer.transform(frame['content']) for frame in (val_df, test_df)
)

# Targets for training, validation and testing
y_train, y_val, y_test = (
    frame['label'] for frame in (train_df, val_df, test_df)
)

# Fit the logistic regression classifier on the training features (fit returns the estimator itself)
logreg_model = LogisticRegression(C=1.0, max_iter=1000).fit(X_train, y_train)

# F1 score on the LIAR validation set
y_val_pred = logreg_model.predict(X_val)
f1_val = f1_score(y_val, y_val_pred)
print(f"F1 Score on Validation Set (Logistic Regression + TF-IDF): {f1_val:.2f}")

# F1 score on the LIAR test set
y_test_pred = logreg_model.predict(X_test)
f1_test = f1_score(y_test, y_test_pred)
print(f"F1 Score on Test Set (Logistic Regression + TF-IDF): {f1_test:.2f}")

Columns in train_set: Index(['Unnamed: 0', 'id', 'domain', 'type', 'url', 'content', 'scraped_at',
       'inserted_at', 'updated_at', 'title', 'authors', 'keywords',
       'meta_keywords', 'meta_description', 'tags', 'summary', 'source'],
      dtype='object')
Columns in val_set: Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 'label', 'content'], dtype='object')
Columns in test_set: Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 'label', 'content'], dtype='object')
F1 Score on Validation Set (Logistic Regression + TF-IDF): 0.04
F1 Score on Test Set (Logistic Regression + TF-IDF): 0.01
