In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

# Step 1: Load the datasets
# The three splits are read in train / validation / test order with the
# same read options.
train_df, val_df, test_df = [
    pd.read_csv(csv_path, low_memory=False)
    for csv_path in ('train_set.csv', 'val_set.csv', 'test_set.csv')
]

# Step 2: Verify column names
# Print each split's column index so the schemas can be compared by eye.
for _caption, _frame in (
    ("Columns in train_set:", train_df),
    ("Columns in val_set:", val_df),
    ("Columns in test_set:", test_df),
):
    print(_caption, _frame.columns)

# Step 3: Create binary labels using the 'type' column
# Assuming 'reliable' is the only reliable label, and everything else is fake:
# 'reliable' -> 1, any other value (including a missing 'type') -> 0.
# A vectorized comparison is used instead of a per-row lambda: it avoids one
# Python call per row and always yields an int64 column, even for an empty
# frame.
train_df['label'] = train_df['type'].eq('reliable').astype('int64')
val_df['label'] = val_df['type'].eq('reliable').astype('int64')
test_df['label'] = test_df['type'].eq('reliable').astype('int64')

# Step 4: Handle missing values in 'content' (if any)
# Missing 'content' entries are replaced with an empty string, so no rows are
# dropped from any split.
# NOTE(review): presumably this is so the text vectorizer only receives
# strings - confirm no other non-string values occur in 'content'.
for _split in (train_df, val_df, test_df):
    _split['content'] = _split['content'].fillna('')

# Step 5: Build bag-of-words features and collect the targets
# Targets: the 'label' column of each split.
y_train, y_val, y_test = (
    split['label'] for split in (train_df, val_df, test_df)
)

# Cap the vocabulary at 10,000 terms (max_features) and learn it from the
# training text only.
vectorizer = CountVectorizer(max_features=10000)
X_train = vectorizer.fit_transform(train_df['content'])

# Validation and test text are encoded with the already-fitted vectorizer,
# so no vocabulary is learned from them.
X_val, X_test = (
    vectorizer.transform(split['content']) for split in (val_df, test_df)
)

# Step 6: Train a logistic regression model
# fit() returns the estimator itself, so construction and training are chained.
log_reg = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Step 7: Evaluate the model on the validation set
# f1_score is called with its defaults, so label 1 ('reliable') is the
# positive class.
y_val_pred = log_reg.predict(X_val)
f1_val = f1_score(y_true=y_val, y_pred=y_val_pred)
print(f"F1 Score on Validation Set: {f1_val:.2f}")

# Step 8: Evaluate the model on the test set
y_test_pred = log_reg.predict(X_test)
f1_test = f1_score(y_true=y_test, y_pred=y_test_pred)
print(f"F1 Score on Test Set: {f1_test:.2f}")

Columns in train_set: Index(['Unnamed: 0', 'id', 'domain', 'type', 'url', 'content', 'scraped_at',
       'inserted_at', 'updated_at', 'title', 'authors', 'keywords',
       'meta_keywords', 'meta_description', 'tags', 'summary', 'source'],
      dtype='object')
Columns in val_set: Index(['Unnamed: 0', 'id', 'domain', 'type', 'url', 'content', 'scraped_at',
       'inserted_at', 'updated_at', 'title', 'authors', 'keywords',
       'meta_keywords', 'meta_description', 'tags', 'summary', 'source'],
      dtype='object')
Columns in test_set: Index(['Unnamed: 0', 'id', 'domain', 'type', 'url', 'content', 'scraped_at',
       'inserted_at', 'updated_at', 'title', 'authors', 'keywords',
       'meta_keywords', 'meta_description', 'tags', 'summary', 'source'],
      dtype='object')
F1 Score on Validation Set: 0.86
F1 Score on Test Set: 0.86
