In [10]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sentence_transformers import SentenceTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OneHotEncoder

In [11]:
# Define column names
columns = [
    "id", "label", "statement", "subject", "speaker", "speaker_job",
    "speaker_state", "party", "barely_true_counts", "false_counts",
    "half_true_counts", "mostly_true_counts", "pants_on_fire_counts",
    "context"
]

# Load each file 
train_df = pd.read_csv("/Users/catherineakins/Desktop/Projects/MLFactChecker/test.tsv", sep='\t', header=None, names=columns,index_col=False)
test_df = pd.read_csv("/Users/catherineakins/Desktop/Projects/MLFactChecker/train.tsv", sep='\t', header=None, names=columns,index_col=False)
val_df = pd.read_csv("/Users/catherineakins/Desktop/Projects/MLFactChecker/valid.tsv", sep='\t', header=None, names=columns,index_col=False)

####sim source data######
source_pool = [
    "nytimes.com", "cnn.com", "theonion.com", "infowars.com", "reuters.com",
    "breitbart.com", "foxnews.com", "naturalnews.com", "bbc.com"
]

np.random.seed(42)
train_df["source"] = np.random.choice(source_pool, size=len(train_df))
val_df["source"] = np.random.choice(source_pool, size=len(val_df))
test_df["source"] = np.random.choice(source_pool, size=len(test_df))

# Preview
print(train_df.head())
print(train_df["label"].value_counts())


           id       label                                          statement  \
0  11972.json        true  Building a wall on the U.S.-Mexico border will...   
1  11685.json       false  Wisconsin is on pace to double the number of l...   
2  11096.json       false  Says John McCain has done nothing to help the ...   
3   5209.json   half-true  Suzanne Bonamici supports a plan that will cut...   
4   9524.json  pants-fire  When asked by a reporter whether hes at the ce...   

                                             subject  \
0                                        immigration   
1                                               jobs   
2                    military,veterans,voting-record   
3  medicare,message-machine-2012,campaign-adverti...   
4  campaign-finance,legal-issues,campaign-adverti...   

                            speaker           speaker_job speaker_state  \
0                        rick-perry              Governor         Texas   
1                 katrina-shankl

In [12]:
true_labels = ["true", "mostly-true", "half-true"]

for df in [train_df, val_df, test_df]:
    df["label"] = df["label"].astype(str).str.lower().str.strip()
    df["is_misinformation"] = df["label"].apply(lambda x: 0 if x in true_labels else 1)


In [13]:
for name, df in zip(["Train", "Validation", "Test"], [train_df, val_df, test_df]):
    print(f"{name} set label counts:")
    print(df["is_misinformation"].value_counts())
    print()






Train set label counts:
0    714
1    553
Name: is_misinformation, dtype: int64

Validation set label counts:
0    668
1    616
Name: is_misinformation, dtype: int64

Test set label counts:
0    5752
1    4488
Name: is_misinformation, dtype: int64



In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer


model = SentenceTransformer('all-MiniLM-L6-v2')  # small and fast, like a highly employable gremlin
statements = df["statement"].tolist()
X_text = model.encode(statements, show_progress_bar=True)

X_train_combined = model.encode(train_df["statement"].tolist(), show_progress_bar=True)
X_val_combined = model.encode(val_df["statement"].tolist(), show_progress_bar=True)
X_test_combined = model.encode(test_df["statement"].tolist(), show_progress_bar=True)



Batches:   0%|          | 0/320 [00:00<?, ?it/s]

Batches:   0%|          | 0/40 [00:00<?, ?it/s]

Batches:   0%|          | 0/41 [00:00<?, ?it/s]

Batches:   0%|          | 0/320 [00:00<?, ?it/s]

In [15]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

y_train = train_df["is_misinformation"]
y_val = val_df["is_misinformation"]
y_test = test_df["is_misinformation"]
#clf = RandomForestClassifier(class_weight="balanced", random_state=42)
clf = LogisticRegression(class_weight='balanced', solver='saga', penalty='l1')

clf.fit(X_train_combined, y_train)
y_pred = clf.predict(X_test_combined)



print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))
print("Accuracy:", accuracy_score(y_test, y_pred))



[[3433 2319]
 [1910 2578]]
              precision    recall  f1-score   support

           0       0.64      0.60      0.62      5752
           1       0.53      0.57      0.55      4488

    accuracy                           0.59     10240
   macro avg       0.58      0.59      0.58     10240
weighted avg       0.59      0.59      0.59     10240

Accuracy: 0.58701171875


In [23]:
# Encode source
source_encoder = LabelEncoder()
train_df["source_encoded"] = source_encoder.fit_transform(train_df["source"])
val_df["source_encoded"] = source_encoder.transform(val_df["source"])
test_df["source_encoded"] = source_encoder.transform(test_df["source"])

# One-hot encode source
ohe = OneHotEncoder(handle_unknown="ignore")
X_source_train = ohe.fit_transform(train_df[["source"]])
X_source_val = ohe.transform(val_df[["source"]])
X_source_test = ohe.transform(test_df[["source"]])

# Train classifier
clf_source = LogisticRegression(class_weight="balanced", max_iter=1000)
clf_source.fit(X_source_train, y_train)

In [17]:
# Get probability that post is misinformation (class 1)
y_proba = clf.predict_proba(X_test_combined)[:, 1]


In [18]:
test_df["P_x"] = y_proba
test_df["Risk"] = test_df["P_x"]


In [19]:
N = 20
top_posts = test_df.sort_values("Risk", ascending=False).head(N)


In [20]:
display_cols = ["statement", "label", "Risk"]
print(top_posts[display_cols])


                                              statement        label      Risk
1873  Obama served on a board with former Weather Un...         true  0.826784
5586  Says Barack Obama put in place a board that ca...  barely-true  0.817020
208   The Family Research Council, according to some...        false  0.815787
9007  Democrats say Chafee wants to tax equipment th...         true  0.815575
3495  Says Hillary Clinton shows up in Broward Count...   pants-fire  0.814628
1445  Says Los Angeles Clippers owner Donald Sterlin...   pants-fire  0.813107
1000  Says she helped lead the launch of curbside re...    half-true  0.804740
8387  Says Cathy Jordan was arrested and dragged out...  barely-true  0.804084
6276  The Democrats' health care bill "gives a new H...    half-true  0.802697
2895  U.S. Rep. Allen West wants to bring back earma...  barely-true  0.799914
8951  Already, the mayor of Green Bay is having ribb...   pants-fire  0.798305
9112  Says Hillary Clinton told her daughter and a g

In [22]:
from joblib import dump


dump(clf, "saved_model.pkl")
dump(clf_source, "source_only_model.pkl")
dump(source_encoder, "source_encoder.pkl")  # this is the LabelEncoder
dump(ohe, "source_ohe.pkl")  # this is the OneHotEncoder (name it right this time!)



['source_ohe.pkl']