In [2]:
import pandas as pd
import numpy as np

In [3]:
df = pd.read_csv("data/processed/train.csv", converters={'tokens': eval})

In [4]:
df

Unnamed: 0,title,text,subject,date,label,content,tokens,clean_text
0,BOILER ROOM: As the Frogs Slowly Boil – EP #40,Tune in to the Alternate Current Radio Network...,US_News,"January 20, 2016",0,boiler room: as the frogs slowly boil – ep #40...,"[boiler, room, frog, slowly, boil, ep, tune, a...",boiler room frog slowly boil ep tune alternate...
1,New Venezuela oil boss to give military more P...,CARACAS (Reuters) - A general appointed at the...,worldnews,"November 27, 2017",1,new venezuela oil boss to give military more p...,"[new, venezuela, oil, bos, give, military, pdv...",new venezuela oil bos give military pdvsa post...
2,Turkey says talk of ending its EU accession un...,ISTANBUL (Reuters) - Turkey s European Union A...,worldnews,"September 4, 2017",1,turkey says talk of ending its eu accession un...,"[turkey, say, talk, ending, eu, accession, und...",turkey say talk ending eu accession undermines...
3,SENATOR GILLIBRAND Pulled Strings So Muslim At...,Democrat Senator Kristen Gillibrand (NY) likes...,left-news,"Dec 12, 2017",0,senator gillibrand pulled strings so muslim at...,"[senator, gillibrand, pulled, string, muslim, ...",senator gillibrand pulled string muslim athlet...
4,Republican Trump says 'system is rigged' after...,WASHINGTON (Reuters) - U.S. Republican preside...,politicsNews,"July 5, 2016",1,republican trump says 'system is rigged' after...,"[republican, trump, say, rigged, clinton, emai...",republican trump say rigged clinton email anno...
...,...,...,...,...,...,...,...,...
35913,Trump Just FAILED Hundreds Of Manufacturing W...,"Last year, Trump claimed he was succeeding at ...",News,"May 7, 2017",0,trump just failed hundreds of manufacturing w...,"[trump, failed, hundred, manufacturing, worker...",trump failed hundred manufacturing worker blam...
35914,Judge Garland not interested in FBI job: sources,WASHINGTON (Reuters) - U.S. appeals court judg...,politicsNews,"May 16, 2017",1,judge garland not interested in fbi job: sourc...,"[judge, garland, interested, fbi, job, source,...",judge garland interested fbi job source washin...
35915,Expert On Voting Fraud DESTROYS Trump’s Lies ...,Terrified and unable to accept that he s destr...,News,"October 17, 2016",0,expert on voting fraud destroys trump’s lies ...,"[expert, voting, fraud, destroys, trump, lie, ...",expert voting fraud destroys trump lie blister...
35916,[Video] POLICE HAVE VERY GOOD REASON FOR BLOCK...,This mayor s involvement in potential illegal ...,politics,"Apr 24, 2015",0,[video] police have very good reason for block...,"[video, police, good, reason, blocking, newly,...",video police good reason blocking newly electe...


In [37]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline

# Use actual data from df
# X: texts; y: assume 'label' column holds labels (change if different)
X = df['clean_text'].values
y = df['label'].values  # <-- Make sure to replace with correct label column if it's different

pipeline = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words='english')),
    ('clf', LogisticRegression(max_iter=1000))
])
pipeline.named_steps['tfidf'].set_params(max_features=10000)

param_grid = {
    'tfidf__ngram_range': [(1,1), (1,2), (1,3)]
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid = GridSearchCV(pipeline, param_grid, cv=cv, scoring='f1', n_jobs=-1)

grid.fit(X, y)

print("Best n-gram range:", grid.best_params_['tfidf__ngram_range'])
print("Best F1 score:", grid.best_score_)


Best n-gram range: (1, 3)
Best F1 score: 0.9762636427806706


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pickle

# Apply TF-IDF feature engineering with ngram_range (1,3) and max_features=10000
tfidf_vectorizer = TfidfVectorizer(stop_words='english', ngram_range=(1, 3), max_features=10000)
tfidf_features = tfidf_vectorizer.fit_transform(df['clean_text'].values)

# Save the TF-IDF matrix and the vectorizer as a tuple to pickle file
with open('tfidf_features_ngram_1_1.pkl', 'wb') as f:
    pickle.dump((tfidf_features, tfidf_vectorizer), f)


In [7]:
import pickle

with open('/home/root495/Inexture/Fake New Detection/tfidf_features_ngram_1_1.pkl', 'rb') as f:
    tfidf_features, tfidf_vectorizer = pickle.load(f)

In [8]:
from imblearn.over_sampling import SMOTE

# Use the same X and y as above
smote = SMOTE(random_state=42)
X_res, y_res = smote.fit_resample(tfidf_features, df['label'].values)

print("Class distribution after SMOTE:", dict(zip(*np.unique(y_res, return_counts=True))))


Class distribution after SMOTE: {0: 18771, 1: 18771}


In [9]:
import pandas as pd

# Load test data
test_df = pd.read_csv("data/processed/test.csv")

# Transform test data into TF-IDF representation using the fitted vectorizer
X_test = tfidf_vectorizer.transform(test_df['clean_text'].values)
y_test = test_df["label"].values


In [16]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, StratifiedKFold

# Use the TF-IDF features already computed
X_tfidf = tfidf_features
y = df['label'].values  # Make sure this matches your label column name

# Set up the Logistic Regression and parameter grid
logreg = LogisticRegression(max_iter=1000)

param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l2'],
    'solver': ['lbfgs', 'saga']
}

cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
grid_search = GridSearchCV(logreg, param_grid, cv=cv, scoring='f1', n_jobs=-1)
grid_search.fit(X_res, y_res)

print("Best Logistic Regression parameters:", grid_search.best_params_)
print("Best F1 score from Logistic Regression:", grid_search.best_score_)


Best Logistic Regression parameters: {'C': 100, 'penalty': 'l2', 'solver': 'saga'}
Best F1 score from Logistic Regression: 0.9947564619956404


In [11]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, roc_auc_score

# Train logistic regression on balanced data with the best parameters found
logreg = LogisticRegression(
    max_iter=1000, 
    random_state=42,
    C=100, 
    penalty='l2', 
    solver='saga'
)
logreg.fit(X_res, y_res)

# Predict on test data
y_test_pred = logreg.predict(X_test)
y_test_proba = logreg.predict_proba(X_test)[:,1] 

print("Logistic Regression metrics on test data:")
print("Precision:", precision_score(y_test, y_test_pred))
print("Recall:", recall_score(y_test, y_test_pred))
print("F1 Score:", f1_score(y_test, y_test_pred))
print("Accuracy:", accuracy_score(y_test, y_test_pred))
print("ROC-AUC:", roc_auc_score(y_test, y_test_proba))


Logistic Regression metrics on test data:
Precision: 0.9939067260370283
Recall: 0.993208430913349
F1 Score: 0.9935574557807192
Accuracy: 0.9938752783964365
ROC-AUC: 0.9995211742418593


In [17]:
from sklearn.naive_bayes import MultinomialNB

# Set up the Naive Bayes classifier and parameter grid
nb = MultinomialNB()
nb_param_grid = {
    'alpha': [0.01, 0.1, 0.5, 1, 2, 5, 10],
    'fit_prior': [True, False]
}

nb_grid_search = GridSearchCV(nb, nb_param_grid, cv=cv, scoring='f1', n_jobs=-1)
nb_grid_search.fit(X_res, y_res)

print("Best Naive Bayes parameters:", nb_grid_search.best_params_)
print("Best F1 score from Naive Bayes:", nb_grid_search.best_score_)


Best Naive Bayes parameters: {'alpha': 0.01, 'fit_prior': True}
Best F1 score from Naive Bayes: 0.954099340436948


In [8]:
from sklearn.naive_bayes import MultinomialNB

# Train Naive Bayes with best parameters
nb_best = MultinomialNB(alpha=0.01, fit_prior=True)
nb_best.fit(X_res, y_res)

# Predict on test data
y_test_pred_nb = nb_best.predict(X_test)
y_test_proba_nb = nb_best.predict_proba(X_test)[:, 1]

print("Naive Bayes metrics on test data:")
print("Precision:", precision_score(y_test, y_test_pred_nb))
print("Recall:", recall_score(y_test, y_test_pred_nb))
print("F1 Score:", f1_score(y_test, y_test_pred_nb))
print("Accuracy:", accuracy_score(y_test, y_test_pred_nb))
print("ROC-AUC:", roc_auc_score(y_test, y_test_proba_nb))


Naive Bayes metrics on test data:
Precision: 0.9481756913781083
Recall: 0.955503512880562
F1 Score: 0.9518254986585792
Accuracy: 0.9540089086859688
ROC-AUC: 0.9892438729694655


In [18]:
from sklearn.tree import DecisionTreeClassifier

# Set up the Decision Tree classifier and parameter grid
dt = DecisionTreeClassifier(random_state=42)
dt_param_grid = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_depth': [None, 5, 10, 20, 50],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 5, 10]
}

dt_grid_search = GridSearchCV(dt, dt_param_grid, cv=cv, scoring='f1', n_jobs=-1)
dt_grid_search.fit(X_res, y_res)

print("Best Decision Tree parameters:", dt_grid_search.best_params_)
print("Best F1 score from Decision Tree:", dt_grid_search.best_score_)


Best Decision Tree parameters: {'criterion': 'gini', 'max_depth': 10, 'min_samples_leaf': 2, 'min_samples_split': 5}
Best F1 score from Decision Tree: 0.9954486602167261


In [9]:
from sklearn.tree import DecisionTreeClassifier

# Train Decision Tree with best parameters
dt_best = DecisionTreeClassifier(
    criterion='gini',
    max_depth=10,
    min_samples_leaf=2,
    min_samples_split=5,
    random_state=42
)
dt_best.fit(X_res, y_res)

# Predict on test data
y_test_pred_dt = dt_best.predict(X_test)
y_test_proba_dt = dt_best.predict_proba(X_test)[:, 1]

print("Decision Tree metrics on test data:")
print("Precision:", precision_score(y_test, y_test_pred_dt))
print("Recall:", recall_score(y_test, y_test_pred_dt))
print("F1 Score:", f1_score(y_test, y_test_pred_dt))
print("Accuracy:", accuracy_score(y_test, y_test_pred_dt))
print("ROC-AUC:", roc_auc_score(y_test, y_test_proba_dt))


Decision Tree metrics on test data:
Precision: 0.9939252336448599
Recall: 0.9962529274004683
F1 Score: 0.9950877192982456
Accuracy: 0.9953229398663697
ROC-AUC: 0.9947571562821642


In [19]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Set up the Random Forest classifier and parameter distributions
rf = RandomForestClassifier(random_state=42)
rf_param_dist = {
    'n_estimators': [50, 100, 200, 300, 500],
    'max_depth': [None, 5, 10, 20, 50],
    'min_samples_split': [2, 5, 10, 20],
    'min_samples_leaf': [1, 2, 4, 8],
    'max_features': ['auto', 'sqrt', 'log2'],
    'bootstrap': [True, False]
}

rf_random_search = RandomizedSearchCV(
    rf,
    rf_param_dist,
    n_iter=2,
    cv=cv,
    scoring='f1',
    n_jobs=-1,
    random_state=42,
    verbose=1
)
rf_random_search.fit(X_res, y_res)

print("Best Random Forest parameters from RandomizedSearchCV:", rf_random_search.best_params_)
print("Best F1 score from Random Forest (RandomizedSearchCV):", rf_random_search.best_score_)


Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best Random Forest parameters from RandomizedSearchCV: {'n_estimators': 500, 'min_samples_split': 10, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': None, 'bootstrap': False}
Best F1 score from Random Forest (RandomizedSearchCV): 0.9975268103645627


In [None]:
from sklearn.ensemble import RandomForestClassifier

# Train Random Forest with best parameters from RandomizedSearchCV
rf_best = RandomForestClassifier(
    n_estimators=500,
    min_samples_split=10,
    min_samples_leaf=1,
    max_features='sqrt',
    max_depth=None,
    bootstrap=False,
    random_state=42
)
rf_best.fit(X_res, y_res)

# Predict on test data
y_test_pred_rf = rf_best.predict(X_test)
y_test_proba_rf = rf_best.predict_proba(X_test)[:, 1]

print("Random Forest metrics on test data:")
print("Precision:", precision_score(y_test, y_test_pred_rf))
print("Recall:", recall_score(y_test, y_test_pred_rf))
print("F1 Score:", f1_score(y_test, y_test_pred_rf))
print("Accuracy:", accuracy_score(y_test, y_test_pred_rf))
print("ROC-AUC:", roc_auc_score(y_test, y_test_proba_rf)) 


Random Forest metrics on test data:
Precision: 0.9967220791383751
Recall: 0.9969555035128805
F1 Score: 0.9968387776606955
Accuracy: 0.9969933184855234
ROC-AUC: 0.9998080221960353


In [20]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import RandomizedSearchCV

# Set up the AdaBoost classifier and parameter distributions
ada = AdaBoostClassifier(random_state=42)
ada_param_dist = {
    'n_estimators': [50, 100, 200, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.5, 1.0, 2.0],
    'algorithm': ['SAMME', 'SAMME.R']
}

ada_random_search = RandomizedSearchCV(
    ada,
    ada_param_dist,
    n_iter=2,
    cv=cv,
    scoring='f1',
    n_jobs=-1,
    random_state=42,
    verbose=1
)
ada_random_search.fit(X_res, y_res)

print("Best AdaBoost parameters from RandomizedSearchCV:", ada_random_search.best_params_)
print("Best F1 score from AdaBoost (RandomizedSearchCV):", ada_random_search.best_score_)


Fitting 5 folds for each of 2 candidates, totalling 10 fits




Best AdaBoost parameters from RandomizedSearchCV: {'n_estimators': 50, 'learning_rate': 0.05, 'algorithm': 'SAMME'}
Best F1 score from AdaBoost (RandomizedSearchCV): 0.9931889808956736


In [13]:
from sklearn.ensemble import AdaBoostClassifier

# Train AdaBoost with best parameters from RandomizedSearchCV
ada_best = AdaBoostClassifier(
    n_estimators=50,
    learning_rate=0.05,
    algorithm='SAMME',
    random_state=42
)
ada_best.fit(X_res, y_res)

# Predict on test data
y_test_pred_ada = ada_best.predict(X_test)
y_test_proba_ada = ada_best.predict_proba(X_test)[:, 1]

print("AdaBoost metrics on test data:")
print("Precision:", precision_score(y_test, y_test_pred_ada))
print("Recall:", recall_score(y_test, y_test_pred_ada))
print("F1 Score:", f1_score(y_test, y_test_pred_ada))
print("Accuracy:", accuracy_score(y_test, y_test_pred_ada))
print("ROC-AUC:", roc_auc_score(y_test, y_test_proba_ada))




AdaBoost metrics on test data:
Precision: 0.9886258124419685
Recall: 0.997423887587822
F1 Score: 0.9930053625553742
Accuracy: 0.9933184855233853
ROC-AUC: 0.997134677824351


In [21]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import RandomizedSearchCV

# Set up the Gradient Boosting classifier and parameter distributions
gb = GradientBoostingClassifier(random_state=42)
gb_param_dist = {
    'n_estimators': [50, 100, 200, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.5, 1.0],
    'max_depth': [3, 4, 5, 6, 7, 8],
    'subsample': [0.6, 0.8, 1.0],
    'min_samples_split': [2, 5, 10, 15],
    'min_samples_leaf': [1, 2, 4, 6]
}

gb_random_search = RandomizedSearchCV(
    gb,
    gb_param_dist,
    n_iter=2,
    cv=cv,
    scoring='f1',
    n_jobs=-1,
    random_state=42,
    verbose=1
)
gb_random_search.fit(X_res, y_res)

print("Best Gradient Boosting parameters from RandomizedSearchCV:", gb_random_search.best_params_)
print("Best F1 score from Gradient Boosting (RandomizedSearchCV):", gb_random_search.best_score_)


Fitting 5 folds for each of 2 candidates, totalling 10 fits
Best Gradient Boosting parameters from RandomizedSearchCV: {'subsample': 1.0, 'n_estimators': 100, 'min_samples_split': 5, 'min_samples_leaf': 4, 'max_depth': 6, 'learning_rate': 0.01}
Best F1 score from Gradient Boosting (RandomizedSearchCV): 0.9954565312389375


In [14]:
from sklearn.ensemble import GradientBoostingClassifier

# Set up the Gradient Boosting Classifier with best hyperparameters from RandomizedSearchCV
gb_clf = GradientBoostingClassifier(
    subsample=1.0,
    n_estimators=100,
    min_samples_split=5,
    min_samples_leaf=4,
    max_depth=6,
    learning_rate=0.01,
    random_state=42
)

# Train on resampled data
gb_clf.fit(X_res, y_res)

# Predict on test data
y_test_pred_gb = gb_clf.predict(X_test)
y_test_proba_gb = gb_clf.predict_proba(X_test)[:, 1]

print("Gradient Boosting metrics on test data:")
print("Precision:", precision_score(y_test, y_test_pred_gb))
print("Recall:", recall_score(y_test, y_test_pred_gb))
print("F1 Score:", f1_score(y_test, y_test_pred_gb))
print("Accuracy:", accuracy_score(y_test, y_test_pred_gb))
print("ROC-AUC:", roc_auc_score(y_test, y_test_proba_gb))


Gradient Boosting metrics on test data:
Precision: 0.9934640522875817
Recall: 0.9967213114754099
F1 Score: 0.9950900163666121
Accuracy: 0.9953229398663697
ROC-AUC: 0.9972985128059788


In [22]:
from xgboost import XGBClassifier

# Set up the XGBoost classifier and parameter distributions
xgb = XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42)
xgb_param_dist = {
    'n_estimators': [50, 100, 200, 300, 500],
    'learning_rate': [0.01, 0.05, 0.1, 0.2, 0.3, 0.5],
    'max_depth': [3, 4, 5, 6, 7, 8, 10],
    'subsample': [0.6, 0.7, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.7, 0.8, 1.0],
    'gamma': [0, 0.1, 0.2, 0.3, 1, 2],
    'reg_alpha': [0, 0.01, 0.1, 1],
    'reg_lambda': [0.1, 1, 5, 10]
}

xgb_random_search = RandomizedSearchCV(
    xgb,
    xgb_param_dist,
    n_iter=2,
    cv=cv,
    scoring='f1',
    n_jobs=-1,
    random_state=42,
    verbose=1
)
xgb_random_search.fit(X_res, y_res)

print("Best XGBoost parameters from RandomizedSearchCV:", xgb_random_search.best_params_)
print("Best F1 score from XGBoost (RandomizedSearchCV):", xgb_random_search.best_score_)


Fitting 5 folds for each of 2 candidates, totalling 10 fits


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)
Parameters: { "use_label_encoder" } are not used.


Best XGBoost parameters from RandomizedSearchCV: {'subsample': 1.0, 'reg_lambda': 0.1, 'reg_alpha': 1, 'n_estimators': 500, 'max_depth': 6, 'learning_rate': 0.5, 'gamma': 1, 'colsample_bytree': 0.7}
Best F1 score from XGBoost (RandomizedSearchCV): 0.997285697806638


In [15]:
from xgboost import XGBClassifier

# Set up the XGBoost Classifier with best-found hyperparameters from RandomizedSearchCV
xgb_clf = XGBClassifier(
    n_estimators=500,
    learning_rate=0.5,
    max_depth=6,
    subsample=1.0,
    colsample_bytree=0.7,
    gamma=1,
    reg_alpha=1,
    reg_lambda=0.1,
    random_state=42,
    use_label_encoder=False,
    eval_metric='logloss'
)

# Train the model on the resampled training data
xgb_clf.fit(X_res, y_res)

# Predict on the test data
y_test_pred_xgb = xgb_clf.predict(X_test)
y_test_proba_xgb = xgb_clf.predict_proba(X_test)[:, 1]

print("XGBoost metrics on test data:")
print("Precision:", precision_score(y_test, y_test_pred_xgb))
print("Recall:", recall_score(y_test, y_test_pred_xgb))
print("F1 Score:", f1_score(y_test, y_test_pred_xgb))
print("Accuracy:", accuracy_score(y_test, y_test_pred_xgb))
print("ROC-AUC:", roc_auc_score(y_test, y_test_proba_xgb))


Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


XGBoost metrics on test data:
Precision: 0.9962555581558624
Recall: 0.9969555035128805
F1 Score: 0.9966054079363221
Accuracy: 0.9967706013363029
ROC-AUC: 0.9995833271180457
