In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix, roc_auc_score
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from imblearn.over_sampling import SMOTE
import re
import nltk

# Fetch the NLTK resources this notebook relies on: the stop-word lists read via
# `stopwords.words('english')` and the WordNet data used by `WordNetLemmatizer`.
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:

# Load the training split. The file is JSON Lines (one record per line);
# later cells read its `text` and `label` columns.
df = pd.read_json(path_or_buf='ds/difraud/fake_news/train.jsonl', lines=True)

# Preview the first five rows.
df.head(5)

In [None]:
# Shared resources for the text-cleaning helper: a WordNet lemmatizer and the
# English stop-word list held as a set for fast membership tests.
lemmatizer = WordNetLemmatizer()
stop_words = {*stopwords.words('english')}

def clean_and_lemmatize_text(text):
    """Normalise a raw text string into a space-joined sequence of lemmas.

    The text is lower-cased, square-bracketed spans are removed, every
    non-word character becomes a space and whitespace runs are collapsed.
    The result is split on whitespace, stop words are dropped and each
    remaining token is lemmatized.

    Relies on the module-level `stop_words` set and `lemmatizer` instance.

    Args:
        text: The raw string to clean.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    normalized = text.lower()

    # Applied strictly in this order: strip "[...]" spans first, then turn
    # punctuation into spaces, then squeeze the resulting whitespace.
    for pattern, replacement in (
        (r'\[.*?\]', ''),
        (r'\W', ' '),
        (r'\s+', ' '),
    ):
        normalized = re.sub(pattern, replacement, normalized)

    lemmas = []
    for token in normalized.split():
        if token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)

# Clean every article once; the model features are built from this column.
df['clean_text'] = df['text'].map(clean_and_lemmatize_text)

In [None]:
# `clean_and_lemmatize_text` is defined once, in the preprocessing cell above,
# which also fills `df['clean_text']`. Redefining the helper and re-running the
# full lemmatisation pass here would only shadow that definition and repeat the
# same work, so this cell just sanity-checks and previews the result.
assert df['clean_text'].notna().all(), "clean_text contains missing values"
df[['text', 'clean_text']].head()

In [None]:
# TF-IDF features over unigrams and bigrams, with the vocabulary capped at
# 5000 terms.
tfidf = TfidfVectorizer(max_features=5000, ngram_range=(1, 2))

# Keep the matrix sparse: `fit_transform` returns a SciPy sparse matrix and the
# estimators used below (logistic regression, random forest, SVC) accept it
# directly. Calling `.toarray()` would allocate a dense n_docs x 5000 float
# array for data that is almost entirely zeros.
# NOTE(review): the vectorizer is fitted on every row before the train/test
# split, so its vocabulary and IDF weights also see the held-out documents.
X = tfidf.fit_transform(df['clean_text'])

# Target labels, aligned row-for-row with X.
y = df['label']

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split
# reproducible across runs.
# NOTE(review): the split is not stratified - consider `stratify=y` if the
# classes turn out to be imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
# One sub-grid per model family. The single pipeline step named 'classifier'
# is swapped for each candidate estimator, and that estimator's own
# hyper-parameters are tuned alongside it.
param_grid = [
    {'classifier': [LogisticRegression()],
     'classifier__C': [0.1, 1, 10]},
    # Seeded so the forest - and therefore the search outcome - is reproducible.
    {'classifier': [RandomForestClassifier(random_state=42)],
     'classifier__n_estimators': [100, 200]},
    {'classifier': [SVC()],
     'classifier__C': [0.1, 1, 10],
     'classifier__kernel': ['linear', 'rbf']}
]

# 5-fold cross-validated search over all sub-grids, using every available core.
# The LogisticRegression in the pipeline is only a placeholder; each grid entry
# replaces it.
grid_search = GridSearchCV(
    estimator=Pipeline([('classifier', LogisticRegression())]),
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)
grid_search.fit(X_train, y_train)

print("Best Parameters:", grid_search.best_params_)


In [None]:
# Evaluate the grid-search winner on the held-out test split.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)

# ROC-AUC must be computed from a continuous score, not from hard class labels:
# with 0/1 predictions the ROC curve collapses to a single operating point.
# Use class probabilities when the winning model exposes them (logistic
# regression, random forest) and fall back to the decision function otherwise
# (SVC fitted without probability=True).
if hasattr(best_model, 'predict_proba'):
    y_score = best_model.predict_proba(X_test)[:, 1]
else:
    y_score = best_model.decision_function(X_test)

accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_score)
print(f'Accuracy: {accuracy * 100:.2f}%')
print(f'ROC-AUC Score: {roc_auc:.2f}')
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

# 5-fold cross-validation of the winning configuration; the estimator is
# cloned and refitted on each fold.
# NOTE(review): this runs on the full X, y, which includes the rows held out
# in X_test.
cv_scores = cross_val_score(best_model, X, y, cv=5, n_jobs=-1)
print("Cross-validation scores:", cv_scores)
print("Mean cross-validation score:", cv_scores.mean())


In [None]:
best_params = grid_search.best_params_


def _best_params_for(estimator_type, defaults):
    """Return the best grid-searched hyper-parameters for one model family.

    `grid_search.best_params_` only describes the overall winner, so reusing
    its `classifier__C` for every model would hand, for example, the SVC's C
    to the logistic regression. This helper instead scans
    `grid_search.cv_results_` for the highest mean CV score among the
    candidates whose 'classifier' is an instance of `estimator_type`.

    Args:
        estimator_type: Estimator class to look up (e.g. `SVC`).
        defaults: Keyword arguments to fall back on for anything the grid did
            not tune, or when the family is absent from the grid.

    Returns:
        A dict of constructor keyword arguments (the 'classifier__' prefix is
        stripped), with `defaults` overridden by the tuned values.
    """
    results = grid_search.cv_results_
    best_score, chosen = -np.inf, {}
    for score, params in zip(results['mean_test_score'], results['params']):
        if not isinstance(params.get('classifier'), estimator_type):
            continue
        # Skip failed fits (NaN score); on ties keep the first candidate seen.
        if np.isnan(score) or score <= best_score:
            continue
        best_score, chosen = score, params

    tuned = {
        key.split('__', 1)[1]: value
        for key, value in chosen.items()
        if key.startswith('classifier__')
    }
    return {**defaults, **tuned}


# Build each ensemble member from the best configuration found for its own
# family. The forest and the SVC's probability calibration are both
# randomised, so they are seeded for reproducibility.
log_clf = LogisticRegression(
    **_best_params_for(LogisticRegression, {'C': 1}),
    n_jobs=-1,
)
rf_clf = RandomForestClassifier(
    **_best_params_for(RandomForestClassifier, {'n_estimators': 100}),
    n_jobs=-1,
    random_state=42,
)
svm_clf = SVC(
    **_best_params_for(SVC, {'C': 1, 'kernel': 'rbf'}),
    probability=True,  # soft voting needs predict_proba
    random_state=42,
)

# Soft voting averages the members' predicted class probabilities.
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rf_clf), ('svc', svm_clf)],
    voting='soft',
    n_jobs=-1
)

voting_clf.fit(X_train, y_train)

y_pred = voting_clf.predict(X_test)
# ROC-AUC is computed from the probability of the second class in
# `voting_clf.classes_` rather than from hard labels, which would reduce the
# curve to a single point.
y_score = voting_clf.predict_proba(X_test)[:, 1]
accuracy = accuracy_score(y_test, y_pred)
roc_auc = roc_auc_score(y_test, y_score)
print(f'Accuracy: {accuracy * 100:.2f}%')
print(f'ROC-AUC Score: {roc_auc:.2f}')
print(classification_report(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))


In [None]:
def predict_article(article):
    """Classify a single raw article string as 'Fake' or 'Real'.

    The article is cleaned with `clean_and_lemmatize_text`, vectorised with
    the already-fitted `tfidf` vectorizer and passed to `voting_clf`.

    Args:
        article: Raw article text.

    Returns:
        'Fake' when the predicted label equals 1, otherwise 'Real'.
    """
    # The vectorizer expects an iterable of documents, hence the one-item list.
    features = tfidf.transform([clean_and_lemmatize_text(article)]).toarray()
    predicted_label = voting_clf.predict(features)[0]
    if predicted_label == 1:
        return 'Fake'
    return 'Real'