In [30]:
import pandas as pd
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import spacy

# Load spaCy model
nlp = spacy.load("en_core_web_sm")

# Custom Transformer using spaCy for lemmatization
class SpacyLemmatizer(TransformerMixin):
    def transform(self, X, **transform_params):
        return [self.lemmatize(text) for text in X]

    def fit(self, X, y=None, **fit_params):
        return self

    def lemmatize(self, text):
        doc = nlp(text)
        return " ".join([token.lemma_ for token in doc if not token.is_stop])

# Load data
data = pd.read_csv('news_articles.csv')
data=data.dropna()

# Convert all text data to strings and handle NaN
data['text_without_stopwords'] = data['text_without_stopwords'].astype(str)
data['title_without_stopwords'] = data['title_without_stopwords'].astype(str)

# Combine text and title into a single column
X = data[['text_without_stopwords', 'title_without_stopwords']].agg(' '.join, axis=1)
for datax in X.values:
    doc=nlp(datax)
    datax=[token.lemma_ for token in doc]
# Build the pipeline



In [31]:
X

0       print pay back money plus interest entire fami...
1       attorney general loretta lynch plead fifth bar...
2       red state fox news sunday reported morning ant...
3       email kayla mueller prisoner tortured isis cha...
4       email healthcare reform make america great sin...
                              ...                        
2041    check hillarythemed haunted house anticlinton ...
2042    good samaritan wearing indian headdress disarm...
2043    skype sex scam fortune built shame moroccan bo...
2044    posted eddie skyhigh potency may scare away cr...
2045    billion even known keeping supposedly deleted ...
Length: 2045, dtype: object

In [32]:
y=data['label']
y

0       Real
1       Real
2       Real
3       Real
4       Real
        ... 
2041    Real
2042    Real
2043    Real
2044    Real
2045    Real
Name: label, Length: 2045, dtype: object

In [33]:
y=y.map({
    "Real":1,
    "Fake":0
})

In [34]:
tfidf=TfidfVectorizer()
X=tfidf.fit_transform(X)

In [35]:
X_train, X_test, y_train, y_test=train_test_split(X,y)

In [52]:
from keras.models import Sequential
from keras.layers import Dense, LSTM, Input
from keras.regularizers import l1,l2

model=Sequential([
    Input(shape=(X.shape[1],)),
    Dense(2, kernel_regularizer=l2()),
    Dense(1)
])
model.compile("adam",loss="binary_crossentropy",metrics=['accuracy'])
model.fit(X_train,y_train,validation_data=(X_test,y_test),epochs=10,batch_size=4)

Epoch 1/10
[1m384/384[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - accuracy: 0.6235 - loss: 1.2067 - val_accuracy: 0.6680 - val_loss: 0.6825
Epoch 2/10
[1m384/384[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.6009 - loss: 0.7094 - val_accuracy: 0.6738 - val_loss: 0.6479
Epoch 3/10
[1m384/384[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.6166 - loss: 0.6459 - val_accuracy: 0.6934 - val_loss: 0.6352
Epoch 4/10
[1m384/384[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1ms/step - accuracy: 0.6987 - loss: 0.5987 - val_accuracy: 0.7148 - val_loss: 0.6263
Epoch 5/10
[1m384/384[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.7477 - loss: 0.5645 - val_accuracy: 0.7559 - val_loss: 0.6300
Epoch 6/10
[1m384/384[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 1ms/step - accuracy: 0.8255 - loss: 0.5449 - val_accuracy: 0.7676 - val_loss: 0.6287
Epoch 7/10
[1m384/384[0m 

<keras.src.callbacks.history.History at 0x2a2d34750>