In [1]:
import numpy as np
import pandas as pd
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC  # Import Support Vector Machine
from sklearn.metrics import accuracy_score

import nltk
nltk.download('stopwords')

# Load the dataset and combine 'author' and 'title'
news_dataset = pd.read_csv(r"C:\Users\Tech Line\Desktop\1111\fake-new-detection-machine-learning\data set\train.csv")

# replacing the null values with empty string
news_dataset = news_dataset.fillna('')

news_dataset['content'] = news_dataset['author'].fillna('') + ' ' + news_dataset['title'].fillna('') 

# counting the number of missing values in the dataset
news_dataset.isnull().sum()

print(news_dataset.isnull().sum())

# Drop unnecessary columns
X = news_dataset.drop(columns='label')
Y = news_dataset['label']

# Text preprocessing using stemming
port_stem = PorterStemmer()
stop_words = set(stopwords.words('english'))

def stemming(content):
    content = re.sub('[^a-zA-Z]', ' ', content.lower())
    words = [port_stem.stem(word) for word in content.split() if word not in stop_words]
    return ' '.join(words)

news_dataset['content'] = news_dataset['content'].apply(stemming)

# Separating the data and label
X = news_dataset['content'].values
Y = news_dataset['label'].values

# Converting the textual data to numerical data
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(X)

# Splitting the data
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2, stratify=Y, random_state=2)


# Training a Support Vector Machine model
svm_model = SVC()
svm_model.fit(X_train, Y_train)

# Evaluating the Support Vector Machine model
svm_training_data_accuracy = accuracy_score(svm_model.predict(X_train), Y_train)
svm_test_data_accuracy = accuracy_score(svm_model.predict(X_test), Y_test)

print('Accuracy score on the training data (SVM):', svm_training_data_accuracy)
print('Accuracy score on the test data (SVM):', svm_test_data_accuracy)

X_new = X_test[5]


# Prediction using SVM
svm_prediction = svm_model.predict(X_new.reshape(1, -1))
print('Prediction using SVM:', svm_prediction[0])

if svm_prediction[0] == 0:
    print('The news is Real (SVM)')
else:
    print('The news is Fake (SVM)')

[nltk_data] Downloading package stopwords to C:\Users\Tech
[nltk_data]     Line\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


id         0
title      0
author     0
text       0
label      0
content    0
dtype: int64
Accuracy score on the training data (SVM): 0.9990985576923077
Accuracy score on the test data (SVM): 0.9889423076923077
Prediction using SVM: 1
The news is Fake (SVM)
