In [4]:
import pandas as pd
import numpy as np
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
import pickle
from sklearn import *

In [7]:
# Pull the two objects exposed by the local `script` module.
# `df` is used below as a DataFrame with 'text' and 'label' columns;
# `data_frame` is not referenced by any cell visible in this notebook section.
import script

df, data_frame = script.df, script.data_frame

In [8]:
# Model input is the 'text' column; the target is the 'label' column.
x, y = df['text'], df['label']

In [9]:
# Report, per column, whether it contains any missing value.
# This only inspects the frame; nothing is removed or modified here.
df.isna().any()

title    False
text     False
label    False
dtype: bool

In [10]:
# Hold out 20% of the rows as the test set.
# random_state pins the shuffle so that Restart & Run All reproduces the same
# split (and therefore the same metrics); without it every run evaluates on a
# different random subset of rows.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42
)

In [11]:
# Convert each document into a TF-IDF vector. English stop words are dropped
# and terms that appear in more than 70% of the documents are ignored (max_df).
# The vocabulary and IDF weights are learned from the training split only and
# then applied unchanged to the test split.
tfidf_vectorizer = TfidfVectorizer(
    stop_words="english",
    max_df=0.7,
)
tfidf_train = tfidf_vectorizer.fit_transform(x_train)
tfidf_test = tfidf_vectorizer.transform(x_test)

In [12]:
# Train a Passive-Aggressive classifier on the TF-IDF features and report its
# accuracy on the held-out split.
# The classifier shuffles the training samples between epochs, so random_state
# is fixed to make the fitted model (and the printed accuracy) reproducible.
pac = PassiveAggressiveClassifier(max_iter=50, random_state=42)
pac.fit(tfidf_train, y_train)
y_pred = pac.predict(tfidf_test)
score = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(score*100, 2)}%')

Accuracy: 94.24%


In [13]:
# Chain TF-IDF vectorization and a Multinomial Naive Bayes classifier into one
# estimator, so raw text can be passed directly to fit / predict / score.
pipeline_steps = [
    ('tfidf', TfidfVectorizer(stop_words="english")),
    ('nbmodel', MultinomialNB()),
]
pipeline = Pipeline(pipeline_steps)


In [14]:
# Fit both pipeline stages (TF-IDF, then Naive Bayes) on the raw training text.
pipeline.fit(X=x_train, y=y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer(stop_words='english')),
                ('nbmodel', MultinomialNB())])

In [15]:
# Accuracy of the fitted pipeline on the held-out split.
# NOTE: this rebinds `score`, replacing the value assigned in the
# Passive-Aggressive cell above.
score = pipeline.score(X=x_test, y=y_test)
print('accuracy', score)

accuracy 0.835043409629045


In [16]:
# Predicted labels for the held-out texts; reused by the classification report
# and the confusion matrix below.
pred = pipeline.predict(X=x_test)

In [17]:
# Per-class precision, recall and F1 for the pipeline's predictions.
print(classification_report(y_true=y_test, y_pred=pred))

              precision    recall  f1-score   support

        FAKE       0.98      0.68      0.80       630
        REAL       0.76      0.98      0.86       637

    accuracy                           0.84      1267
   macro avg       0.87      0.83      0.83      1267
weighted avg       0.87      0.84      0.83      1267



In [18]:
# Confusion matrix for the pipeline's predictions:
# rows are the true labels, columns are the predicted labels.
print(confusion_matrix(y_true=y_test, y_pred=pred))

[[431 199]
 [ 10 627]]
