In [13]:
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.naive_bayes import MultinomialNB
import pickle
from sklearn import *

In [14]:
#import data
import script
df = script.df
data_frame = script.data_frame

In [15]:
x = df['text']
y = df['label']

In [16]:
#divide the data into train and test set, 20% of rows
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 0.2)

In [17]:
#divide into a word and find  meaningful words 
tfidf_vectorizer = TfidfVectorizer(stop_words = "english", max_df = 0.7)
tfidf_train = tfidf_vectorizer.fit_transform(x_train)
tfidf_test = tfidf_vectorizer.transform(x_test)

In [41]:
#classify the data and calculate the accracy of the predict value
pac = PassiveAggressiveClassifier(max_iter = 50)
pac.fit(tfidf_train, y_train)
y_pred = pac.predict(tfidf_test)
score = accuracy_score(y_test, y_pred)
print(f'Accuracy: {round(score*100, 2)}%')

Accuracy: 94.32%


In [31]:
#makes the data into the machine learning model fully automated. Simplify above code with a pipeline to vectorize and classify test content
pipeline = Pipeline([('tfidf', TfidfVectorizer(stop_words = "english")),('pamodel',PassiveAggressiveClassifier())])


In [32]:
#run train data
pipeline.fit(x_train, y_train)

Pipeline(steps=[('tfidf', TfidfVectorizer(stop_words='english')),
                ('pamodel', PassiveAggressiveClassifier())])

In [37]:
#calculate the acuuracy of the test score
score=pipeline.score(x_test, y_test)
print('accuracy:', score)

accuracy: 0.9479084451460142


In [43]:
# predict the test score with a pipeline
pred = pipeline.predict(x_test)

In [44]:
#classification report, performance evaluation table 
print(classification_report(y_test, pred))

              precision    recall  f1-score   support

        FAKE       0.95      0.95      0.95       638
        REAL       0.95      0.94      0.95       629

    accuracy                           0.95      1267
   macro avg       0.95      0.95      0.95      1267
weighted avg       0.95      0.95      0.95      1267



In [45]:
with open('model.pkl', 'wb') as handle: #write pkl file for simple webapp
    pickle.dump(pipeline, handle, protocol=pickle.HIGHEST_PROTOCOL)