In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline

import seaborn as sns
import pandas as pd
import numpy as np
import joblib

from skl2onnx import convert_sklearn
from skl2onnx.common.data_types import Int64TensorType
import onnxruntime as rt

In [2]:
# Load the two source files and tag every row with its class label
# (0 for the fake-news file, 1 for the true-news file).
fake = pd.read_csv('./dataset/Fake.csv', delimiter=',').assign(label=0)
true = pd.read_csv('./dataset/True.csv', delimiter=',').assign(label=1)

In [3]:
# Join both datasets into one frame (fake rows first, then true rows).
# ignore_index=True gives the result a fresh 0..n-1 index; without it each source
# frame keeps its own labels, so every label would appear twice and any later
# label-based lookup or index alignment would silently match two rows.
all_data = pd.concat([fake, true], axis=0, ignore_index=True)

In [4]:
# Keep only what the model uses: discard the article body, subject and date columns.
unused_columns = ['text', 'subject', 'date']
data = all_data.drop(columns=unused_columns)

In [5]:
# Features are the headline strings; targets are the 0/1 labels assigned at load time.
x, y = data['title'], data['label']

In [6]:
# Split x and y into training (80%) and test (20%) data.
# random_state pins the shuffle so the split -- and therefore the fitted vocabulary
# size, the reported accuracies and the exported models -- is identical on every
# Restart & Run All. stratify=y keeps the fake/true ratio the same in both subsets.
x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.2, random_state=42, stratify=y
)

In [7]:
# Bag-of-words features: the vocabulary is learned from the training titles only and
# then reused, unchanged, to encode the test titles.
vectorizer = CountVectorizer(
    lowercase=True,
    min_df=1,
    max_df=1.0,
    ngram_range=(1, 1),
)
x_train_vector = vectorizer.fit_transform(x_train)
x_test_vector = vectorizer.transform(x_test)

# Data information: vocabulary size and the shapes of both encoded matrices.
vocab = vectorizer.vocabulary_
print(
    "Vocab size = {}".format(len(vocab)),
    "Size of training data = {}".format(x_train_vector.shape),
    "Size of test data = {}".format(x_test_vector.shape),
    sep="\n",
)

Vocab size = 19528
Size of training data = (35918, 19528)
Size of test data = (8980, 19528)


In [8]:
# L2-penalised logistic regression trained on the count vectors.
classifier = LogisticRegression(
    penalty="l2",
    C=1,
    fit_intercept=True,
    max_iter=200,
)
classifier.fit(x_train_vector, y_train)

LogisticRegression(C=1, max_iter=200)

In [9]:
# Mean accuracy of the standalone classifier on the data it was fitted on
# and on the held-out test data.
train_accuracy = classifier.score(x_train_vector, y_train)
test_accuracy = classifier.score(x_test_vector, y_test)
print("Training accuracy = {}".format(train_accuracy))
print("Test accuracy = {}".format(test_accuracy))

Training accuracy = 0.9892532991814689
Test accuracy = 0.9614699331848552


# Using a pipeline

In [10]:
# Bundle vectorisation and classification so raw title strings can be passed straight in.
pipeline = Pipeline(
    steps=[
        ('vectorizer', CountVectorizer()),
        ('classifier', LogisticRegression(fit_intercept=True, penalty="l2", C=1, max_iter=200)),
    ]
)

In [11]:
# Fit both steps on the raw training titles; the fitted pipeline is the cell's output.
pipeline.fit(X=x_train, y=y_train)

Pipeline(steps=[('vectorizer', CountVectorizer()),
                ('classifier', LogisticRegression(C=1, max_iter=200))])

In [16]:
# Accuracy of the end-to-end pipeline on the held-out raw titles.
pipeline.score(X=x_test, y=y_test)

0.9614699331848552

In [19]:
# Persist the fitted pipeline to disk with joblib.
joblib.dump(value=pipeline, filename='./model/pipeline.pkl')

['./model/pipeline.pkl']

# Save the pipeline in ONNX format (.onnx)

In [187]:
from skl2onnx.common.data_types import StringTensorType

In [188]:
# Declare the single graph input: a string tensor whose only dimension is left
# unspecified (None), named 'StringTensorType' in the exported model.
initial_type = [('StringTensorType', StringTensorType([None]))]
onx = convert_sklearn(pipeline, initial_types=initial_type)

# Serialise the converted model and write the bytes to disk.
serialized_pipeline = onx.SerializeToString()
with open("./model/pipeline.onnx", "wb") as onnx_file:
    onnx_file.write(serialized_pipeline)



In [178]:
# Reload the exported pipeline with onnxruntime and look up the names of its
# first input and first output.
sess = rt.InferenceSession("./model/pipeline.onnx")
input_name, label_name = sess.get_inputs()[0].name, sess.get_outputs()[0].name

# Only one output is requested, so the returned list holds exactly one array.
(pred_onx,) = sess.run(
    [label_name],
    {input_name: x_test.to_list()},
)

In [179]:
# Show the array returned by the ONNX pipeline session for the test titles.
pred_onx

array([1, 1, 1, ..., 1, 1, 0], dtype=int64)

# Save the classifier in ONNX format (.onnx)

In [129]:
# Declare the ONNX input as a batch of int64 count vectors. The feature dimension is
# read from the fitted classifier rather than hard-coded, so it always equals the
# number of columns the model was trained on; a literal goes stale as soon as the
# train/test split (and with it the vocabulary) changes.
n_features = classifier.coef_.shape[1]
initial_type = [('int64_input', Int64TensorType([None, n_features]))]
onx = convert_sklearn(classifier, initial_types=initial_type)

# Serialise the converted model and write the bytes to disk.
with open("./model/classifier.onnx", "wb") as f:
    f.write(onx.SerializeToString())

In [130]:
# Reload the exported classifier with onnxruntime and look up the names of its
# first input and first output.
sess = rt.InferenceSession("./model/classifier.onnx")
input_name, label_name = sess.get_inputs()[0].name, sess.get_outputs()[0].name

# The count matrix is converted to a dense array and cast to int64 before being fed
# to the session. Only one output is requested, so the result list has one array.
(pred_onx,) = sess.run(
    [label_name],
    {input_name: x_test_vector.toarray().astype(np.int64)},
)