In [1]:
# Import relevant libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# nltk for text cleaning
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import re

# Sklearn modules
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, f1_score, r2_score
from sklearn import svm

from sqlalchemy import create_engine
from config import username, password, db_name
# Open a connection to the PostgreSQL server on localhost:5432, using the
# credentials and database name imported from the `config` module.
# NOTE(review): nothing visible in this notebook closes `connection`;
# consider closing it (or using a `with` block) once the data is loaded.
engine = create_engine(
    f'postgresql://{username}:{password}@localhost:5432/{db_name}'
)
connection = engine.connect()

In [None]:
# Read every row and column of the `news` table into a DataFrame
dataset = pd.read_sql(
    "SELECT * FROM news",
    connection,
)
# Preview the first rows as the cell output
dataset.head()

In [13]:
# TF-IDF vectoriser used to turn the text column into a sparse feature matrix.
#   max_features=44235  -> cap on the vocabulary size
#   stop_words='english' -> drop scikit-learn's built-in English stop words
#   lowercase=False      -> do not lower-case the text before tokenising
# NOTE(review): the built-in stop-word list is lower-case, so with
# lowercase=False capitalised forms (e.g. "The") would not be filtered unless
# the text was already lower-cased upstream — confirm.
tfidf = TfidfVectorizer(
    max_features=44235,
    stop_words='english',
    lowercase=False,
)

In [14]:
# Input documents and their target labels, taken from the loaded table
text = dataset['text']
y = dataset['label']

# Learn the TF-IDF vocabulary/weights from `text` and vectorise it in one step.
# NOTE(review): the vectoriser is fitted on every row here, before the
# train/test split performed in the following cell, so its vocabulary and IDF
# statistics also reflect the held-out rows. Consider fitting on the training
# text only and calling `transform` on the test text.
X = tfidf.fit_transform(text)

In [15]:
# Hold out 30% of the rows for evaluation; the fixed random_state makes the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

In [64]:
from sklearn import svm

# Support vector classifier with a linear kernel.
# Per the scikit-learn SVC documentation, `degree` applies only to the 'poly'
# kernel and `gamma` only to 'rbf'/'poly'/'sigmoid', so with kernel='linear'
# those two settings do not influence the fitted model.
SVM = svm.SVC(
    kernel='linear',
    C=1.0,
    degree=3,
    gamma='auto',
)

# fit() returns the estimator itself, so SVM_model refers to the same fitted
# object as SVM.
SVM_model = SVM.fit(X_train, y_train)

# Predict labels for the held-out test split
predictions_SVM = SVM.predict(X_test)

In [65]:
# Accuracy score: percentage of test rows whose predicted label matches the
# true label. Arguments are passed in scikit-learn's (y_true, y_pred) order,
# consistent with the other metric calls in this notebook.
print("SVM Accuracy Score = ",accuracy_score(y_test, predictions_SVM)*100)

SVM Accuracy Score =  99.21633637254163


In [66]:
# Confusion matrix for the test split: rows are the true labels, columns are
# the predicted labels.
cf_matrix = confusion_matrix(y_true=y_test, y_pred=predictions_SVM)

print(cf_matrix)

[[6467   48]
 [  56 6700]]


In [69]:
# Headline metrics on the test split:
#   svm_accuracy -> accuracy as a percentage, rounded to 2 decimal places
#   svm_f1       -> support-weighted F1 score, rounded to 3 decimal places
svm_accuracy = round(
    accuracy_score(y_test, predictions_SVM) * 100,
    2,
)
svm_f1 = round(
    f1_score(y_test, predictions_SVM, average='weighted'),
    3,
)

# Full evaluation report: per-class metrics, confusion matrix, then the
# headline numbers computed above.
print("classification report :")
print(classification_report(y_test, predictions_SVM))
print("------------------------------------------------------")
print("confusion matrix : ")
print(confusion_matrix(y_test, predictions_SVM))
print("------------------------------------------------------")
print("Accuracy : " , svm_accuracy , "%")
print("f1_score : " , svm_f1)

classification report :
              precision    recall  f1-score   support

           0       0.99      0.99      0.99      6515
           1       0.99      0.99      0.99      6756

    accuracy                           0.99     13271
   macro avg       0.99      0.99      0.99     13271
weighted avg       0.99      0.99      0.99     13271

------------------------------------------------------
confusion matrix : 
[[6467   48]
 [  56 6700]]
------------------------------------------------------
Accuracy :  99.22 %
f1_score :  0.992


In [68]:
import joblib

# Persist the fitted SVM classifier.
joblib.dump(SVM_model, 'svm_model.pkl')

# Persist the fitted TF-IDF vectoriser to its OWN file. It was previously
# written to 'svm_model.pkl' as well, which truncated and overwrote the model
# saved just above, and it went through an open() handle that was never closed.
# Passing a path lets joblib open and close the file itself.
# NOTE(review): the filename is chosen here; make sure any code that loads the
# vectoriser reads 'tfidf_vectorizer.pkl'.
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')