In [None]:
import pandas as pd
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.svm import LinearSVC
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import f1_score, classification_report, accuracy_score, precision_score, recall_score
import joblib

# Load the headerless two-column CSV and name the columns explicitly.
data = pd.read_csv(
    "model/data/balanced_raw_data.csv",
    header=None,
    names=['text', 'label'],
)
X, y = data['text'], data['label']

# Hold out 20% of the rows for the final evaluation; the fixed seed makes the
# split repeatable (the evaluation cell further down re-creates it with the
# same test_size and random_state).
xtrain, xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)

# Three-stage text classifier:
#   tfidf  -> TF-IDF features from the raw text
#   select -> keep the k features with the highest chi-squared score
#   svc    -> linear SVM with class-balanced weights
pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer()),
    ('select', SelectKBest(score_func=chi2)),
    ('svc', LinearSVC(class_weight='balanced', max_iter=10000)),
])

# Hyper-parameter grid searched exhaustively below. Keys use the
# "<step name>__<parameter>" convention so they are routed to the right stage.
param_grid = dict(
    tfidf__max_df=[0.9, 1.0],
    tfidf__min_df=[1, 2],
    tfidf__ngram_range=[(1, 1), (1, 2)],
    select__k=['all', 1000, 5000],
    svc__C=[0.1, 1, 10],
)

# 5-fold cross-validated grid search that ranks candidates by F1 score.
grid = GridSearchCV(
    estimator=pipeline,
    param_grid=param_grid,
    scoring='f1',
    cv=5,
    verbose=3,
    n_jobs=-1,
)
grid.fit(xtrain, ytrain)


print("best parameters found:", grid.best_params_, sep="\n")
print("best cross-validation F1 score:", grid.best_score_)

# Score the search result on the held-out test set.
y_pred = grid.predict(xtest)
accuracy, precision, recall, f1 = (
    metric(ytest, y_pred)
    for metric in (accuracy_score, precision_score, recall_score, f1_score)
)

for caption, value in (
    ("the accuracy is:", accuracy),
    ("the precision is :", precision),
    ("the recall:", recall),
    ("the F1 score:", f1),
):
    print(caption, value)
print("this is the classification report:", classification_report(ytest, y_pred), sep="\n")

# Persist the best pipeline found by the search so it can be reloaded later.
joblib.dump(grid.best_estimator_, 'final_svm.pkl')


Fitting 5 folds for each of 72 candidates, totalling 360 fits


In [None]:

import pandas as pd
import joblib
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn.model_selection import train_test_split

# Re-evaluate the persisted model on the held-out test split, without
# re-running the (expensive) grid search from the training cell.
data = pd.read_csv("model/data/balanced_raw_data.csv", header=None, names=['text', 'label'])
X = data['text']
y = data['label']

# test_size and random_state MUST match the training cell: that is what makes
# this call reproduce the exact same held-out rows. Changing either value here
# would score the model on rows it was trained on.
# train_test_split returns [X_train, X_test, y_train, y_test]; the [1::2] slice
# keeps only the two test halves so the unused training partitions are not
# kept alive under throwaway names.
xtest, ytest = train_test_split(X, y, test_size=0.2, random_state=42)[1::2]

# NOTE: joblib.load unpickles the file, which can execute arbitrary code.
# Only load a 'final_svm.pkl' that was produced by the training cell of this
# notebook (or another trusted source).
model = joblib.load('final_svm.pkl')

y_pred = model.predict(xtest)

accuracy = accuracy_score(ytest, y_pred)
precision = precision_score(ytest, y_pred)
recall = recall_score(ytest, y_pred)
f1 = f1_score(ytest, y_pred)

print("the accuracy is:", accuracy)
print("the precision is :", precision)
print("the recall:", recall)
print("the F1 score:", f1)
# Per-class breakdown, matching what the training cell prints; the function
# was already imported by this cell but never called.
print("this is the classification report:")
print(classification_report(ytest, y_pred))




Accuracy: 0.7212569444444444
Precision: 0.352530382731725
Recall: 0.8138950542317518
F1 Score: 0.4919692693237479

Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.70      0.81    120121
           1       0.35      0.81      0.49     23879

    accuracy                           0.72    144000
   macro avg       0.65      0.76      0.65    144000
weighted avg       0.85      0.72      0.76    144000

