In [9]:
import numpy as np
import pandas as pd
import re
import spacy
import joblib
import sqlite3

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

con = sqlite3.connect("dataset.db")

In [10]:
nlp = spacy.load("en_core_web_sm")

def preprocess_text(txt:str):
    txt = re.sub('[^a-zA-Z]', ' ', txt) 
    txt = txt.lower()
    txt = " ".join(txt.split()) 
    
    doc = nlp(txt)
    
    tokens_filtered = []
    # Iterate through tokens and append to list if its not stop word or punctuation mark
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        tokens_filtered.append(token.lemma_)
        
    return " ".join(tokens_filtered)

In [16]:
df = pd.read_sql_query("SELECT * FROM Dataset", con)

df.head()

Unnamed: 0,text,prep_text,label
0,Budget to set scene for election Gordon Brown...,budget set scene election gordon brown seek ec...,0
1,Army chiefs in regiments decision Military ch...,army chief regiment decision military chief ex...,0
2,Howard denies split over ID cards Michael How...,howard deny split d card michael howard deny s...,0
3,Observers to monitor UK election Ministers wi...,observer monitor uk election minister invite i...,0
4,Kilroy names election seat target Ex-chat sho...,kilroy name election seat target ex chat host ...,0


In [18]:
vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=5,ngram_range=(1, 2), stop_words='english')
features = vectorizer.fit_transform(df['prep_text']).toarray()

X = features
y = df["label"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y)

In [19]:
training_alg = {'model':LogisticRegression()}

try:
    training_alg['model'].fit(X_train, y_train, 
        early_stopping_rounds=10,
        eval_metric='merror',
        eval_set=[(X_test, y_test)])
        
except TypeError:
    training_alg['model'].fit(X_train, y_train)

training_score = cross_val_score(training_alg['model'], X_train, y_train, cv=5, scoring='accuracy') 
avg_score = round(np.mean(training_score) * 100, 2)

print(f"Training score: {training_score}")
print(f"Average score: {avg_score}")

Training score: [0.97653959 0.97647059 0.97941176 0.97352941 0.98235294]


NameError: name 'avg_score' is not defined