In [56]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
import spacy

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

from sklearn import metrics
import joblib

In [6]:
nlp = spacy.load("en_core_web_sm")

def preprocess_text(txt:str):
    txt = re.sub('[^a-zA-Z]', ' ', txt) 
    txt = txt.lower()
    txt = " ".join(txt.split()) 
    
    doc = nlp(txt)
    
    tokens_filtered = []
    # Iterate through tokens and append to list if its not stop word or punctuation mark
    for token in doc:
        if token.is_stop or token.is_punct:
            continue
        tokens_filtered.append(token.lemma_)
        
    return " ".join(tokens_filtered)

In [7]:
# Load data
plt.rcParams["figure.figsize"] = [10, 5]

df = pd.read_csv('df_file.csv')
df['Text'] = df['Text'].apply(lambda x:x.replace('\n',''))

In [8]:
# Remove duplicates
df.drop_duplicates(ignore_index = True, inplace=True)

# Preprocess text
df['prep_text'] = df['Text'].apply(preprocess_text)
df.head()

Unnamed: 0,Text,Label,prep_text
0,Budget to set scene for election Gordon Brown...,0,budget set scene election gordon brown seek ec...
1,Army chiefs in regiments decision Military ch...,0,army chief regiment decision military chief ex...
2,Howard denies split over ID cards Michael How...,0,howard deny split d card michael howard deny s...
3,Observers to monitor UK election Ministers wi...,0,observer monitor uk election minister invite i...
4,Kilroy names election seat target Ex-chat sho...,0,kilroy name election seat target ex chat host ...


In [12]:
vectorizer = TfidfVectorizer(sublinear_tf=True, min_df=5,ngram_range=(1, 2), stop_words='english')
features = vectorizer.fit_transform(df['prep_text']).toarray()

X = features
y = df['Label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y)

print(f'Rows used in training: {len(X_train)}')
print(f'Rows used in evaluation: {len(X_test)}')

Rows used in training: 1701
Rows used in evaluation: 426


In [47]:

training_alg = {'model':LogisticRegression()}
scores = {}

try:
    model = training_alg['model'].fit(X_train, y_train, 
        early_stopping_rounds=10,
        eval_metric='merror',
        eval_set=[(X_test, y_test)])
    
except TypeError:
    classifiers['model'].fit(X_train, y_train)
    
training_score = cross_val_score(training_alg['model'], X_train, y_train, cv=5, scoring='accuracy') 
avg_score = round(np.mean(training_score) * 100, 2)

joblib.dump(training_alg["model"], "model.joblib")

print(training_score)

[0.99120235 0.98823529 0.96176471 0.97352941 0.97058824]
