In [53]:
import pandas as pd
import numpy as np
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import pickle

import re
import string
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, MinMaxScaler
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity as cs
from sklearn.pipeline import Pipeline
from sklearn.naive_bayes import MultinomialNB
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import ComplementNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

from wordcloud import WordCloud

import joblib

import warnings

In [54]:
nltk.download('stopwords',quiet=True)
nltk.download('punkt',quiet=True)
nltk.download('wordnet',quiet=True)
nltk.download('omw-1.4',quiet=True)

True

In [55]:
default_stemmer = PorterStemmer()
default_stopwords = stopwords.words('english')
default_stopwords = default_stopwords + ['said', 'would','even','according','could','year',
                                         'years','also','new','people','old,''one','two','time',
                                         'first','last','say','make','best','get','three','make',
                                         'year old','told','made','like','take','many','set','number',
                                         'month','week','well','back']
shortword = re.compile(r'\W*\b\w{1,4}\b\d')
BAD_SYMBOLS_RE = re.compile("[^a-zA-Z,\d]")
REPLACE_IP_ADDRESS = re.compile(r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b')
REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|@,;]')
def clean_text(text, ):

    def tokenize_text(text):
        return [w for s in sent_tokenize(text) for w in word_tokenize(s) if len(w)>=3]

    def preprocessing_text(text):
        text = text.lower()
        text=text.replace('\n',' ').replace('\xa0',' ').replace('-',' ').replace('ó','o').replace('ğ','g').replace('á','a').replace("'"," ")
        text=re.sub(r'\d+','', text)
        text=re.sub(r'http\S+', '', text)
        text=BAD_SYMBOLS_RE.sub(' ', text)
        text=REPLACE_IP_ADDRESS.sub('', text)
        text=REPLACE_BY_SPACE_RE.sub(' ', text)
        text=' '.join(word for word in text.split() if len(word)>3)
        
        return text

    def remove_special_characters(text, characters=string.punctuation.replace('-', '')):
        tokens = tokenize_text(text)
        pattern = re.compile('[{}]'.format(re.escape(characters + '0123456789')))
        return ' '.join(filter(None, [pattern.sub('', t) for t in tokens]))

    def stem_text(text, stemmer=default_stemmer):
        tokens = tokenize_text(text)
        return ' '.join([stemmer.stem(t) for t in tokens])

    def lemm_text(text, lemm=WordNetLemmatizer()):
        tokens = tokenize_text(text)
        return ' '.join([lemm.lemmatize(t) for t in tokens])

    def remove_stopwords(text, stop_words=default_stopwords):
        tokens = [w for w in tokenize_text(text) if w not in stop_words]
        return ' '.join(tokens)
    
    text = text.strip(' ') # strip whitespaces
    text = text.lower() # lowercase
    #text = stem_text(text) # stemming
    text=preprocessing_text(text)
    text = remove_special_characters(text) # remove punctuation and symbols
    text = lemm_text(text) # lemmatizer
    text = remove_stopwords(text) # remove stopwords

    return text

In [56]:
df = pd.read_csv('MN-DS-news-classification.csv')

In [57]:
df['text']= df[['title', 'content']].apply(lambda x: ' . '.join(x.astype(str)),axis=1)

In [58]:
df['text']=df['text'].apply(clean_text)

In [59]:
def encode_labels(y):
    le = LabelEncoder()
    return le.fit_transform(y), le

In [60]:
def train_nb(X, y):
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2,random_state=42)
    
    # Define a pipeline combining a text feature extractor with multi lable classifier
    NB_pipeline = Pipeline([
                    ('clf', OneVsRestClassifier(MultinomialNB())),
                ])

    NB_pipeline.fit(X_train, y_train)
    # compute the testing accuracy
    prediction = NB_pipeline.predict(X_test)
    print('Precision is {}'.format(precision_score(y_test, prediction, average='macro')))
    print('Recall is {}'.format(recall_score(y_test, prediction, average='macro')))
    print('F1:', f1_score(y_test, prediction, average='macro'))
    
    return NB_pipeline

In [61]:
y_level_1, le_level_1 = encode_labels(df['category_level_1'])
y_level_2, le_level_2 = encode_labels(df['category_level_2'])

In [62]:
def get_tfidf(X): 
    vectorizer = TfidfVectorizer(strip_accents='unicode', 
                                 analyzer='word', 
                                 ngram_range=(1,2), 
                                 norm='l2', 
                                 max_features=50000)
    vectors = vectorizer.fit_transform(X)
    return vectors, vectorizer

In [63]:
X_tfidf, vectorizer = get_tfidf(df['text'])

In [64]:
model1 = train_nb(X_tfidf, y_level_1)
model2 = train_nb(X_tfidf, y_level_2)

Precision is 0.8019714674627598
Recall is 0.6310102334213202
F1: 0.649460198225175
Precision is 0.628225877944608
Recall is 0.6022412871407946
F1: 0.5833794001248308


In [65]:
joblib.dump({
    'model_level_1': model1,
    'model_level_2': model2,
    'le_level_1': le_level_1,
    'le_level_2': le_level_2,
    'vectorizer': vectorizer
}, 'multi_label_models.pkl')


['multi_label_models.pkl']

Load model

In [72]:
loaded_models = joblib.load('multi_label_models.pkl')

In [73]:
loaded_model1 = loaded_models['model_level_1']
loaded_model2 = loaded_models['model_level_2']

In [74]:
def load_model(filepath):
    return joblib.load(filepath)

In [82]:
def predict_with_confidence(new_text, loaded_models):

    model1 = loaded_models['model_level_1']
    model2 = loaded_models['model_level_2']
    le_level_1 = loaded_models['le_level_1']
    le_level_2 = loaded_models['le_level_2']
    vectorizer = loaded_models['vectorizer']
    

    cleaned_text = clean_text(new_text)
    
 
    new_tfidf = vectorizer.transform([cleaned_text])
    

    prediction1 = model1.predict(new_tfidf)
    prediction2 = model2.predict(new_tfidf)
    
    prob1 = model1.predict_proba(new_tfidf)
    prob2 = model2.predict_proba(new_tfidf)
    

    confidence1 = prob1.max(axis=1)
    confidence2 = prob2.max(axis=1)
    

    predicted_label1 = le_level_1.inverse_transform(prediction1)
    predicted_label2 = le_level_2.inverse_transform(prediction2)
    
    return predicted_label1, confidence1, predicted_label2, confidence2

new_text = "Physician, Father and Caretaker of 29 Year Old Autistic Man Found Brutally Murdered "

predicted_categories1, confidence1, predicted_categories2, confidence2 = predict_with_confidence(new_text, loaded_models)

print("Predicted categories for Level 1:", predicted_categories1)
print("Confidence for Level 1:", confidence1)
print("Predicted categories for Level 2:", predicted_categories2)
print("Confidence for Level 2:", confidence2)


Predicted categories for Level 1: ['society']
Confidence for Level 1: [0.18754679]
Predicted categories for Level 2: ['crime']
Confidence for Level 2: [0.02633024]
