In [20]:
import re
import nltk
import random
import numpy as np
import pandas as pd
from textblob import Word
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score

In [22]:
# Fetch the 'omw-1.4' NLTK package (Open Multilingual WordNet data).
# NOTE(review): presumably required by the WordNet lemmatizer behind
# textblob's Word.lemmatize() used in prepare_dataset — confirm.
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [23]:
# Shell command: download the NLTK corpora TextBlob relies on
# (the output below lists brown, punkt, wordnet, the perceptron tagger, conll2000, movie_reviews).
!python3 -m textblob.download_corpora

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Package brown is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package conll2000 to /root/nltk_data...
[nltk_data]   Package conll2000 is already up-to-date!
[nltk_data] Downloading package movie_reviews to /root/nltk_data...
[nltk_data]   Package movie_reviews is already up-to-date!
Finished.


## Data preparation

In [24]:
def clean_string(string):
    """Normalise a raw article into lower-case, space-separated tokens.

    The substitutions below are applied strictly in order:
      * contraction suffixes ('s, 've, n't, 're, 'd, 'll) are dropped
        (note that "don't" therefore becomes "do");
      * commas, parentheses, question marks and apostrophes are removed,
        while '!' is padded with spaces so it survives as its own token;
      * every remaining character outside ``A-Za-z0-9(),!?'` `` becomes a space;
      * any digit, together with the word characters that follow it, is removed;
      * runs of two or more whitespace characters collapse to one space.

    Args:
        string: the text to clean.

    Returns:
        The cleaned text, stripped of surrounding whitespace and lower-cased.
    """
    # (pattern, replacement) pairs; order matters because later rules see the
    # output of earlier ones.
    substitutions = (
        (r"\'s", ""),
        (r"\'ve", ""),
        (r"n\'t", ""),
        (r"\'re", ""),
        (r"\'d", ""),
        (r"\'ll", ""),
        (r",", ""),
        (r"!", " ! "),
        (r"\(", ""),
        (r"\)", ""),
        (r"\?", ""),
        (r"'", ""),
        (r"[^A-Za-z0-9(),!?\'\`]", " "),
        (r"[0-9]\w+|[0-9]", ""),
        (r"\s{2,}", " "),
    )

    text = string
    for pattern, replacement in substitutions:
        text = re.sub(pattern, replacement, text)

    return text.strip().lower()


def prepare_dataset(dataset):
    """Turn the raw dataset into TF-IDF train / validation / test splits.

    Every entry of the 'news' column is cleaned with ``clean_string`` and
    lemmatized token by token; the 'type' column provides the labels.
    80% of the documents are used for training; the remaining 20% are
    vectorized with the vocabulary fitted on the training part and then
    split in half into validation and test sets.

    Args:
        dataset: object with 'news' and 'type' columns supporting ``.tolist()``.

    Returns:
        ``(X_train, X_val, X_test, y_train, y_val, y_test, vect)`` where
        ``vect`` is the fitted ``TfidfVectorizer``.
    """
    raw_texts = dataset['news'].tolist()
    labels = dataset['type'].tolist()

    print("\n--------------------------------------------------------")
    print("------------------- DATA PREPARATION -------------------")
    print("--------------------------------------------------------\n")
    print("Tokenization & lemmatization", end='', flush=True)

    documents = []
    for position, raw_text in enumerate(raw_texts):
        lemmas = [Word(token).lemmatize() for token in clean_string(raw_text).split()]
        documents.append(' '.join(lemmas))
        # One progress dot every 100 documents (including the first one).
        if position % 100 == 0:
            print('.', end='', flush=True)
    print("DONE!\n")

    train_docs, holdout_docs, train_labels, holdout_labels = train_test_split(
        documents, labels, test_size=0.20, random_state=42)

    # The vocabulary is learnt from the training documents only.
    vect = TfidfVectorizer(stop_words='english', min_df=2)
    X_train = vect.fit_transform(train_docs)
    y_train = np.array(train_labels)
    holdout_matrix = vect.transform(holdout_docs)
    holdout_targets = np.array(holdout_labels)

    # Split the held-out 20% evenly into validation and test sets.
    X_val, X_test, y_val, y_test = train_test_split(
        holdout_matrix, holdout_targets, test_size=0.50, random_state=42)

    print("Train set:" + str(X_train.shape))
    print("Validation set:" + str(X_val.shape))
    print("Test set:" + str(X_test.shape))

    return X_train, X_val, X_test, y_train, y_val, y_test, vect

## Models

In [25]:

def train_RF(X_train, y_train, random_state=42):
    """Fit a random forest on the training split and return it.

    Args:
        X_train: training feature matrix.
        y_train: training labels.
        random_state: seed forwarded to ``RandomForestClassifier`` so that
            repeated runs of the notebook build the same forest. Pass
            ``None`` to get a differently seeded forest on every call.

    Returns:
        The fitted ``RandomForestClassifier``.
    """
    print("-------------------")
    print("-- RANDOM FOREST --")
    print("-------------------")
    # Seeded like the other stochastic steps in this notebook (the splits and
    # the MLP both fix their random_state) so reported scores are reproducible.
    model = RandomForestClassifier(n_estimators=300, max_depth=150, n_jobs=1,
                                   random_state=random_state)
    model.fit(X_train, y_train)
    return model


def train_NN(X_train, y_train):
    """Fit a two-hidden-layer (128, 64) MLP classifier and return it.

    Args:
        X_train: training feature matrix.
        y_train: training labels.

    Returns:
        The fitted ``MLPClassifier``.
    """
    for banner_line in ("-------------------",
                        "- NEURAL  NETWORK -",
                        "-------------------"):
        print(banner_line)

    # NOTE(review): with solver='lbfgs', scikit-learn documents several of
    # these settings (early_stopping, validation_fraction, beta_1/beta_2,
    # epsilon, momentum, learning-rate options, ...) as only effective for
    # the 'sgd'/'adam' solvers — confirm whether 'lbfgs' is the intended one.
    hyperparameters = {
        'activation': 'relu',
        'alpha': 1e-05,
        'batch_size': 'auto',
        'beta_1': 0.9,
        'beta_2': 0.999,
        'early_stopping': True,
        'epsilon': 1e-08,
        'hidden_layer_sizes': (128, 64),
        'learning_rate': 'constant',
        'learning_rate_init': 0.001,
        'max_iter': 400,
        'momentum': 0.9,
        'n_iter_no_change': 10,
        'nesterovs_momentum': True,
        'power_t': 0.5,
        'random_state': 1,
        'shuffle': True,
        'solver': 'lbfgs',
        'tol': 0.0001,
        'validation_fraction': 0.1,
        'verbose': False,
        'warm_start': False,
    }
    network = MLPClassifier(**hyperparameters)
    network.fit(X_train, y_train)
    return network


def train_SVC(X_train, y_train):
    """Fit a linear-kernel support vector classifier and return it.

    Args:
        X_train: training feature matrix.
        y_train: training labels.

    Returns:
        The fitted ``SVC``.
    """
    for banner_line in ("-------------------",
                        "------- SVM -------",
                        "-------------------"):
        print(banner_line)

    classifier = SVC(kernel='linear', gamma='auto')
    classifier.fit(X_train, y_train)
    return classifier


def eval_model(model, X_val, y_val):
    """Print the classification report and accuracy of ``model`` on a split.

    Args:
        model: fitted estimator exposing ``predict``.
        X_val: feature matrix to predict on.
        y_val: true labels for ``X_val``.

    Returns:
        None; the results are only printed.
    """
    predictions = model.predict(X_val)
    report = classification_report(y_val, predictions)
    accuracy = accuracy_score(y_val, predictions)
    print(report)
    print("\nAccuracy: ", accuracy)


def test_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    return acc

## Configuration and demo helpers

In [26]:
# News categories a user can be interested in; assign_preferences draws from
# this list and run_demo compares the predicted article type against it.
PREFERENCES = ["business", "entertainment", "politics", "sport", "tech"]

class bcolors:
    """ANSI escape sequences used to colour and style the demo's console output."""
    # Foreground colours.
    PURPLE = '\033[95m'
    BLUE = '\033[94m'
    GREEN = '\033[92m'
    YELLOW = '\033[93m'
    RED = '\033[91m'
    # Resets every colour/attribute set before it.
    ENDC = '\033[0m'
    # Text attributes.
    BOLD = '\033[1m'
    UNDERLINE = '\033[4m'


def assign_preferences(users, min_prefs=1, max_prefs=3):
    """Randomly assign each user a set of distinct news categories.

    For every user a count between ``min_prefs`` and ``max_prefs`` (inclusive)
    is drawn, and that many different entries of ``PREFERENCES`` are sampled.
    Both bounds are clamped to the number of available categories, so a short
    ``PREFERENCES`` list yields fewer preferences instead of raising.

    Args:
        users: iterable of user identifiers (used as dictionary keys).
        min_prefs: smallest number of categories per user (default 1).
        max_prefs: largest number of categories per user (default 3).

    Returns:
        dict mapping each user to a list of distinct categories.
    """
    upper = max(0, min(max_prefs, len(PREFERENCES)))
    lower = max(0, min(min_prefs, upper))
    # random.sample draws without replacement and leaves PREFERENCES untouched.
    return {
        user: random.sample(PREFERENCES, random.randint(lower, upper))
        for user in users
    }


def predict_doc_type(doc, vect, model):
    """Predict the category of a single raw document.

    The text goes through the same preprocessing that prepare_dataset applies
    to the training corpus (``clean_string`` followed by per-token
    lemmatization), so its tokens match the vocabulary the vectorizer was
    fitted on.

    Args:
        doc: raw article text.
        vect: fitted vectorizer exposing ``transform``.
        model: fitted classifier exposing ``predict``.

    Returns:
        The predicted label for ``doc`` (first element of ``model.predict``).
    """
    tokens = clean_string(doc).split()
    # Without lemmatization, inflected forms would miss the lemma-based
    # vocabulary built at training time.
    doc_prepared = ' '.join(Word(token).lemmatize() for token in tokens)
    features = vect.transform([doc_prepared])
    return model.predict(features)[0]


# Interactive demo
def _print_preferences(users_with_preferences, arrow=" -> "):
    """Print the "USER PREFERENCES" listing, one line per user.

    ``arrow`` is the separator shown between a user and their categories.
    """
    print("--------------------------------------------------------\n")
    print("USER PREFERENCES:")
    for user in users_with_preferences:
        print(user, arrow, users_with_preferences[user])
    print("\n--------------------------------------------------------")


def run_demo(vect, model):
    """Run the interactive console demo.

    Random preferences are assigned to a fixed list of users; every pasted
    article is then classified with ``predict_doc_type`` and the users whose
    preferences contain the predicted topic are listed. Entering 'q' quits,
    'r' reassigns the preferences, and blank input is ignored. The loop also
    ends when the input stream is exhausted.

    Args:
        vect: fitted vectorizer forwarded to ``predict_doc_type``.
        model: fitted classifier forwarded to ``predict_doc_type``.
    """
    users = ["Maria   ", "Nacho   ", "Luca    ", "Adam    ", "Tom     ", "Mike    "]
    # Colour used to highlight each known topic; labels missing from this
    # table are still shown (bold, without a colour) instead of crashing.
    topic_colors = {
        "business": bcolors.YELLOW,
        "entertainment": bcolors.RED,
        "politics": bcolors.PURPLE,
        "sport": bcolors.BLUE,
        "tech": bcolors.GREEN,
    }

    users_with_preferences = assign_preferences(users)
    print("\n--------------------------------------------------------")
    print("------------------------- DEMO -------------------------")
    print("--------------------------------------------------------\n")
    _print_preferences(users_with_preferences)

    while True:
        try:
            print("---------------------------------------------------------------------------------------------------------------")
            test_corpus = input(bcolors.BOLD + "Paste here an article without newline characters (\\n) (Press 'q' to quit or 'r' to reassign the preferences):\n" + bcolors.ENDC)
            print("---------------------------------------------------------------------------------------------------------------")
        except EOFError:
            # No more input available (e.g. non-interactive run): leave the demo.
            break
        except ValueError:
            continue

        # Surrounding whitespace from a paste must not hide a command.
        command = test_corpus.strip()
        if command == "q":
            break
        if command == "r":
            users_with_preferences = assign_preferences(users)
            # The reassignment listing keeps its original separator spacing.
            _print_preferences(users_with_preferences, arrow="->  ")
            continue
        if not command:
            # Nothing to classify.
            continue

        result = predict_doc_type(test_corpus, vect, model)
        topic = bcolors.BOLD + topic_colors.get(result, "") + str(result).upper() + bcolors.ENDC
        print("--------------------------------------------------------\n")
        print(bcolors.BOLD + "This article talks about "+ topic + bcolors.BOLD + " and it's addressed to:"+bcolors.ENDC)
        for user in users_with_preferences:
            if result in users_with_preferences[user]:
                print(user)
        print("\n--------------------------------------------------------")

## Main

In [29]:
# DATA PREPARATION
# Load the labelled articles and build the TF-IDF train / validation / test splits.
dataset = pd.read_csv('dataset.csv', encoding="ISO-8859-1")
X_train, X_val, X_test, y_train, y_val, y_test, vect = prepare_dataset(dataset)

# MODELS TRAINING
print("\n--------------------------------------------------------")
print("------------------- MODELS  TRAINING -------------------")
print("--------------------------------------------------------\n")

# Fit each classifier and report it on the validation split, in the order
# Random Forest, SVM, Neural Network.
fitted_models = {}
for tag, trainer in (("RF", train_RF), ("SVC", train_SVC), ("NN", train_NN)):
    fitted_models[tag] = trainer(X_train, y_train)
    eval_model(fitted_models[tag], X_val, y_val)
modelRF, modelSVC, modelNN = (fitted_models[tag] for tag in ("RF", "SVC", "NN"))

# MODELS TESTING
print("\n--------------------------------------------------------")
print("-------------- MODELS  TESTING (accuracy) --------------")
print("--------------------------------------------------------\n")
# Accuracy of every fitted model on the held-out test split.
for label, fitted in (("RANDOM FOREST:     ", modelRF),
                      ("SVC:               ", modelSVC),
                      ("NEURAL NETWORK:    ", modelNN)):
    print(label, test_model(fitted, X_test, y_test))



--------------------------------------------------------
------------------- DATA PREPARATION -------------------
--------------------------------------------------------

Tokenization & lemmatization.......................DONE!

Train set:(1780, 13197)
Validation set:(222, 13197)
Test set:(223, 13197)

--------------------------------------------------------
------------------- MODELS  TRAINING -------------------
--------------------------------------------------------

-------------------
-- RANDOM FOREST --
-------------------
               precision    recall  f1-score   support

     business       0.98      0.95      0.96        56
entertainment       0.97      0.94      0.96        36
     politics       0.89      0.97      0.93        40
        sport       1.00      1.00      1.00        54
         tech       0.97      0.94      0.96        36

     accuracy                           0.96       222
    macro avg       0.96      0.96      0.96       222
 weighted avg       

In [31]:
# RUN THE DEMO
# Start the interactive loop with the fitted TF-IDF vectorizer and the
# neural-network model trained above.
run_demo(vect, modelNN)


--------------------------------------------------------
------------------------- DEMO -------------------------
--------------------------------------------------------

--------------------------------------------------------

USER PREFERENCES:
Maria     ->  ['sport']
Nacho     ->  ['entertainment', 'tech', 'politics']
Luca      ->  ['entertainment']
Adam      ->  ['politics', 'business', 'sport']
Tom       ->  ['entertainment', 'sport', 'tech']
Mike      ->  ['tech', 'sport', 'entertainment']

--------------------------------------------------------
---------------------------------------------------------------------------------------------------------------
[1mPaste here an article without newline characters (\n) (Press 'q' to quit or 'r' to reassign the preferences):
[0mFollowing the introduction of Apple’s iOS Screen Time feature, a number of app developers who created screen-tracking and parental control apps have been asked to change their products, or have been booted fro