In [2]:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import TfidfVectorizer
import gensim
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
import numpy as np
from pprint import pprint
from sklearn import svm
from sklearn.metrics import classification_report, accuracy_score


In [3]:
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\marcd\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:
cats = ['rec.autos', 'rec.sport.baseball', 'rec.sport.hockey', 'sci.crypt', 'sci.electronics', 'sci.med', 'sci.space', 'soc.religion.christian', 'talk.politics.guns', 'talk.politics.mideast', 'talk.politics.misc', 'talk.religion.misc']
train_dataset = fetch_20newsgroups(subset='train', categories=cats, shuffle=True)
test_dataset = fetch_20newsgroups(subset='test', categories=cats, shuffle=True)


In [5]:
list_cats = list(train_dataset.target_names)

# Create a dictionarry to associate the labels of the dataset with name of categories
dict_cats = {'cars': [0], 'sport': [1, 2], 'science': [3, 4, 5, 6], 'religion': [7, 11], 'politics': [8, 9, 10]}

# Create a dictionarry to associate the labels of the dataset with new numbers of categories
dict_cats_number = {0: [0], 1: [1, 2], 2: [3, 4, 5, 6], 3: [7, 11], 4: [8, 9, 10]}


In [6]:
#Tokenize and lemmatize a text

stemmer = SnowballStemmer('english')
lemmatizer = WordNetLemmatizer()

def stem_and_lemmatize(text):
    lemm_text = lemmatizer.lemmatize(text, pos='v')
    return stemmer.stem(lemm_text)

def preprocess_data(text):
    """
    This function returns the text preprocessed: a new text with the words stemmed and lemmatized, without the stop words and the words shorter than 3 letters.
    """
    words = []
    text = simple_preprocess(text) #the text is converted into a list of lower-case words
    for token in text:
        if token not in STOPWORDS and len(token) > 3:
            words.append(stem_and_lemmatize(token))
    return ' '.join(words)


In [7]:
def preprocess_dataset(train_dataset, test_dataset):
    """
    This function preprocess the dataset.
    It returns:
        The Tfidf vectors of training and testing datasets (X_train, X_test)
        The labels (y_train, y_test)
        The Tfidf vectorizer object
    """
    train_processed_texts = []
    for text in train_dataset.data:
        train_processed_texts.append(preprocess_data(text))
    test_processed_texts = []
    for text in test_dataset.data:
        test_processed_texts.append(preprocess_data(text))

    vect = TfidfVectorizer(stop_words='english', min_df=2)
    X_train = vect.fit_transform(train_processed_texts)
    X_test = vect.transform(test_processed_texts)
    y_train = np.array(train_dataset.target)
    y_test = np.array(test_dataset.target)

    return X_train, X_test, y_train, y_test, vect


In [8]:
X_train, X_test, y_train, y_test, vect = preprocess_dataset(train_dataset, test_dataset)

In [9]:
# Classifier: we choose a SVM

# First, let transform our labels into the labels for the category we chose.
def transform_label(labels):
    labels2 = [0 for k in range(len(labels))]
    for i in range(len(labels)):
        label = labels[i]
        for j in range(5):
            if label in dict_cats_number[j]:
                labels2[i] = j
                break
    return labels2

def trained_svm(X_train, y_train, kernel):
    model = svm.SVC(kernel=kernel, gamma='auto')
    model.fit(X_train, y_train)
    return model


In [14]:
# Performance of our classifier
y_train = transform_label(y_train)
y_test = transform_label(y_test)

print("Training...")
kernel = 'linear'
SVM = trained_svm(X_train, y_train, kernel)
print("Training done.")

y_pred = SVM.predict(X_test)
report = classification_report(y_test, y_pred)
accuracy = accuracy_score(y_test, y_pred)

print(report)
print("Accuracy: ", accuracy)


Training...
Training done.
              precision    recall  f1-score   support

           0       1.00      0.76      0.86       396
           1       0.98      1.00      0.99      4074

    accuracy                           0.98      4470
   macro avg       0.99      0.88      0.92      4470
weighted avg       0.98      0.98      0.98      4470

Accuracy:  0.9782997762863535


In [36]:
# The article clasifier

# First, let's create the profiles and their fields of interest.

profiles = {
    'Thomas' : ['sport', 'politics'],
    'Aline' : ['cars'],
    'George' : ['religion', 'cars'],
    'Eva' : ['science', 'politics'],
    'Lorenzo' : ['sport']
}

profiles_number = {}
for key in profiles.keys():
    interests = profiles[key]
    interests_number = []
    for interest in interests:
        cats = dict_cats[interest]
        cats = transform_label(cats)
        for cat in cats:
            interests_number.append(cat)
    profiles_number[key] = interests_number


def predict_text(text):
    """
    This function returns the predicted category of an input text.
    """
    text = [preprocess_data(text)]
    text = vect.transform(text)
    prediction = SVM.predict(text)[0]
    cat = list(dict_cats.keys())[prediction]
    return cat


def main():
        text = input("Please type the text you want and press enter: ")
        print("Processing...")
        category = predict_text(text)
        users = []
        for user in profiles.keys():
            if category in profiles[user]:
                users.append(user)
        print("The theme of this text is: ", category)
        print('')
        print("So, this text will be sent to the following users:")
        for user in users:
            print("     -  ", user)
        

In [37]:
"""
Run this cell to try the programm.
"""

main()

Processing...
The theme of this text is:  science

So, this text will be sent to the following users:
     -   Eva
