<a href="https://www.kaggle.com/code/aleksandrmorozov123/nlp-with-python?scriptVersionId=197051836" target="_blank"><img align="left" alt="Kaggle" title="Open in Kaggle" src="https://kaggle.com/static/images/open-in-kaggle.svg"></a>

In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

**Code to populate the documents dictionary**

In [None]:
def read_documents ():
    f = open ("/kaggle/input/cisi-a-dataset-for-information-retrieval/CISI.ALL")
    merged = " "
    # the string variable merged keeps the result of merging the field identifier with its content
    
    for a_line in f.readlines ():
        if a_line.startswith ("."):
            merged += "\n" + a_line.strip ()
        else:
            merged += " " + a_line.strip ()
    # updates the merged variable using a for-loop
    
    documents = {}
    
    content = ""
    doc_id = ""
    # each entry in the dictioanry contains key = doc_id and value = content
    
    for a_line in merged.split ("\n"):
        if a_line.startswith (".I"):
            doc_id = a_line.split (" ") [1].strip()
        elif a_line.startswith (".X"):
            documents[doc_id] = content
            content = ""
            doc_id = ""
        else:
            content += a_line.strip ()[3:] + " "
    f.close ()
    return documents

# print out the size of the dictionary and the content of the very first article
documents = read_documents ()
print (len (documents))
print (documents.get ("1"))
    

**Code to populate the queries dictionary**

In [None]:
def read_queries ():
    f = open ("/kaggle/input/cisi-a-dataset-for-information-retrieval/CISI.QRY")
    merged = ""
    
    # merge the conten of each field with its identifier and separate different fields with lune breaks
    for a_line in f.readlines ():
        if a_line.startswith ("."):
            merged += "\n" + a_line.strip ()
        else:
            merged += " " + a_line.strip ()
    
    queries = {}
    
    # initialize queries dictionary with key = qry_id and value=content for each query in the dataset
    content = ""
    qry_id = ""
    
    for a_line in merged.split ("\n"):
        if a_line.startswith (".I"):
            if not content == "":
                queries [qry_id] = content
                content = ""
                qry_id = ""
            # add an enrty to the dictionary when you encounter an .I identifier
            qry_id = a_line.split(" ")[1].strip ()
        # otherwise, keep adding content to the content variable
        elif a_line.startswith (".W") or a_line.startswith (".T"):
            content += a_line.strip ()[3:] + " "
    queries [qry_id] = content
    f.close ()
    return queries

# print out the length of the dictionary and the content of the first query
queries = read_queries ()
print (len (queries))
print (queries.get("1"))

**Code to populate the mappings dictionary**

In [None]:
def read_mappings ():
    f = open ("/kaggle/input/cisi-a-dataset-for-information-retrieval/CISI.REL")
    mappings = {}
    
    for a_line in f.readlines ():
        voc = a_line.strip ().split ()
        key = voc[0].strip ()
        current_value = voc[1].strip()
        value = []
        # update the entry in the mappings dictionary with the current value
        if key in mappings.keys ():
            value = mappings.get (key)
        value.append (current_value)
        mappings [key] = value
    f.close ()
    return mappings

# print out some information about the mapping data structure
mappings = read_mappings ()
print (len (mappings))
print (mappings.keys ())
print (mappings.get ("1"))

**Preprocess the data in documents and queries**

In [None]:
# import required libraries
import nltk
from nltk import word_tokenize

# text is converted to lowercase and split into words
def get_words (text):
    word_list = [word for word in word_tokenize (text.lower ())]
    return word_list
    
doc_words = {}
qry_words = {}

for doc_id in documents.keys ():
    doc_words [doc_id] = get_words (documents.get (doc_id))
for qry_id in queries.keys ():
    # entries in both documents and queries are represented as word lists
    qry_words [qry_id] = get_words (queries.get (qry_id))
    
# print out the length of the dictionaries and check the first document and the fisrt query
print (len (doc_words))
print (doc_words.get ("1"))
print (len (doc_words.get ("1")))
print (len (qry_words))
print (qry_words.get ("1"))
print (len (qry_words.get("1")))

**Simple Biilean search algorithm**

In [None]:
# iterate through the documents
def retrieve_documents (doc_words, query):
    docs = []
    for doc_id in doc_words.keys ():
        found = False
        i = 0
        while i<len(query) and not found: 
            word = query [i]
            if word in doc_words.get (doc_id):
                docs.append (doc_id)
                found = True
            else:
                i+=1
    return docs

# check the results
docs = retrieve_documents (doc_words, qry_words.get("3"))
print (docs [:100])
print (len (docs))

**Begin the preprocessing - remove stopwords and punctuation marks**

In [None]:
# import python's string module that will help remove punctuation marks
import string

# import the stopwords list
from nltk import word_tokenize
from nltk.corpus import stopwords

def process (text):
    stoplist = set (stopwords.words ('english'))
    # only add tthe words if they are not included in the stoplist and are not puctuation marks
    word_list = [word for word in word_tokenize (text.lower())
                if not word in stoplist and not word in string.punctuation]
    return word_list

# check the results of these preprocessing steps on some documents or queries
word_list = process (documents.get ("1"))
print (word_list)

**Next step in preprocessing - stemming**

In [None]:
# import the stemmer
from nltk.stem.lancaster import LancasterStemmer

def process (text):
    stoplist = set (stopwords.words ('english'))
    # initialize the LancasterStemmer
    st = LancasterStemmer ()
    word_list = [st.stem(word) for word in word_tokenize (text.lower ())
                if not word in stoplist and not word in string.punctuation]
    return word_list

# check the results on some document, query, or on a list of words
word_list = process (documents.get("26"))
print (word_list)
word_list = process ("organize, organizing, organizational, organ, organic, organizer")
print (word_list)

**Estimate term frequency in documents and queries**

In [None]:
def get_terms (text):
    stoplist = set (stopwords.words ('english'))
    terms = {}
    st = LancasterStemmer ()
    word_list = [st.stem(word) for word in word_tokenize (text.lower ())
                if not word in stoplist and not word in string.punctuation]
    for word in word_list:
        terms [word] = terms.get (word, 0) + 1
    return terms

doc_terms = {}
qry_terms = {}
for doc_id in documents.keys ():
    doc_terms [doc_id] = get_terms (documents.get (doc_id))
for qry_id in queries.keys ():
    # populate the term frequency dictionaries for all documents and all queries
    qry_terms [qry_id] = get_terms (queries.get (qry_id))
    
# check the results
print (len (doc_terms))
print (doc_terms.get ("1"))
print (len (doc_terms.get("1")))
print (len (qry_terms))
print (qry_terms.get("1"))
print (len (qry_terms.get("1")))


**Code to represent the datya in a shared space**

In [None]:
# collect the shared vocabulary of terms from documents and queries and return it as a sorted list
def collect_vocabulary ():
    all_terms = []
    for doc_id in doc_terms.keys ():
        for term in doc_terms.get (doc_id).keys():
            all_terms.append (term)
    for qry_id in qry_terms.keys ():
        for term in qry_terms.keys():
            for term in qry_terms.get(qry_id).keys():
                all_terms.append (term)
    return sorted (set (all_terms))

# print out the length of the shared vocabulary and check the first several terms in the vocabulary
all_terms = collect_vocabulary ()
print (len (all_terms))
print (all_terms [:10])

def vectorize (input_features, vocabulary):
    output = {}
    for item_id in input_features.keys ():
        features = input_features.get (item_id)
        output_vector = []
        for word in vocabulary:
            if word in features.keys ():
                output_vector.append (int (features.get (word)))
            else:
                output_vector.append (0)
        output [item_id] = output_vector
    return output

doc_vectors = vectorize (doc_terms, all_terms)
qry_vectors = vectorize (qry_terms, all_terms)

# print out some statistics on these data structures
print (len (doc_vectors))
print (len (doc_vectors.get ("1450")))
print (len (qry_vectors))
print (len (qry_vectors.get ("110")))

**Calculate and apply inverse document frequency weighting**

In [None]:
# import library for math
import math

def calculate_idfs (vocabulary, doc_features):
    doc_idfs = {}
    for term in vocabulary:
        doc_count = 0
        for doc_id in doc_features.keys ():
            terms = doc_features.get (doc_id)
            if term in terms.keys ():
                doc_count += 1
        doc_idfs [term] = math.log (float (len (doc_features.keys ()))/
                                    float (1 + doc_count), 10)
    return doc_idfs

# check the results - we should have idf values for all terms from the vocabulary
doc_idfs = calculate_idfs (all_terms, doc_terms)
print (len (doc_idfs))
print (doc_idfs.get ("system"))

# define a function to apply idf weighing to the input_terms data structure
def vectorize_idf (input_terms, input_idfs, vocabulary):
    output = {}
    for item_id in input_terms.keys ():
        terms = input_terms.get (item_id)
        output_vector = []
        for term in vocabulary:
            if term in terms.keys ():
                # multiply the term frequencies with idf weights if the term is present in document
                output_vector.append (
                input_idfs.get (term) * float (terms.get (term)))
            else:
                output_vector.append (float (0))
        output [item_id] = output_vector
    return output

# apply idf weighing to doc_terms
doc_vectors = vectorize_idf (doc_terms, doc_idfs, all_terms)

# print out some statistics, such as the number of documents and terms
print (len (doc_vectors))
print (len (doc_vectors.get ("1460")))

**Run search algorithm for a given query on the set of the documents**

In [None]:
# the operator's itemgetter functionality helps sort Python dictionaries by keys or values
from operator import itemgetter

# calculate the length of the input vector
def length (vector):
    sq_length = 0
    for index in range (0, len(vector)):
        sq_length += math.pow (vector [index], 2)
    return math.sqrt (sq_length)

# calculate the dot product of two vectors
def dot_product (vector1, vector2):
    if len (vector1) == len (vector2):
        dot_prod = 0
        for index in range (0, len(vector1)):
            if not vector1 [index] == 0 and not vector2 [index] == 0:
                dot_prod += vector1 [index] * vector2 [index]
        return dot_prod
    else:
        return "Unmatching dimensionality"
    
def calculate_cosine (query, document):
    cosine = dot_product (query, document) / (length (query) * length (document))
    return cosine

query = qry_vectors.get ("3")
results = {}

for doc_id in doc_vectors.keys ():
    document = doc_vectors.get (doc_id)
    cosine = calculate_cosine (query, document)
    results [doc_id] = cosine
    
# sort the results dictionary by cosine values in descending order and return the top n results
for items in sorted (results.items (), key = itemgetter (1), reverse = True) [:44]:
    print (items [0])

**Estimate precision@k and ratio of cases with at least one relevant document**

In [None]:
# calculate the proportion of relevant documents from the gold standard in the top k returned results
def calculate_precision (model_output, gold_standard):
    true_pos = 0
    for item in model_output:
        if item in gold_standard:
            true_pos += 1
    return float (true_pos) / float (len (model_output))

def calculate_found (model_output, gold_standard):
    found = 0
    for item in model_output:
        if item in gold_standard:
            found = 1
    return float (found)

precision_all = 0.0
found_all = 0.0
for query_id in mappings.keys ():
    # calculate mean values across all queries
    gold_standard = mappings.get (str (query_id))
    query = qry_vectors.get (str (query_id))
    results = {}
    model_output = []
    for doc_id in doc_vectors.keys ():
        document = doc_vectors.get (doc_id)
        cosine = calculate_cosine (query, document)
        # for each document, esimate its relevance to the query with cosine similarity as before
        results [doc_id] = cosine
    # sort the results and consider only top k (top 5) most relevant documents
    for items in sorted (results.items (), key = itemgetter (1), reverse = True) [:5]:
        model_output.append (items [0])
    precision = calculate_precision (model_output, gold_standard)
    found = calculate_found (model_output, gold_standard)
    print (f"{str (query_id)} : {str(precision)}")
    precision_all += precision
    found_all += found
    
# estimate the mean values for all queries
print (precision_all / float (len (mappings.keys ())))
print (found_all / float (len (mappings.keys ())))    

On some queries the algorithm perform very well. For example, "1 : 1.0" shows that all top 5 documents returned for query 1 are relevant. However, on other queries the alforithm does not perform well.

**Estimate mean reciprocal rank**

In [None]:
rank_all = 0.0
for query_id in mappings.keys ():
    gold_standard = mappings.get (str (query_id))
    query = qry_vectors.get (str (query_id))
    results = {}
    for doc_id in doc_vectors.keys ():
        document = doc_vectors.get (doc_id)
        cosine = calculate_cosine (query, document)
        results [doc_id] = cosine
    sorted_results = sorted (results.items (),
                            key=itemgetter (1), reverse = True)
    index = 0
    found = False
    while found == False:
        # set the flag found to False and switch it to True when we find the first relevant document
        item = sorted_results [index]
        # increment the index with each document in the results
        index += 1
        if index == len (sorted_results):
            found = True
        if item [0] in gold_standard:
            # the document ID is the first element in the sorted tuples oof (document_id, similarity score)
            found = True
            print (f"{str(query_id)}: {str(float (1) / float (index))}")
            rank_all += float(1) / float (index)
            
# print out the mean valur across all queries
print (rank_all / float (len (mappings.keys ())))

**Example how to run spaCy's processing pipeline**

In [None]:
# import library
import spacy

# the spacy.load command initializes the nlp pipeline
nlp = spacy.load ("en_core_web_sm")
doc = nlp ("On monday students meet with researchers " + " and discuss future development their research.")
rows = []

# print the output in a tabular format and add a header to the printout for clarity
rows.append (["Word", "Position", "Lowercase", "Lemma", "POS", "Alphanumeric", "Stopword"])

for token in doc:
    rows.append ([token.text, str(token.i), token.lower_, token.lemma_,
                 token.pos_, str(token.is_alpha), str (token.is_stop)])
    
# Python's zip function allows to reformat input from row representation
columns = zip (*rows)
column_widths = [max (len (item) for item in col)
                for col in columns]

# calculate the maximum length of strings in each column to allow enough space in the printout
for row in rows:
    print (''.join(' {:{width}} '.format (
        row [i], width = column_widths [i])
                  for i in range (0, len (row))))

**Identify all groups of nouns and the way they are realted to each other**

In [None]:
doc = nlp ("On monday students meet with researchers " + " and discuss future development their research.")

# we can access noun phrases by doc.noun_chunks
for chunk in doc.noun_chunks:
    # print out the phrase, its head, the type of relation to the next most important word, and the word itself
    print ('\t'.join ([chunk.text, chunk.root.text, chunk.root.dep_, chunk.root.head.text]))

**Visualize the dependency information**

In [None]:
# import spaCy's visualization tool displaCy
from spacy import displacy
# path helps define the location for the file to store the visualization
from pathlib import Path

# use displaCy to visualize dependecies over the input text with approptiate arguments
svg = displacy.render (doc, style = 'dep', jupyter = False)
file_name = '-'.join ([w.text for w in doc if not w.is_punct]) + ".svg"

# the the output us stored to simply uses the words from the sentence in its name
output_path = Path (file_name)
output_path.open ("w", encoding="utf-8").write(svg)

**Print out the information about head and dependents for each word**

In [None]:
# coode assumes that spaCy is imported and input text is already fed into the pipeline
for token in doc:
    print (token.text, token.dep_, token.head.text,
          token.head.pos_, [child for child in token.children])

**Extarct participants of the actions**

In [None]:
# code assumes that spaCy is imported and input text is already fed into pipeline
for token in doc:
    # check that the ROOT of the sentence is a verb with the base form (lemma) "meet"
    if (token.lemma_ == "meet" and token.pos_ == "VERB"
       and token.dep_ == "ROOT"):
        # this verb expresses the action itself
        action = token.text
        # extract the list of all dependents of this verb using token.children
        children = [child for child in token.children]
        participant1 = ""
        participant2 = ""
        for child1 in children:
            if child1.dep_ == "nsubj":
                participant1 = " ".join (
                [attr.text for attr in child1.children]
                ) + " " + child1.text
            elif child1.text == "with":
                # check if the verb has preposition "with" as one of its dependents
                action += " " + child1.text
                child1_children = [child for child in child1.children]
                for child2 in child1_children:
                    if child2.pos_ == "NOUN":
                        participant2 = " ".join (
                        [attr.text for attr in child2.children]
                        ) + " " + child2.text
                    
# print out the results
print (f"Participant1 = {participant1}")
print (f"Action = {action}")
print (f"Participant2 = {participant2}")

**Build information extractor**

In [None]:
# provide diverse set of sentences
sentences = ["On monday students meet with researchers " + " and discuss future development their research.", 
            " Warren Baffet met with the President last week.",
            "Elon Musk met with the President an White House.",
            "The two bussinesmans also posed for photographs and " + 
            "the Vice President talked to reporters."]

# define a function to apply all the steps in the information extraction algorithm
def extract_information (doc):
    action = ""
    participant1 = ""
    for token in doc: 
         if (token.lemma_ == "meet" and token.pos_ == "VERB" 
            and token.dep_ == "ROOT"):
                action = token.text
                children = [child for child in token.children]
                for child1 in children:
                    if child1.dep_ == "nsubj": 
                        patricipant1 = " ".join (
                [attr.text for attr in child1.children]
                ) + " " + child1.text
                    elif child1.text == "with":
                        action += " " + child1.text
                        child1_children = [child for child in child1.children]
                        for child2 in child1_children:
                            # extract participants expressed with proper nouns (PROPN) and common nouns (NOUN)
                            if (child2.pos_ == "NOUN"
                            or child2.pos_ == "PROPN"):
                                participant2 = " ".join (
                        [attr.text for attr in child2.children]
                        ) + " " + child2.text
                    elif (child1.dep_ == "dobj"
                        and (child1.pos_ == "NOUN"
                            or child1.pos_ == "PROPN")):
                        participant2 = " ".join (
                            [attr.text for attr in child1.children]
                            ) + " " + child1.text
    
        
# apply extract_information function to each sentence and print out the actions and participants
for sent in sentences:
    print (f"\nSentence = {sent}")
    doc = nlp (sent)
    extract_information (doc)
    print (f"Participant1 = {participant1}")
    print (f"Action = {action}")
    print (f"Participant2 = {participant2}")

**Code to extract literary works from Project Gutenberg**

In [None]:
nltk.download ('gutenberg')
from nltk.corpus import gutenberg

# print out the names of files
gutenberg.fileids ()

**Define training and test sets**

In [None]:
nltk.download ('punkt')

author1_train = gutenberg.sents ('chesterton-ball.txt') + gutenberg.sents ('chesterton-brown.txt')
print (author1_train)
print (len (author1_train))

In [None]:
# initialize the test set with the sentences from the third work by the author
author1_test = gutenberg.sents ('chesterton-thursday.txt')
print (author1_test)
print (len (author1_test))

In [None]:
author2_train = gutenberg.sents ('shakespeare-caesar.txt') + gutenberg.sents ('shakespeare-hamlet.txt')
print (author2_train)
print (len (author2_train))

In [None]:
author2_test = gutenberg.sents ('shakespeare-macbeth.txt')
print (author2_test)
print (len (author2_test))

**Calculate simple statistics on texts**

In [None]:
def statistics (gutenberg_data):
    for work in gutenberg_data:
        # use NLTK's functionality to calculate statistics
        num_chars = len (gutenberg.raw (work))
        num_words = len (gutenberg.words (work))
        num_sents = len (gutenberg.sents (work))
        num_vocab = len (set (w.lower ()
                             for w in gutenberg.words (work)))
        print (round (num_chars / num_words),
              round (num_words / num_sents),
              round (num_words / num_vocab),
              work)
        
gutenberg_data = ['chesterton-ball.txt','chesterton-brown.txt','chesterton-thursday.txt', 
                  'shakespeare-caesar.txt','shakespeare-hamlet.txt','shakespeare-macbeth.txt']
statistics (gutenberg_data)

**Run StratifiedShufflingSplit on the data**

In [None]:
# import required libraries
import random
import sklearn
from sklearn.model_selection import StratifiedShuffleSplit

all_sents = [(sent, 'chesterton') for sent in author1_train]
all_sents += [(sent, 'shakespeare') for sent in author2_train]
# combine all sentences into a single list called all_sents, keeping the author label
print (f"Dataset size = {str (len (all_sents))} sentences")

# keep the set of labels (authors) as values
values = [author for (sent, author) in all_sents]
split = StratifiedShuffleSplit (n_splits = 1, test_size = 0.2, random_state = 42)
strat_train_set = []
strat_pretest_set = []
for train_index, pretest_index in split.split (all_sents, values):
    strat_train_set= [all_sents [index] for index in train_index]
    strat_pretest_set = [all_sents [index]
                        for index in pretest_index]

**Check the proportions of the data in the two classes**

In [None]:
# calculate the proportion of the entries in each class (category) in the given dataset data
def cat_proportions (data, cat):
    count = 0
    for item in data:
        if item [1] == cat:
            count += 1
    return float (count) / float (len (data))

categories = ['chesterton', 'shakespeare']
rows = []
rows.append (["Category", "Overall", "Stratified train", "Stratified pretest"])

for cat in categories:
    rows.append ([cat, f"{cat_proportions (all_sents, cat):.6f}",
                 f"{cat_proportions (strat_train_set, cat):.6f}",
                 f"{cat_proportions (strat_pretest_set, cat):.6f}"])
    
columns = zip (*rows)
column_widths = [max (len (item) for item in col) for col in columns]
for row in rows:
    print (''.join (' {:{width}} '.format (row [i], width = column_widths [i])
                   for i in range (0, len (row))))

**Create the test_set data structure**

In [None]:
test_set = [(sent, "chesterton") for sent in author1_test]
test_set += [(sent, "shakespeare") for sent in author2_test]

# extract words as features
def get_features (text):
    features = {}
    word_list = [word for word in text]
    for word in word_list:
        features [word] = True
    return features

# extract features from training and pretest sets
train_features = [(get_features (sents), label)
                 for (sents, label) in strat_train_set]
pretest_features = [(get_features (sents), label)
                   for (sents, label) in strat_pretest_set]

# run some checks to see what the data contains
print (len (train_features))
print (train_features [0] [0])
print (train_features [100] [0])

**Train the Naive Bayes classifier on train and test on pretest set**

In [None]:
# import the classifier
from nltk import NaiveBayesClassifier, classify

# train the classifier on the training set
print (f"Training set size = {str (len (train_features))} sentences")
print (f"Pretest set size = {str (len (pretest_features))} sentences")
classifier = NaiveBayesClassifier.train (train_features)

print (f"Accuracy on the training set = {str (classify.accuracy (classifier, train_features))}")
print (f"Accuracy on the pretest set = " + 
      f"{str (classify.accuracy (classifier, pretest_features))}")
classifier.show_most_informative_features (50)

**Code to extract words and sentence length statistics**

In [None]:
def avg_number_chars (text):
    total_chars = 0.0
    for word in text:
        total_chars += len (word)
    return float (total_chars) / float (len(text))

# calculate the sentence length in terms of the number of words
def number_words (text):
    return float (len (text))

print (avg_number_chars (["Not", "so", "happy", ",", "yet", "much", "happyer"]))
print (number_words (["Not", "so", "happy", ",", "yet", "much", "happyer"]))

**Code to extract features and map them to the labels**

In [None]:
# argument source denotes the dataset we are applying the feature extraction
def initialize_dataset (source):
    all_features = []
    targets = []
    # iterate through all (sent, label) pairs in the given dataset
    for (sent, label) in source:
        feature_list = []
        feature_list.append (avg_number_chars (sent))
        feature_list.append (number_words (sent))
        all_features.append (feature_list)
        if label == "chesterton": targets.append (0)
        else: targets.append (1)
    return all_features, targets

train_data, train_targets = initialize_dataset (strat_train_set)
pretest_data, pretest_targets = initialize_dataset (strat_pretest_set)
test_data, test_targets = initialize_dataset (test_set)

# print out thr length of the structures
print (len (train_data), len (train_targets))
print (len (pretest_data), len (pretest_targets))
print (len (test_data), len (test_targets))

**Train and test a classifier with sklearn**

In [None]:
# import decision tree classifier
from sklearn.tree import DecisionTreeClassifier

# initialization
text_clf = DecisionTreeClassifier (random_state = 42)

# train the classifier using the fit method
text_clf.fit (train_data, train_targets)

# test the classifier using the predict method
predicted = text_clf.predict (pretest_data)

# evaluating the classifier
# import numpy and sklearn's metrics funcvtionality
import numpy as np
from sklearn import metrics

def evaluate (predicted, targets):
    # use numpy.mean to estimate the accuracy of the classifier
    print (np.mean (predicted == targets))
    print (metrics.confusion_matrix (targets, predicted))
    print (metrics.classification_report (targets, predicted))
    
evaluate (predicted, pretest_targets)

# apply the same routine to the test set
predicted = text_clf.predict (test_data)
evaluate (predicted, test_targets)

**Calculate the number and proportion of times certain words occur**

In [None]:
def word_counts(text):
    counts = {}
    for word in text:
        counts[word.lower()] = counts.get(word.lower(), 0) + 1
    return counts

def proportion_words(text, wordlist):
    count = 0
    for word in text:
        if word.lower() in wordlist:
            count += 1
    return float(count)/float(len(text))

**Adding stopword counts and proportion as features**

In [None]:
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

# add spaCy’s functionality to the code and upload the stopwords list
nlp = spacy.load('en_core_web_lg')
def initialize_dataset(source):
    all_features = []
    targets = []
    for (sent, label) in source:
        feature_list=[]
        feature_list.append(avg_number_chars(sent))
        feature_list.append(number_words(sent))
        counts = word_counts(sent)
        for word in STOP_WORDS:
            if word in counts.keys():
                feature_list.append(counts.get(word))
            else:
                feature_list.append(0)
        feature_list.append(proportion_words(sent, STOP_WORDS))
        all_features.append(feature_list)
        if label=="austen": targets.append(0)
        else: targets.append(1)
    return all_features, targets

train_data, train_targets = initialize_dataset(strat_train_set)
pretest_data, pretest_targets = initialize_dataset(strat_pretest_set)
test_data, test_targets = initialize_dataset(test_set)

# Print out the length of the feature lists and targets lists
print (len(train_data), len(train_targets))
print (len(pretest_data), len(pretest_targets))
print (len(test_data), len(test_targets))

**Evaluate the results**

In [None]:
# train the classifier on the training data
text_clf = DecisionTreeClassifier(random_state=42)
text_clf.fit(train_data, train_targets)

# test on the pretest set
predicted = text_clf.predict(pretest_data)
evaluate(predicted, pretest_targets)

# apply the same routine to the test set
predicted = text_clf.predict(test_data)
evaluate(predicted, test_targets)

**Applying spaCy preprocessing**

In [None]:
# provide the preprocess function with the original sentences from the datasets
def preprocess(source):
    source_docs = {}
    index = 0
    for (sent, label) in source:
        text = " ".join(sent)
        source_docs[text] = nlp(text)
        if index>0 and (index%2000)==0:
            print(str(index) + " texts processed")
        index += 1
    print("Dataset processed")
    return source_docs

# apply the preprocess function to the three original datasets
train_docs = preprocess(strat_train_set)
pretest_docs = preprocess(strat_pretest_set)
test_docs = preprocess(test_set)

**Adding distribution of part-of-speech tags as features**

In [None]:
# import Python’s Counter functionality to simplify counting procedures
from collections import Counter
pos_list = ["C", "D", "E", "F", "I", "J", "M",
            "N", "P", "R", "T", "U", "V", "W"]

def pos_counts(text, source_docs, pos_list):
    pos_counts = {}
    doc = source_docs.get(" ".join(text))
    tags = []
    for word in doc:
        tags.append(str(word.tag_)[0])
    counts = Counter(tags)
    for pos in pos_list:
        if pos in counts.keys():
            pos_counts[pos] = counts.get(pos)
        # Populate the pos_counts dictionary using the counts
        # of the part-of-speech tags or inserting 0    
        else: pos_counts[pos] = 0
    return pos_counts

def initialize_dataset(source, source_docs):
    all_features = []
    targets = []
    for (sent, label) in source:
        feature_list=[]
        feature_list.append(avg_number_chars(sent))
        feature_list.append(number_words(sent))
        counts = word_counts(sent)
        for word in STOP_WORDS:
            if word in counts.keys():
                feature_list.append(counts.get(word))
            else:
                feature_list.append(0)
        feature_list.append(proportion_words(sent, STOP_WORDS))
        # extract the previous 308 features as before
        p_counts = pos_counts(sent, source_docs, pos_list)
        for pos in p_counts.keys():
            feature_list.append(float(p_counts.get(pos))/float(len(sent)))
        all_features.append(feature_list)
        if label=="austen": targets.append(0)
        else: targets.append(1)
    return all_features, targets

**Run the train-test-evaluate routine**

In [None]:
def run():
    train_data, train_targets = initialize_dataset(strat_train_set, train_docs)
    pretest_data, pretest_targets = initialize_dataset(strat_pretest_set, 
                                                       pretest_docs)
    test_data, test_targets = initialize_dataset(test_set, test_docs)
    print (len(train_data), len(train_targets))
    print (len(pretest_data), len(pretest_targets))
    print (len(test_data), len(test_targets))
    print ()
    text_clf = DecisionTreeClassifier(random_state=42)
    text_clf.fit(train_data, train_targets)
    predicted = text_clf.predict(pretest_data)
    evaluate(predicted, pretest_targets)
    predicted = text_clf.predict(test_data)
    evaluate(predicted, test_targets)
    
run()

**Collecting the most frequent suffixes from the data**

In [None]:
# import python operator functionality
import operator

# create the function
def select_suffixes (cutoff):
    all_suffixes = []
    # iterate through the list of values in the train_docs.values ()
    for doc in train_docs.values ():
        for word in doc:
            all_suffixes.append (str (word.suffix_).lower ())
    counts = Counter (all_suffixes)
    # store the frequency of all the suffixes in the counts dictionary and the sort it
    sorted_counts = sorted (counts.items (), key = operator.itemgetter (1),
                           reverse = True)
    selected_suffixes = []
    for i in range (0, round (len (counts)*cutoff)):
        selected_suffixes.append (sorted_counts [i][0])
    return selected_suffixes

selected_suffixes = select_suffixes (0.4)
print (len (selected_suffixes))
print (selected_suffixes)

**Add new, suffix-based features**

In [None]:
# create function that returns the counts of suffixes from suffix_list in the given sentence (text)
def suffix_counts (text, source_docs, suffix_list):
    suffix_counts = {}
    doc = source_docs.get (" ".join (text))
    suffixes = []
    for word in doc:
        suffixes.append (str (word.suffix_))
    counts = Counter (suffixes)
    for suffix in suffix_list:
        if suffix in counts.keys ():
            suffix_counts [suffix] = counts.get(suffix)
        else: suffix_counts [suffix] = 0
    return suffix_counts

def initialize_dataset (source, source_docs):
    all_features = []
    targets = []
    for (sent, label) in source:
        feature_list = []
        feature_list.append (avg_number_chars (sent))
        feature_list.append (number_words (sent))
        counts = word_counts (sent)
        for word in STOP_WORDS:
            if word in counts.keys ():
                feature_list.append (counts.get (word))
            else:
                feature_list.append (0)
        feature_list.append (proportion_words (sent, STOP_WORDS))
        p_counts = pos_counts (sent, source_docs, pos_list)
        for pos in p_counts.keys ():
            feature_list.append (float (p_counts.get (pos))/float (len (sent)))
        s_counts = suffix_counts (sent, source_docs, selected_suffixes)
        for suffix in s_counts.keys ():
            # append the new 690 suffix distribution features by calculating the proportion
            # of words containing the suffixes
            feature_list.append (float (s_counts.get (suffix))/float (len (sent)))
            
        all_features.append (feature_list)
        if label == 'chesterton': targets.append (0)
        else: targets.append (1)
    return all_features, targets

# apply the train-test-evaluate routine
run ()

**Collect 50% most frequent unique words per author**

In [None]:
# function for collect full vocabularies for each author
def unique_vocabulary (label1, label2, cutoff):
    voc1 = []
    voc2 = []
    for (sent, label) in strat_train_set:
        if label == label1:
            for word in sent:
                voc1.append (word.lower ())
        elif label == label2:
            for word in sent:
                voc2.append (word.lower ())
    counts1 = Counter (voc1)
    sorted_counts1 = sorted (counts1.items (), key = operator.itemgetter (1),
                             reverse = True)
    counts2 = Counter (voc2)
    sorted_counts2 = sorted (counts2.items (), key = operator.itemgetter (1),
                            reverse = True)
    unique_voc = []
    # the unique_voc list stores the most frequent words for each author
    # if they are never used by the other author
    for i in range (0, round (len (sorted_counts1)*cutoff)):
        if not sorted_counts1[i] [0] in counts2.keys ():
            unique_voc.append (sorted_counts1[i] [0])
    for i in range (0, round (len (sorted_counts2)*cutoff)):
        if not sorted_counts2 [i] [0] in counts1.keys ():
            unique_voc.append (sorted_counts2 [i] [0])
    return unique_voc

# print out the unique_voc list using 50% as the cutoff
unique_voc = unique_vocabulary ("chesterton", "shakespeare", 0.5)
print (len (unique_voc))
print (unique_voc)

**Adding new word-based features, then train and test the classifier**

In [None]:
# the unique_counts function returns the counts of unique words from the unique_voc list
# in the given sentence

def unique_counts (text, unique_voc):
    unique_counts = {}
    words = []
    for word in text:
        words.append (word.lower ())
    counts = Counter (words)
    for word in unique_voc:
        # for each word from the unique_voc, use its count in the sentence
        # if it occurs or 0 otherwise
        if word in counts.keys ():
            unique_counts [word] = counts.get (word)
        else: unique_counts [word] = 0
    return unique_counts

def initialize_dataset (source, source_docs):
    all_features = []
    targets = []
    for (sent, label) in source:
        feature_list = []
        feature_list.append (avg_number_chars (sent))
        feature_list.append (number_words (sent))
        counts = word_counts (sent)
        for word in STOP_WORDS:
            if word in counts.keys ():
                feature_list.append (counts.get (word))
            else:
                feature_list.append (0)
        feature_list.append (proportion_words (sent, STOP_WORDS))
        p_counts = pos_counts (sent, source_docs, pos_list)
        for pos in p_counts.keys ():
            feature_list.append (float (p_counts.get (pos))/float (len (sent)))
        s_counts = suffix_counts (sent, source_docs, selected_suffixes)
        for suffix in s_counts.keys ():
            # add the previous 690 features as before
            feature_list.append (float (s_counts.get (suffix))/float (len (sent)))
        u_counts = unique_counts (sent, unique_voc)
        # add the new 5253 unique word counts features
        for word in u_counts.keys ():
            feature_list.append (u_counts.get (word))
        all_features.append (feature_list)
        if label == 'chesterton': targets.append (0)
        else: targets.append (1)
    return all_features, targets

# apply the train-test-evaluate routine
run ()

**Creating of topic analyzer**

In [5]:
# import sklearn's functionality
from sklearn.datasets import fetch_20newsgroups

# define the load_dataset function to return the data extracted
# according to predefined restrictions
def load_dataset(sset, cats):
    if cats==[]:
        newsgroups_dset = fetch_20newsgroups(subset=sset,
                                             remove=('headers', 'footers', 'quotes'),
                                             shuffle=True)
    else:
        newsgroups_dset = fetch_20newsgroups(subset=sset, categories=cats,
                                             remove=('headers', 'footers', 'quotes'),
                                             shuffle=True)
    return newsgroups_dset

categories = ["comp.windows.x", "misc.forsale", "rec.autos"]
categories += ["rec.motorcycles", "rec.sport.baseball", "rec.sport.hockey"]
categories += ["sci.crypt", "sci.med", "sci.space"]
categories += ["talk.politics.mideast"]

# to access both training and test sets, use "all" as the first argument
newsgroups_all = load_dataset('all', categories)
print(len(newsgroups_all.data))

9850


**Preprocess data using NLTK and gensim**

In [6]:
import nltk
import gensim
from nltk.stem import SnowballStemmer
from gensim.utils import simple_preprocess
from gensim.parsing.preprocessing \
import STOPWORDS as stopwords
stemmer = SnowballStemmer("english")

def stem(text):
    return stemmer.stem(text)

def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text, min_len=4):
        if token not in stopwords:
            result.append(stem(token))
    return result

**Inspecting the results of the preprocessing step**

In [7]:
doc_sample = newsgroups_all.data [0]
print ('Original document: ')
print (doc_sample)

print ('\n\nTokenized document: ')
words = []
for token in gensim.utils.tokenize (doc_sample):
    words.append (token)
print (words)

# check the output of the preprocess function
print ('\n\nPreprocessed document:  ')
print (preprocess (doc_sample))

Original document: 
Hi Xperts!

How can I move the cursor with the keyboard (i.e. cursor keys), 
if no mouse is available?

Any hints welcome.

Thanks.


Tokenized document: 
['Hi', 'Xperts', 'How', 'can', 'I', 'move', 'the', 'cursor', 'with', 'the', 'keyboard', 'i', 'e', 'cursor', 'keys', 'if', 'no', 'mouse', 'is', 'available', 'Any', 'hints', 'welcome', 'Thanks']


Preprocessed document:  
['xpert', 'cursor', 'keyboard', 'cursor', 'key', 'mous', 'avail', 'hint', 'welcom', 'thank']


**Inspect the processing output for a group of documents**

In [8]:
# iterate through the documents, such as through the list of the first 10 ones
for i in range (0, 15):
    print (str(i) + "\t" + ", ".join (preprocess (newsgroups_all.data[i]) [:15]))

0	xpert, cursor, keyboard, cursor, key, mous, avail, hint, welcom, thank
1	obtain, copi, open, look, widget, obtain, need, order, copi, thank, help, email
2	right, signal, strong, live, west, philadelphia, perfect, sport, fan, dream, especi, person, want, hear, team
3	canadian, thing, coach, boston, bruin, colorado, rocki, summari, post, gather, ongo, beef, year, convent, wisdom
4	heck, feel, like, time, includ, cafeteria, work, half, time, headach, intensifi, away, throw, imagin, guess
5	damn, right, late, climb, meet, morn, bother, right, foot, asleep, remind, fold, underneath, crunch, metatars
6	olympus, stylus, pocket, camera, smallest, class, includ, time, date, stamp, batteri, carri, case, reduct, timer
7	includ, follow, chmos, clock, generat, driver, processor, chmos, eras, prom, power, chmos, dynam, chmos, programm
8	chang, intel, discov, xclient, xload, longer, work, bomb, messag, error, open, display, unix, correct, share
9	termin, like, power, server, run, window, manag, spe

**Converitng word content of the documents into a dictionary**

In [9]:
processed_docs = []
for i in range (0, len (newsgroups_all.data)):
    processed_docs.append (preprocess (newsgroups_all.data [i]))
print (len (processed_docs))

dictionary = gensim.corpora.Dictionary (processed_docs)
print (len (dictionary))

# check what is stored in this dictionary (iterate through the first 10 items)
index = 0
for key, value in dictionary.iteritems ():
    print (key, value)
    index += 1
    if index > 15:
        break

9850
39350
0 avail
1 cursor
2 hint
3 key
4 keyboard
5 mous
6 thank
7 welcom
8 xpert
9 copi
10 email
11 help
12 look
13 need
14 obtain
15 open


**Performing further dimenisionality reduction on the documents**

In [10]:
# convert each document in the collection into a list of tuples and check the output
dictionary.filter_extremes (no_below = 10, no_above = 0.5, keep_n = 10000)
print (len (dictionary))

bow_corpus = [dictionary.doc2bow (doc) for doc in processed_docs]
print (bow_corpus [0])

5868
[(0, 1), (1, 2), (2, 1), (3, 1), (4, 1), (5, 1), (6, 1), (7, 1), (8, 1)]


**Check word stems behind IDs from the dictionary**

In [11]:
# extract a particular document from bow_corpus
bow_doc = bow_corpus [0]

# print out the IDs, corresponding word stems, and the number of occurences of these word stems
for i in range (len (bow_doc)):
    print (f"Key {bow_doc [i] [0]} =\"{dictionary [bow_doc [i] [0]]}\":\
    occurences = {bow_doc [i] [1]}")

Key 0 ="avail":    occurences = 1
Key 1 ="cursor":    occurences = 2
Key 2 ="hint":    occurences = 1
Key 3 ="key":    occurences = 1
Key 4 ="keyboard":    occurences = 1
Key 5 ="mous":    occurences = 1
Key 6 ="thank":    occurences = 1
Key 7 ="welcom":    occurences = 1
Key 8 ="xpert":    occurences = 1


**Run the LDA algorithm on our documents**

In [12]:
# initialize id2word to the dictionary where each word stem is mapped to a unique ID
id2word = dictionary

# initialize corpus to the bow_corpus  
corpus = bow_corpus

# create the algorithm
lda_model = gensim.models.ldamodel.LdaModel (corpus = corpus, id2word = id2word,
                                            num_topics = 10, random_state = 100,
                                            update_every = 1, chunksize = 1000,
                                            passes = 10, alpha = "symmetric",
                                            iterations = 100, per_word_topics = True)

# output all topics and for each of them print out its index and the most informative words identified
for index, topic in lda_model.print_topics (-1):
    print (f"Topic: {index} \nWords: {topic}")

Topic: 0 
Words: 0.021*"encrypt" + 0.018*"secur" + 0.018*"chip" + 0.016*"govern" + 0.013*"clipper" + 0.012*"public" + 0.010*"privaci" + 0.010*"key" + 0.010*"phone" + 0.009*"algorithm"
Topic: 1 
Words: 0.017*"appear" + 0.014*"copi" + 0.013*"cover" + 0.013*"star" + 0.013*"book" + 0.011*"penalti" + 0.010*"black" + 0.009*"comic" + 0.008*"blue" + 0.008*"green"
Topic: 2 
Words: 0.031*"window" + 0.015*"server" + 0.012*"program" + 0.012*"file" + 0.012*"applic" + 0.012*"display" + 0.011*"widget" + 0.010*"version" + 0.010*"motif" + 0.010*"support"
Topic: 3 
Words: 0.015*"space" + 0.007*"launch" + 0.007*"year" + 0.007*"medic" + 0.006*"patient" + 0.006*"orbit" + 0.006*"research" + 0.006*"diseas" + 0.005*"develop" + 0.005*"nasa"
Topic: 4 
Words: 0.018*"armenian" + 0.011*"peopl" + 0.008*"kill" + 0.008*"said" + 0.007*"turkish" + 0.006*"muslim" + 0.006*"jew" + 0.006*"govern" + 0.005*"state" + 0.005*"greek"
Topic: 5 
Words: 0.024*"price" + 0.021*"sale" + 0.020*"offer" + 0.017*"drive" + 0.017*"sell" + 0

**Identify the main topic for each document in the collection**

In [13]:
# fucntion takes as input the LDA model, corpus, and the original collection of texts
def analyse_topics(ldamodel, corpus, texts):
    main_topic = {}
    percentage = {}
    keywords = {}
    text_snippets = {}
    for i, topic_list in enumerate(ldamodel[corpus]):
        topic = topic_list[0]
        topic = sorted(topic, key=lambda x: (x[1]), reverse=True)
    for j, (topic_num, prop_topic) in enumerate(topic):
        if j == 0:
            wp = ldamodel.show_topic(topic_num)
            topic_keywords = ", ".join([word for word, prop in wp[:5]])
            main_topic[i] = int(topic_num)
            percentage[i] = round(prop_topic,4)
            keywords[i] = topic_keywords
            text_snippets[i] = texts[i][:8]
        else:
            break
    return main_topic, percentage, keywords, text_snippets

main_topic, percentage, keywords, text_snippets = analyse_topics(
    lda_model, bow_corpus, processed_docs)

**Print out the main topic for each document in the collection**

In [23]:
indexes = []
rows = []
for i in range(0, 10):
    indexes.append(i)
    rows.append(['ID', 'Main Topic', 'Contribution (%)', 'Keywords', 'Snippet'])
    
for idx in indexes:
    rows.append([str(idx), f"{main_topic.get(idx)}",
                 f"{percentage.get(idx)}:.4f",
                 f"{keywords.get(idx)}\n",
                 f"{text_snippets.get(idx)}"])
columns = zip(*rows)
column_widths = [max(len(item) for item in col) for col in columns]
for row in rows:
    print(''.join(' width{} .'.format(row[i], width=column_widths[i])
                  for i in range(0, len(row))))

 widthID . widthMain Topic . widthContribution (%) . widthKeywords . widthSnippet .
 widthID . widthMain Topic . widthContribution (%) . widthKeywords . widthSnippet .
 widthID . widthMain Topic . widthContribution (%) . widthKeywords . widthSnippet .
 widthID . widthMain Topic . widthContribution (%) . widthKeywords . widthSnippet .
 widthID . widthMain Topic . widthContribution (%) . widthKeywords . widthSnippet .
 widthID . widthMain Topic . widthContribution (%) . widthKeywords . widthSnippet .
 widthID . widthMain Topic . widthContribution (%) . widthKeywords . widthSnippet .
 widthID . widthMain Topic . widthContribution (%) . widthKeywords . widthSnippet .
 widthID . widthMain Topic . widthContribution (%) . widthKeywords . widthSnippet .
 widthID . widthMain Topic . widthContribution (%) . widthKeywords . widthSnippet .
 width0 . widthNone . widthNone:.4f . widthNone
 . widthNone .
 width1 . widthNone . widthNone:.4f . widthNone
 . widthNone .
 width2 . widthNone . widthNone:.4

**Visualize the output of LDA using pyLDAvis**

In [59]:
pprint(lda_model.print_topics())
doc_lda = lda_model[corpus]

[(0,
  '0.021*"encrypt" + 0.018*"secur" + 0.018*"chip" + 0.016*"govern" + '
  '0.013*"clipper" + 0.012*"public" + 0.010*"privaci" + 0.010*"key" + '
  '0.010*"phone" + 0.009*"algorithm"'),
 (1,
  '0.017*"appear" + 0.014*"copi" + 0.013*"cover" + 0.013*"star" + 0.013*"book" '
  '+ 0.011*"penalti" + 0.010*"black" + 0.009*"comic" + 0.008*"blue" + '
  '0.008*"green"'),
 (2,
  '0.031*"window" + 0.015*"server" + 0.012*"program" + 0.012*"file" + '
  '0.012*"applic" + 0.012*"display" + 0.011*"widget" + 0.010*"version" + '
  '0.010*"motif" + 0.010*"support"'),
 (3,
  '0.015*"space" + 0.007*"launch" + 0.007*"year" + 0.007*"medic" + '
  '0.006*"patient" + 0.006*"orbit" + 0.006*"research" + 0.006*"diseas" + '
  '0.005*"develop" + 0.005*"nasa"'),
 (4,
  '0.018*"armenian" + 0.011*"peopl" + 0.008*"kill" + 0.008*"said" + '
  '0.007*"turkish" + 0.006*"muslim" + 0.006*"jew" + 0.006*"govern" + '
  '0.005*"state" + 0.005*"greek"'),
 (5,
  '0.024*"price" + 0.021*"sale" + 0.020*"offer" + 0.017*"drive" + 0.017

**Transformers**

In [1]:
# Transformer and Torch Installation
try:
  import transformers
except:
  print("Installing transformers")
  !pip -qq install transformers

try:
  import torch
except:
  print("Installing Torch")
  !pip -qq install torch

In [2]:
# SST-2 Binary Classification
from transformers import pipeline

nlp = pipeline("sentiment-analysis")

print(nlp("If you sometimes like to go to the movies to have fun , Wasabi is a good place to start ."),"If you sometimes like to go to the movies to have fun , Wasabi is a good place to start .")
print(nlp("Effective but too-tepid biopic."),"Effective but too-tepid biopic.")

No model was supplied, defaulted to distilbert-base-uncased-finetuned-sst-2-english and revision af0f99b (https://huggingface.co/distilbert-base-uncased-finetuned-sst-2-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading config.json:   0%|          | 0.00/629 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Downloading tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

[{'label': 'POSITIVE', 'score': 0.9998257756233215}] If you sometimes like to go to the movies to have fun , Wasabi is a good place to start .
[{'label': 'NEGATIVE', 'score': 0.9974064230918884}] Effective but too-tepid biopic.


In [3]:
# Sequence Classification : paraphrase classification
from transformers import AutoTokenizer, TFAutoModelForSequenceClassification
import tensorflow as tf

tokenizer = AutoTokenizer.from_pretrained("bert-base-cased-finetuned-mrpc")
model = TFAutoModelForSequenceClassification.from_pretrained("bert-base-cased-finetuned-mrpc")

classes = ["not paraphrase", "is paraphrase"]

sequence_A = "The DVD-CCA then appealed to the state Supreme Court."
sequence_B = "The DVD CCA appealed that decision to the U.S. Supreme Court."

paraphrase = tokenizer.encode_plus(sequence_A, sequence_B, return_tensors="tf")

paraphrase_classification_logits = model(paraphrase)[0]

paraphrase_results = tf.nn.softmax(paraphrase_classification_logits, axis=1).numpy()[0]

print(sequence_B, "should be a paraphrase")
for i in range(len(classes)):
    print(f"{classes[i]}: {round(paraphrase_results[i] * 100)}%")

Downloading tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

Downloading config.json:   0%|          | 0.00/433 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/436k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/433M [00:00<?, ?B/s]

All PyTorch model weights were used when initializing TFBertForSequenceClassification.

All the weights of TFBertForSequenceClassification were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFBertForSequenceClassification for predictions without further training.


The DVD CCA appealed that decision to the U.S. Supreme Court. should be a paraphrase
not paraphrase: 8%
is paraphrase: 92%


In [4]:
# Named Entity Recognition(NER)
from transformers import pipeline
nlp = pipeline("ner")
sequence = "Hugging Face Inc. is a company based in New York City. Its headquarters are in DUMBO, therefore very" \
           "close to the Manhattan Bridge which is visible from the window."
print(nlp(sequence))

No model was supplied, defaulted to dbmdz/bert-large-cased-finetuned-conll03-english and revision f2482bf (https://huggingface.co/dbmdz/bert-large-cased-finetuned-conll03-english).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/1.33G [00:00<?, ?B/s]

Some weights of the model checkpoint at dbmdz/bert-large-cased-finetuned-conll03-english were not used when initializing BertForTokenClassification: ['bert.pooler.dense.weight', 'bert.pooler.dense.bias']
- This IS expected if you are initializing BertForTokenClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForTokenClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Downloading tokenizer_config.json:   0%|          | 0.00/60.0 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/213k [00:00<?, ?B/s]

[{'entity': 'I-ORG', 'score': 0.9995635, 'index': 1, 'word': 'Hu', 'start': 0, 'end': 2}, {'entity': 'I-ORG', 'score': 0.99159384, 'index': 2, 'word': '##gging', 'start': 2, 'end': 7}, {'entity': 'I-ORG', 'score': 0.99826705, 'index': 3, 'word': 'Face', 'start': 8, 'end': 12}, {'entity': 'I-ORG', 'score': 0.9994404, 'index': 4, 'word': 'Inc', 'start': 13, 'end': 16}, {'entity': 'I-LOC', 'score': 0.99943465, 'index': 11, 'word': 'New', 'start': 40, 'end': 43}, {'entity': 'I-LOC', 'score': 0.99932706, 'index': 12, 'word': 'York', 'start': 44, 'end': 48}, {'entity': 'I-LOC', 'score': 0.9993864, 'index': 13, 'word': 'City', 'start': 49, 'end': 53}, {'entity': 'I-LOC', 'score': 0.98256207, 'index': 19, 'word': 'D', 'start': 79, 'end': 80}, {'entity': 'I-LOC', 'score': 0.93698275, 'index': 20, 'word': '##UM', 'start': 80, 'end': 82}, {'entity': 'I-LOC', 'score': 0.89870965, 'index': 21, 'word': '##BO', 'start': 82, 'end': 84}, {'entity': 'I-LOC', 'score': 0.97582406, 'index': 29, 'word': 'Ma

In [5]:
# Winograd
from transformers import pipeline
translator = pipeline("translation_en_to_fr")

No model was supplied, defaulted to t5-base and revision 686f1db (https://huggingface.co/t5-base).
Using a pipeline without specifying a model name and revision in production is not recommended.


Downloading config.json:   0%|          | 0.00/1.21k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/892M [00:00<?, ?B/s]

Downloading generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

Downloading spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

Downloading tokenizer.json:   0%|          | 0.00/1.39M [00:00<?, ?B/s]

For now, this behavior is kept to avoid breaking backwards compatibility when padding/encoding with `truncation is True`.
- Be aware that you SHOULD NOT rely on t5-base automatically truncating your input to 512 when padding/encoding.
- If you want to encode/pad to sequences longer than 512 you can either instantiate this tokenizer with `model_max_length` or pass `max_length` when encoding/padding.


In [6]:
print(translator("The terminators of the Skenet go to the attack.", max_length=40))

[{'translation_text': "Les terminateurs du Skenet vont à l'attaque."}]
