In [1]:
# Create a workflow for each process, e.g. CountVectorizer, using CSV data, using imported data, etc.
# Isolate important functions and note what they do.
# Collect resources and definitions.

# Imports

In [None]:
# Classics
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

# Base
from collections import Counter
import re
import os

# Plotting
import squarify
import matplotlib.pyplot as plt
import seaborn as sns

# NLP Libraries
import spacy
from spacy.tokenizer import Tokenizer

# Load the spaCy pipeline named "en_core_web_lg"; `nlp` is used by the
# stop-word and lemmatization cells below.
nlp = spacy.load("en_core_web_lg")
# Stand-alone tokenizer built on the pipeline's vocabulary; used by the
# Tokenization cell via tokenizer.pipe(...).
tokenizer = Tokenizer(nlp.vocab)

# Vector Representations
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.neighbors import NearestNeighbors
from sklearn.decomposition import PCA

# Import Data (file folder)

In [None]:
def gather_data(filefolder, encoding='utf-8'):
    """ Produces List of Documents from a Directory

    filefolder (str): a path of .txt files; only direct children of the
        directory are read, sub-directories are not searched
    encoding (str): text encoding used to decode each file

    returns list of strings, one per .txt file, ordered by file name

    raises OSError if the directory cannot be listed or a file cannot be
    opened, and UnicodeDecodeError if a file is not valid in `encoding`
    """

    data = []

    # os.listdir returns names in arbitrary order; sorting keeps the document
    # order (and the row order of any matrix built from it) reproducible.
    for article in sorted(os.listdir(filefolder)):

        path = os.path.join(filefolder, article)

        # Require a real ".txt" extension on a regular file, so names that
        # merely end in "txt" and directories are skipped.
        if article.endswith('.txt') and os.path.isfile(path):
            # Text mode so the result really is a list of str, as documented.
            with open(path, 'r', encoding=encoding) as f:
                data.append(f.read())

    return data

# Read the text files found in ./data (relative to the working directory)
# into the list `data`, which the vectorizer cells below consume.
data = gather_data('./data')

# Tokenization

In [None]:
# Tokenize every description with the stand-alone `tokenizer`, streaming the
# column through tokenizer.pipe in batches of 500 and keeping each token's text.
# NOTE(review): `df` is not created in any cell visible here; it must already
# exist and have a 'description' column.
tokens = [
    [token.text for token in parsed]
    for parsed in tokenizer.pipe(df['description'], batch_size=500)
]

# Store the token lists next to the source text and preview the first rows.
df['tokens'] = tokens
df['tokens'].head()

# Extending Stop Words

In [None]:
# Extended stop-word set: the pipeline's default stop words plus the entries
# of the list passed to union().
# NOTE: the list currently holds only a placeholder string, which is added to
# the set verbatim -- replace it with real words, e.g. ['word1', 'word2'].
# NOTE(review): no cell visible here consumes STOP_WORDS (get_lemmas filters on
# token.is_stop instead) -- confirm how this set is meant to be used.
STOP_WORDS = nlp.Defaults.stop_words.union(['''Insert additional stop words here'''])

# Lemmatization

In [None]:
def get_lemmas(text):
    """ Produces List of Lemmas from a Text

    text: value passed straight to the module-level `nlp` pipeline

    returns list with token.lemma_ for every token that is not a stop word,
    not punctuation, and whose pos_ is not 'PRON'
    """
    return [
        token.lemma_
        for token in nlp(text)
        if not token.is_stop and not token.is_punct and token.pos_ != 'PRON'
    ]

# Run get_lemmas on every value of one column and store the resulting lists
# in a new 'lemmas' column.
# NOTE(review): 'Column Name' looks like a placeholder -- substitute the real
# text column; `df` itself is not created in any cell visible here.
df['lemmas'] = df['Column Name'].apply(get_lemmas)

# Count Function (example)

In [None]:
def count(docs):
    """ Produces a word-frequency table from tokenized documents

    docs: a sized iterable (list, pandas Series, ...) whose items are
        iterables of hashable tokens, one item per document

    returns DataFrame sorted by rank with the columns
        word            the token
        appears_in      number of documents containing the token
        count           total occurrences across all documents
        rank            1 = most frequent; ties broken by first appearance
        pct_total       count divided by the sum of all counts
        cul_pct_total   running sum of pct_total in rank order
        appears_in_pct  appears_in divided by the number of documents
    """
    word_counts = Counter()
    appears_in = Counter()

    total_docs = len(docs)

    for doc in docs:
        word_counts.update(doc)
        # set() so each document contributes at most once per word
        appears_in.update(set(doc))

    wc = pd.DataFrame(list(word_counts.items()), columns=['word', 'count'])

    wc['rank'] = wc['count'].rank(method='first', ascending=False)

    # Vectorised division instead of a per-element .apply(lambda ...)
    wc['pct_total'] = wc['count'] / wc['count'].sum()

    # The cumulative share is only meaningful once rows are in rank order
    wc = wc.sort_values(by='rank')
    wc['cul_pct_total'] = wc['pct_total'].cumsum()

    ac = pd.DataFrame(list(appears_in.items()), columns=['word', 'appears_in'])
    wc = ac.merge(wc, on='word')

    wc['appears_in_pct'] = wc['appears_in'] / total_docs

    # The merge follows the key order of `ac`, so restore rank order
    return wc.sort_values(by='rank')

# Squarify Plot

In [None]:
# Build the frequency table for the lemmas, then keep only the rows ranked
# 1 through 20 for the treemap.
wc = count(df['lemmas'])
wc_top20 = wc.loc[wc['rank'] <= 20]

# Tile area follows each word's share of all counts; axes are hidden.
squarify.plot(
    sizes=wc_top20['pct_total'],
    label=wc_top20['word'],
    alpha=.8,
)
plt.axis('off')
plt.show()

# CountVectorizer

In [None]:
# Create the transformer: drop English stop words and ignore terms that occur
# in fewer than 5% or in more than 90% of the documents.
vect = CountVectorizer(stop_words = 'english', min_df = 0.05, max_df = 0.90)

# Learn the vocabulary and build the sparse document-term matrix in one pass
sparse_dtm = vect.fit_transform(data)

# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); prefer the new name and fall back on older installs.
feature_names = (
    vect.get_feature_names_out()
    if hasattr(vect, 'get_feature_names_out')
    else vect.get_feature_names()
)

# Dense document-term matrix: one row per document, one column per term.
# toarray() yields a plain ndarray rather than the np.matrix from todense().
dtm = pd.DataFrame(sparse_dtm.toarray(), columns = feature_names)

# Tfidf Vectorizer

In [None]:
# Instantiate vectorizer object
tfidf = TfidfVectorizer(stop_words = 'english')

# Learn the vocabulary and compute TF-IDF weights per document
sparse = tfidf.fit_transform(data)

# scikit-learn 1.0 introduced get_feature_names_out() and 1.2 removed
# get_feature_names(); prefer the new name and fall back on older installs.
feature_names = (
    tfidf.get_feature_names_out()
    if hasattr(tfidf, 'get_feature_names_out')
    else tfidf.get_feature_names()
)

# Use the feature names as dataframe column headers.
# toarray() yields a plain ndarray rather than the np.matrix from todense().
dtm = pd.DataFrame(sparse.toarray(), columns = feature_names)

# View Feature Matrix as DataFrame
dtm.head()

# K-Nearest Neighbors

In [None]:
# Fit on TF-IDF Vectors (`dtm` is whichever document-term frame was built last)
nn  = NearestNeighbors(n_neighbors=5, algorithm='ball_tree')
nn.fit(dtm)

# Query Using kneighbors with the first document.
# iloc[[0]] keeps a one-row DataFrame, so the query carries the same column
# names the model was fitted with (a bare list of Series drops them and makes
# scikit-learn 1.0+ warn about missing feature names).
nn.kneighbors(dtm.iloc[[0]])