In [4]:
import pandas as pd
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer 

dataframe = pd.read_csv('training_set_rel3.tsv', encoding = 'latin-1',sep='\t')
dataframe = dataframe[['essay_id','essay_set','essay','domain1_score']]
nlp = spacy.load('en_core_web_md')
essays = dataframe
# ----------- Isolate essays from the 6th set ------------ #
essays = essays[(essays['essay_set'] == 6)]
essays.dropna(axis=1, how='all', inplace=True)
essays = essays.iloc[:50]
 


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  essays.dropna(axis=1, how='all', inplace=True)


In [10]:
# settings that you use for count vectorizer will go here 
tfidf_vectorizer=TfidfVectorizer(use_idf=True) 
 
# just send in all your docs here 
tfidf_vectorizer_vectors=tfidf_vectorizer.fit_transform(essays['essay'])

def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)

tfidf_vector = tfidf_vectorizer_vectors[1]

sorted_items=sort_coo(tfidf_vector.tocoo())

def extract_topn_from_vector(feature_names, sorted_items, topn=20):
    """get the feature names and tf-idf score of top n items"""
    
    #use only topn items from vector
    sorted_items = sorted_items[:topn]

    score_vals = []
    feature_vals = []
    
    # word index and corresponding tf-idf score
    for idx, score in sorted_items:
        
        #keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])

    #create a tuples of feature,score
    #results = zip(feature_vals,score_vals)
    results= {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]]=score_vals[idx]
    
    return results

feature_names=tfidf_vectorizer.get_feature_names()
keywords=extract_topn_from_vector(feature_names,sorted_items,20)

print("\n===Keywords===")
for k in keywords:
    print(k,keywords[k])

print(essays['essay'].iloc[0])


===Keywords===
the 0.548
other 0.18
passage 0.175
as 0.152
about 0.143
of 0.14
below 0.125
would 0.124
city 0.116
people 0.112
that 0.111
and 0.111
in 0.105
populated 0.102
worked 0.097
walked 0.097
talks 0.097
paragraphs 0.097
mentions 0.097
lives 0.097
There were many obstacles that the builders faced in attempting to have dirigibles dock at the Empire State Building. Safety was an issue with the dirigibles because of the highly flammable gas hydrogen. So, if the dirigibles explode over New York many people could be killed in the very populated. Another problem is keeping the derigible @CAPS1 so people can get off while the wind moves the back end of itself. The only way was to tie lead weights down in the back of the dirigible, but it would be dangerous over people down below. So, these obstacles workers faced, never got past them and so they made the world's highest soda fountain and tea garden for tourists in the observation area.


In [32]:
import spacy
import pytextrank

nlp = spacy.load('en_core_web_md')


nlp.add_pipe("positionrank")

doc = nlp(essays['essay'].iloc[1].lower())
for p in doc._.phrases:
    print('{:.4f} {:5d}  {}'.format(p.rank, p.count, p.text))
    print(p.chunks)

0.1211     2  dirigibles
[dirigibles, dirigibles]
0.1208     1  several other buildings
[several other buildings]
0.1114     1  many problems
[many problems]
0.1091     1  other people
[other people]
0.0971     1  new york city
[new york city]
0.0955     1  other safety issues
[other safety issues]
0.0716     1  the empire state building
[the empire state building]
0.0623     1  some @num1 feet
[some @num1 feet]
0.0591     1  course
[course]
0.0572     1  the start
[the start]
0.0560     1  the dirigibles
[the dirigibles]
0.0527     1  the air
[the air]
0.0510     3  the building
[the building, the building, the building]
0.0485     1  the major problem
[the major problem]
0.0430     1  a highly populated area
[a highly populated area]
0.0417     1  the people
[the people]
0.0400     1  pedestrians
[pedestrians]
0.0376     1  the city streets
[the city streets]
0.0349     1  such a densely populated area
[such a densely populated area]
0.0337     1  the high winds
[the high winds]
0.03

In [33]:
from rake_nltk import Rake
f = essays['essay'].iloc[0]
r = Rake(max_length=3) # Uses stopwords for english from NLTK, and all puntuation characters.

r.extract_keywords_from_text(f)

ini = r.get_ranked_phrases_with_scores()
ini

[(9.0, 'tie lead weights'),
 (9.0, 'never got past'),
 (9.0, 'highest soda fountain'),
 (9.0, 'empire state building'),
 (8.0, 'obstacles workers faced'),
 (4.5, 'many obstacles'),
 (4.5, 'builders faced'),
 (4.0, 'wind moves'),
 (4.0, 'tea garden'),
 (4.0, 'observation area'),
 (4.0, 'another problem'),
 (3.666666666666667, 'dirigibles explode'),
 (3.666666666666667, 'dirigibles dock'),
 (3.5, 'back end'),
 (1.6666666666666667, 'dirigibles'),
 (1.5, 'back'),
 (1.0, 'would'),
 (1.0, 'world'),
 (1.0, 'way'),
 (1.0, 'tourists'),
 (1.0, 'safety'),
 (1.0, 'populated'),
 (1.0, 'people'),
 (1.0, 'made'),
 (1.0, 'killed'),
 (1.0, 'keeping'),
 (1.0, 'issue'),
 (1.0, 'get'),
 (1.0, 'dirigible'),
 (1.0, 'derigible'),
 (1.0, 'dangerous'),
 (1.0, 'caps1'),
 (1.0, 'attempting')]