In [28]:
#Importing libraries

import pandas as pd
import numpy as np
import nltk.corpus

In [29]:
#Fetching data from nltk.corpu.gutenberg
title = []
description = []

for ids in nltk.corpus.gutenberg.fileids():
    words = nltk.corpus.gutenberg.sents(ids)
    sentences = [" ".join(list_of_words) for list_of_words in words]
    title.append(ids)
    description.append(str("".join(sentences)))

for ids in nltk.corpus.inaugural.fileids():
    words_in = nltk.corpus.inaugural.sents(ids)
    sentences_in = [" ".join(list_of_words) for list_of_words in words_in]
    title.append(ids)
    description.append(str("".join(sentences_in)))
    
#Creating the dataset
df = pd.DataFrame(list(zip(title,description)), 
               columns =['Title', 'Description']) 

#Fetch wordcount for each description
df['Word_Count'] = df['Description'].apply(lambda x: len(str(x).split(" ")))

In [30]:
#Descriptive statistics of word count
df['Word_Count'].describe()

count        76.000000
mean      35106.000000
std      120596.098759
min         144.000000
25%        1763.500000
50%        2634.000000
75%        6245.500000
max      980552.000000
Name: Word_Count, dtype: float64

In [31]:
#Identify common words
freq = pd.Series(' '.join(df['Description']).split()).value_counts()[:20]
#freq = pd.Series(' '.join(df['Description']).split()).value_counts()[-20:]

In [32]:
#Libraries for text preprocessing
import re
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer

In [33]:
stop_words = set(stopwords.words("english"))
#stop_words.remove('not')

In [34]:
corpus = []
def data_preprocessing(data):
    
    for i in range(0, data['Title'].count()):
        #Remove punctuations
        text = re.sub('[^a-zA-Z]', ' ', data['Description'][i])
    
        #Convert to lowercase
        text = text.lower()
    
        #Remove tags
        text=re.sub("&lt;/?.*?&gt;"," &lt;&gt; ",text)
    
        #Remove special characters and digits
        text=re.sub("(\\d|\\W)+"," ",text)
    
        #Convert to list from string
        text = text.split()
    
        #Stemming
        #ps=PorterStemmer()
        #Lemmatisation
        lem = WordNetLemmatizer()
        text = [lem.lemmatize(word) for word in text if not word in  
                stop_words] 
        text = " ".join(text)
        #corpus.append(text)
        data['Corpus'][i] = text
    return data

In [35]:
#Most frequently occuring words
def get_top_n_words(corpus, ngram, n=None):
    vec = CountVectorizer(ngram).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in      
                   vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], 
                       reverse=True)
    return words_freq[:n]


In [36]:
def plot_freq_words(vis_df):
    #Barplot of most freq words
    import matplotlib.pyplot as plt
    import seaborn as sns
    %matplotlib inline

    sns.set(rc={'figure.figsize':(13,8)})
    g = sns.barplot(x=vis_df.columns.values[0], y=vis_df.columns.values[1], data=vis_df)
    g.set_xticklabels(g.get_xticklabels(), rotation=30)
    return

In [37]:
def data_visualization(data):
    
    #Most frequently occuring words
    top_words = get_top_n_words(data['Corpus'], ngram="", n=20)
    top_df = pd.DataFrame(top_words)
    top_df.columns=["Word", "Freq"]
    plot_freq_words(top_df)

    #Most frequently occuring Bi-grams
    ngram = '''ngram_range=(2,2), 
               max_features=2000'''
    top2_words = get_top_n_words(data['Corpus'], ngram, n=20)
    top2_df = pd.DataFrame(top2_words)
    top2_df.columns=["Bi-gram", "Freq"]
    #print(top2_df)
    plot_freq_words(top2_df)

    #Most frequently occuring Tri-grams
    ngram = '''ngram_range=(3,3), 
               max_features=2000'''
    top3_words = get_top_n_words(data['Corpus'], ngram, n=20)
    top3_df = pd.DataFrame(top3_words)
    top3_df.columns=["Tri-gram", "Freq"]
    #print(top3_df)
    plot_freq_words(top3_df)
    return

In [81]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer

def data_processing(data):
    #Create a vector of word count
    cv=CountVectorizer(max_df=0.8,stop_words=stop_words, max_features=10000, ngram_range=(1,3))
    X=cv.fit_transform(data['Corpus'])
    
    tfidf_transformer=TfidfTransformer(smooth_idf=True,use_idf=True)
    tfidf_transformer.fit(X)
    
    #Get feature names
    feature_names=cv.get_feature_names()
 
    #Fetch document for which keywords needs to be extracted
    #doc=data['Corpus'][0]
    for i in range(0,data['Title'].count()):
        
        keywords = ""
        #Generate tf-idf for the given document
        tf_idf_vector = tfidf_transformer.transform(cv.transform([data['Description'][i]]))
    
        #Sort the tf-idf vectors by descending order of scores
        sorted_items = sort_coo(tf_idf_vector.tocoo())
    
        #Extract only the top n; n here is 10
        keywords = extract_topn_from_vector(feature_names,sorted_items,5)
        data['Keywords'][i] = " ".join(keywords)
 
    return data
    #Now print the results
    #print("\n===Description===")
    #print(doc)
    #print("\n===Keywords===")
    #for k in keywords:
        #print(k,keywords[k])

In [82]:
#Function for sorting tf_idf in descending order
from scipy.sparse import coo_matrix
def sort_coo(coo_matrix):
    tuples = zip(coo_matrix.col, coo_matrix.data)
    return sorted(tuples, key=lambda x: (x[1], x[0]), reverse=True)
 
def extract_topn_from_vector(feature_names, sorted_items, topn=10):
    #Get the feature names and tf-idf score of top n items
    
    #Use only topn items from vector
    sorted_items = sorted_items[:topn]
 
    score_vals = []
    feature_vals = []
    
    #Word index and corresponding tf-idf score
    for idx, score in sorted_items:
        
        #Keep track of feature name and its corresponding score
        score_vals.append(round(score, 3))
        feature_vals.append(feature_names[idx])
 
    #Create a tuples of feature,score
    #Results = zip(feature_vals,score_vals)
    results = {}
    for idx in range(len(feature_vals)):
        results[feature_vals[idx]] = score_vals[idx]
    
    return results

In [83]:
df['Corpus'] = ""
df['Keywords'] = ""
df = data_preprocessing(df)
#data_visualization(df)

#Copying dataset to avoid any overwrite
df2 = df.copy()
df2 = data_processing(df2)


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [84]:
df2.head()

Unnamed: 0,Title,Description,Word_Count,Corpus,Keywords
0,austen-emma.txt,[ Emma by Jane Austen 1816 ]VOLUME ICHAPTER IE...,184733,emma jane austen volume ichapter iemma woodhou...,emma mr harriet weston knightley
1,austen-persuasion.txt,[ Persuasion by Jane Austen 1818 ]Chapter 1Sir...,94426,persuasion jane austen chapter sir walter elli...,anne elliot wentworth captain captain wentworth
2,austen-sense.txt,[ Sense and Sensibility by Jane Austen 1811 ]C...,136619,sense sensibility jane austen chapter family d...,elinor marianne dashwood jennings willoughby
3,bible-kjv.txt,[ The King James Bible ]The Old Testament of t...,980552,king james bible old testament king james bibl...,unto lord thou thy thee
4,blake-poems.txt,[ Poems by William Blake 1789 ]SONGS OF INNOCE...,7918,poem william blake song innocence experience b...,thee thou weep thy little


In [85]:
title = ["Test1"]
description = ['''Researchers say they have discovered the first nearby “super-Earth” planet that could possibly support life.
                 The discovery was made by a spacecraft designed to look for planets outside of our solar system. An international team of astronomers recently announced their findings in the publication Astronomy & Astrophysics.
                 The new discovery is known as an exoplanet, a planet that orbits a star outside of our own solar system. Exoplanets are not easy to identify with telescopes. This is because they are often hidden by the bright light of the stars they orbit.
                 In an effort to help discover such planets, NASA launched its Transiting Exoplanet Survey Satellite, or TESS, last year. TESS was designed to expand upon the work of previous exoplanet discoveries made with telescopes. 
                 More than 3,500 new exoplanets have been discovered over the past 20 years.''']
corpus = " "
keywords = " "
df_test = pd.DataFrame(list(zip(title,description,corpus,keywords)), 
               columns =['Title', 'Description', 'Corpus', 'Keywords']) 

#df_test = data_preprocessing(df_test)
df_test.head()

Unnamed: 0,Title,Description,Corpus,Keywords
0,Test1,Researchers say they have discovered the first...,,


In [89]:
title1 = "Test2"
description1 = '''Application login is not working. External users are trying to process their payments via IVR. 
                 This has increased the call volume by many folds. The agents are requesting to fix this issue as soon as possible.'''
df_test = df_test.append(pd.Series([title1, description1,'',''], index=df_test.columns ), ignore_index=True)
 
df_test.head()

Unnamed: 0,Title,Description,Corpus,Keywords
0,Test1,Researchers say they have discovered the first...,,
1,Test2,Application login is not working. External use...,,
2,Test2,Application login is not working. External use...,,
3,Test2,Application login is not working. External use...,,
4,Test2,Application login is not working. External use...,,


In [92]:
stop_words.remove('not')
df_test = data_preprocessing(df_test)
df_test.head()

Unnamed: 0,Title,Description,Corpus,Keywords
0,Test1,Researchers say they have discovered the first...,researcher say discovered first nearby super e...,exoplanet system solar system solar planet
1,Test2,Application login is not working. External use...,application login not working external user tr...,working external working volume many volume vi...
2,Test2,Application login is not working. External use...,application login not working external user tr...,working external working volume many volume vi...
3,Test2,Application login is not working. External use...,application login not working external user tr...,working external working volume many volume vi...
4,Test2,Application login is not working. External use...,application login not working external user tr...,working external working volume many volume vi...


In [93]:
df_test = data_processing(df_test)
df_test.head()

Unnamed: 0,Title,Description,Corpus,Keywords
0,Test1,Researchers say they have discovered the first...,researcher say discovered first nearby super e...,exoplanet system solar system solar planet
1,Test2,Application login is not working. External use...,application login not working external user tr...,working external working volume many volume vi...
2,Test2,Application login is not working. External use...,application login not working external user tr...,working external working volume many volume vi...
3,Test2,Application login is not working. External use...,application login not working external user tr...,working external working volume many volume vi...
4,Test2,Application login is not working. External use...,application login not working external user tr...,working external working volume many volume vi...


In [96]:
df_test['Keywords'][1]

'working external working volume many volume via ivr increased'