In [1]:
import numpy as np
import pandas as pd
import re
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfTransformer 

In [2]:
# Read the entire text file and divide it into sentence-like fragments.
# A fragment ends at ".", "?" or "!" followed by one whitespace character,
# or at a newline.
# NOTE(review): this rule also splits abbreviations such as "U.S. pop" into two
# fragments; a real sentence tokenizer would be needed to avoid that.
# NOTE(review): no encoding is passed to open(), so the platform default is
# used - confirm the file's encoding and pass it explicitly.
sentences = list()
with open("beatles_biography.txt") as file:
    for line in file:
        # The point where the sentence ends
        for fragment in re.split(r"\.\s|\?\s|\!\s|\n", line):
            # Drop the stray spaces left behind by double-spaced sentences and
            # skip fragments that are empty or whitespace-only, so they are
            # not counted as sentences.
            fragment = fragment.strip()
            if fragment:
                sentences.append(fragment)

In [3]:
# Number of sentence fragments currently held in `sentences`.
len(sentences)

116

In [11]:
# Preview the first ten fragments to sanity-check the splitting.
sentences[:10]

['English rock and roll band',
 'In the 1960s a new band known as the Beatles burst on the pop music scene and changed it forever',
 'Band members included George Harrison (1943?2001), John Lennon (1940?1980), Paul McCartney (1942?), and Ringo Starr (1940?)',
 'With the release of three anthologies (collections) in the mid-1990s, the Beatles remain one of the best-selling musical groups of all time',
 'Early days',
 'The Beatles came from Liverpool, England, and were originally inspired by the simple guitar-and-washboard style "skiffle" music',
 'Skiffle was a lively type of acoustic (nonelectric) music that used songs from British and American folk and popular music',
 'Later such U.S',
 'pop artists as Elvis Presley (1935?1977), Buddy Holly (1936?1959), and Little Richard (1932?) influenced them',
 'All four members of the Beatles had an early interest in music']

In [5]:
# Count unigrams and bigrams per sentence:
#   - English stop words are removed,
#   - min_df=3   keeps only terms found in at least 3 sentences,
#   - max_df=0.5 drops terms found in more than half of the sentences.
cvec = CountVectorizer(
    ngram_range=(1, 2),
    stop_words='english',
    min_df=3,
    max_df=0.5,
)
sf = cvec.fit_transform(sentences)

In [15]:
# print() on a sparse row lists its stored entries as "(row, column)  value";
# this shows the term counts recorded for the first sentence.
print(sf[0])

  (0, 7)	1
  (0, 46)	1
  (0, 47)	1
  (0, 48)	1


In [18]:
# Show the first five vocabulary terms learned by the vectorizer.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(cvec, "get_feature_names_out"):
    vocabulary_terms = cvec.get_feature_names_out()
else:
    vocabulary_terms = cvec.get_feature_names()
# list() keeps the displayed result a plain Python list in both cases.
list(vocabulary_terms[:5])

['1964', '1968', '1970', 'album', 'albums']

In [7]:
# Learn the inverse-document-frequency weights from the count matrix, then
# apply them to that same matrix to obtain TF-IDF scores.
transformer = TfidfTransformer()
transformed_weights = transformer.fit(sf).transform(sf)

In [16]:
# Stored TF-IDF entries of the first sentence, listed as "(row, column)  weight".
print(transformed_weights[0])

  (0, 48)	0.5157629658743857
  (0, 47)	0.5157629658743857
  (0, 46)	0.5157629658743857
  (0, 7)	0.4494059290856895


In [9]:
# Average each term's TF-IDF weight over all sentences (column-wise mean),
# flattened to a plain Python list with one value per vocabulary term.
weights = np.asarray(transformed_weights.mean(axis=0)).ravel().tolist()

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(cvec, "get_feature_names_out"):
    terms = cvec.get_feature_names_out()
else:
    terms = cvec.get_feature_names()

# One row per vocabulary term, paired with its mean weight.
weights_df = pd.DataFrame({'term': list(terms), 'weight': weights})

In [10]:
# Rank the terms from highest to lowest mean weight and display the top ten.
ranked_terms = weights_df.sort_values(by='weight', ascending=False)
ranked_terms.head(10)

Unnamed: 0,term,weight
8,beatles,0.150009
3,album,0.055008
26,lennon,0.054275
51,songs,0.051386
43,released,0.050173
19,group,0.05013
30,mccartney,0.049464
16,film,0.04584
33,new,0.041649
7,band,0.038203
