In [1]:
from IPython.core.display import HTML
HTML("<style>.container { width:100% !important; }</style>")

In [38]:
import nltk
import numpy
from nltk import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from collections import Counter
import pandas as pd

pd.set_option('display.max_rows', 30)
pd.set_option('display.max_columns', 11)
pd.set_option('display.width', 230)

def remove_file_extension(string, file_extension):
    if string.endswith(file_extension):
        return string[:-len(file_extension)]
    return string


def create_lexicon(file_name, n):
    #nltk.download('wordnet')
    #nltk.download("stopwords")
    #nltk.download("punkt")
    stop_words = set(stopwords.words("english"))

    lemmitizer = WordNetLemmatizer()
    lexicon=[]
    tokenizer = RegexpTokenizer(r'\w+')
    with open(file_name, 'r') as f:
        # f is the fileobject
        # contents is a list of strings where each string is a line
        # of the text in the fileobject
        contents = f.readlines()
  
        for l in contents[:]:
            # l is the line of text, a string
            all_words = word_tokenize(l)
            # all words is a list of strings, each string a word from the line
            
            for i in all_words:
                i = i.lower()
                #print(i)
                if i not in stop_words:
                    #print(i)
                    if i.isalnum():
                        #print(i)
                        lexicon.append(i)
    
    lexicon = [lemmitizer.lemmatize(i) for i in lexicon]

    #print(lexicon)
    w_counts = Counter(lexicon)
    
    l2 = dict(w_counts)
    #l2 = {}
    #for w in w_counts:
    #    l2[w] = w_counts[w]
        
    l2 = pd.Series(l2)

    l2.sort_values(inplace=True, ascending=False)
    
    name = remove_file_extension(fileIn, ".txt")
    
    l2.rename(name, inplace=True)
    
    return l2.head(n)

def main():
    print("What I want to print")
    scores = create_lexicon("script1.txt", 20)
    print(scores)

if __name__ == "__main__":
    main()

What I want to print
food       221
animal      31
many        26
may         24
culture     23
price       21
include     20
cooking     19
taste       18
meat        18
plant       18
world       18
type        18
health      16
used        16
also        16
people      16
country     15
product     14
diet        14
Name: script1, dtype: int64


In [39]:
def main():
    
    script1 = create_lexicon("script1.txt", 200)
    transcript1 = create_lexicon("transcript_1.txt", 200)
    transcript2 = create_lexicon("transcript_2.txt", 200)
    transcript3 = create_lexicon("transcript_3.txt", 200)

    
    
    scores = pd.DataFrame()
    
    scores['script1'] = script1
    scores['transcript1'] = transcript1
    scores['transcript2'] = transcript2
    scores['transcript3'] = transcript3
    scores.sort_values('transcript3', inplace=True, ascending=False)
    print(scores)

if __name__ == "__main__":
    main()

              script1  transcript1  transcript2  transcript3
food              221        122.0         39.0         68.0
cooking            19          NaN          NaN         59.0
may                24          6.0         12.0         19.0
meat               18         10.0          NaN         19.0
vitamin             6          NaN          NaN         18.0
fat                10          NaN          NaN         16.0
also               16         10.0         12.0         15.0
vegetable          13          NaN          NaN         15.0
cancer              5          NaN          NaN         14.0
many               26          8.0         11.0         13.0
raw                13          NaN          NaN         13.0
water               7          NaN          NaN         13.0
sugar              11          NaN          NaN         12.0
used               16          4.0          2.0         12.0
ingredient          8          4.0          NaN         11.0
...               ...   

In [49]:
def main():
    
    script1 = create_lexicon("heading-script1.txt", 200)
    transcript1 = create_lexicon("heading-transcript_1.txt", 200)
    transcript2 = create_lexicon("heading-transcript_2.txt", 200)
    transcript3 = create_lexicon("heading-transcript_3.txt", 200)
        
    frames = [pd.DataFrame(script1), pd.DataFrame(transcript1), pd.DataFrame(transcript2), pd.DataFrame(transcript3)]
    
    scores = pd.concat(frames)
    

    scores.sort_values('heading-transcript_3', inplace=True, ascending=False)
    print(scores)
    
    """
    The most common words in the first paragraph are:
    
    heading-script1:       food-13,        right-5,   world-3
    heading-transcript1:   food-15,        fast-8,    restaurant-7
    heading-transcript2:   restaurant-10,  serve-5,   meal-5
    heading-transcript3:   cooking-6,      food-5,    cook-4
    
    """    
    

if __name__ == "__main__":
    main()

             heading-script1  heading-transcript_1  heading-transcript_2  heading-transcript_3
cooking                  NaN                   NaN                   NaN                   6.0
food                     NaN                   NaN                   NaN                   5.0
cook                     NaN                   NaN                   NaN                   4.0
heat                     NaN                   NaN                   NaN                   3.0
type                     NaN                   NaN                   NaN                   3.0
technique                NaN                   NaN                   NaN                   3.0
invention                NaN                   NaN                   NaN                   2.0
unique                   NaN                   NaN                   NaN                   2.0
ingredient               NaN                   NaN                   NaN                   2.0
preparing                NaN                   NaN

In [32]:
def remove_file_extension(string, file_extension):
    if string.endswith(file_extension):
        return string[:-len(file_extension)]
    return string

script1 = pd.Series({'1': 1,'2': 1,'3': 1,'4': 1,})
filename = "script1.txt"
name = remove_file_extension(filename, ".txt")
print(name)
script1.rename(name, inplace=True)
print(script1.name)

script1
script1
