<font size="4">Import modules</font>

In [1]:
import csv
import glob
import multiprocessing
import operator
import os
from multiprocessing import Process

import nltk
import numpy as np
import pandas as pd
from nltk import ngrams
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer

<font size="4">Reading files from directory</font>

In [2]:
# Read every .txt file in the input directory into a list, one string per file.
# The directory used here is a folder on the current user's Desktop; change the path to point at your own data.

text_from_all_files = []

# This notebook later writes these two .txt files into the same folder. They must be
# skipped here, otherwise a second run would read its own output back in as input.
generated_txt_names = {'Ngrams_list.txt', 'Words_frequency.txt'}

# glob.glob() returns paths in arbitrary order, so sort them to make the combined text
# (and therefore the n-grams that span file boundaries) reproducible between runs.
all_files = sorted(
    path
    for path in glob.glob(r'C:\Users\altz7\Desktop\test_folder\*.txt')
    if os.path.basename(path) not in generated_txt_names
)
for file_path in all_files:
    # The context manager closes the file even if read() raises (e.g. on a decoding error).
    with open(file_path, encoding="utf-8") as input_file:
        text_from_all_files.append(input_file.read())

<font size="4">Preprocessing data</font>

In [3]:
# Combine the per-file texts into a single string.
# Join with a newline rather than an empty string: with '' the last word of one file
# would be fused with the first word of the next whenever a file does not end in whitespace.
final_text = '\n'.join(text_from_all_files)

In [4]:
# Replace character runs that will not be caught by the standard stopword vocabulary.
# '...' is reduced to a single full stop. '--' and '***' are replaced with a space
# (not removed outright) so that the words on either side of them are not glued
# together, e.g. "said--and" stays two words instead of becoming "saidand".
final_text = final_text.replace('...', '.').replace('--', ' ').replace('***', ' ')

In [5]:
# Collapse every run of whitespace into a single space:
# split() with no arguments cuts on any whitespace run, and the pieces are re-joined with ' '.
whitespace_separated_chunks = final_text.split()
final_text = ' '.join(whitespace_separated_chunks)

In [6]:
# Load the stopword list once and keep it in a set. The previous version evaluated
# stopwords.words('english') again for every token and searched the resulting list.
english_stopwords = set(stopwords.words('english'))

# Tokenize the combined text, then filter in a single pass. The order of the checks
# matches the original step-by-step version:
#   1. drop single-character tokens (mostly punctuation),
#   2. drop purely numeric tokens,
#   3. lowercase what is left,
#   4. drop English stopwords (compared after lowercasing).
words = [
    lowered
    for lowered in (
        token.lower()
        for token in nltk.word_tokenize(final_text)
        if len(token) > 1 and not token.isnumeric()
    )
    if lowered not in english_stopwords
]

<font size="4">Creating list of N-grams</font>

In [7]:
ngrams_list = []


def ngrams_from_text(n=5):
    """Fill the module-level ``ngrams_list`` with the n-grams of ``words``.

    Args:
        n: Number of words in each n-gram. Defaults to 5, the value that was
            previously hard-coded inside the function.

    Returns:
        None. The n-grams are stored in ``ngrams_list``.
    """
    # nltk's ngrams() accepts any sequence, so the token list is passed directly
    # instead of being joined into a string and split again.
    # Assigning through a slice keeps the same list object but replaces its
    # contents, so calling the function again does not append duplicate n-grams.
    ngrams_list[:] = ngrams(words, n)

<font size="4">Creating word frequency list with metric values</font>

In [8]:
def word_frequency():
    """Count how often each term occurs in the module-level ``words`` list.

    Every entry of ``words`` is passed to ``CountVectorizer`` as its own
    document, and the per-term counts are then summed over all entries.

    Returns:
        dict: vocabulary term -> total count. Empty when ``words`` is empty.
    """
    if not words:
        # CountVectorizer.fit_transform() raises ValueError when there is nothing
        # to build a vocabulary from; an empty result is more useful downstream.
        return {}

    cv = CountVectorizer()
    cv_fit = cv.fit_transform(words)

    # get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
    # get_feature_names_out() is its replacement. Fall back for older installs.
    if hasattr(cv, "get_feature_names_out"):
        word_list = cv.get_feature_names_out()
    else:
        word_list = cv.get_feature_names()

    # Sum the columns on the sparse matrix directly. Calling toarray() first would
    # allocate a dense len(words) x vocabulary-size array just to add it up.
    count_list = np.asarray(cv_fit.sum(axis=0)).ravel()
    return dict(zip(word_list, count_list))

<font size="4">Multiprocessing</font>

In [9]:
if __name__ == "__main__":
    # The previous version wrote Process(target=word_frequency()) and
    # Process(target=ngrams_from_text()). The parentheses call each function right
    # here in the parent process and hand its *return value* to Process as the
    # target: a dict (not callable) for p1 and None for p2. No work was ever done
    # in a child process.
    #
    # Passing the functions themselves would not help either: a child process has
    # its own memory, so n-grams appended there would never reach this kernel's
    # ngrams_list, and the return value of word_frequency() would be discarded.
    #
    # So the n-gram step is simply run in this process. The word frequencies are
    # computed by the cell that saves them, so they are not computed a second time here.
    ngrams_from_text()

<font size="4">Saving data</font>

In [10]:
# Save the n-gram list to a .txt file, one n-gram per line.
# The directory and file name can be changed to any other values.

# encoding="utf-8" matches the encoding the input files were read with. Without it
# the platform default encoding is used, which can raise UnicodeEncodeError on
# characters it cannot represent.
with open(r'C:\Users\altz7\Desktop\test_folder\Ngrams_list.txt', 'w', encoding="utf-8") as file_output:
    file_output.writelines(f"{list_item}\n" for list_item in ngrams_list)

In [11]:
# Write the n-gram list to a .csv file with a single "N-grams" column and no index column.
# The directory and file name can be changed to any other values.

ngrams_table = {"N-grams": ngrams_list}
df1 = pd.DataFrame(data=ngrams_table)
df1.to_csv(
    r'C:\Users\altz7\Desktop\test_folder\Ngrams_list.csv',
    index=False,
)

In [12]:
# Save the word frequencies in CSV format, which makes it easy to name the columns.

# Number of most frequent words to keep; change this value to list more or fewer.
# (The slice below used to be hard-coded as [:26], one more than the 25 words the
# accompanying comment promised.)
TOP_WORDS_COUNT = 25

dictionary_test = word_frequency()

# Sort by count, highest first, and keep the first TOP_WORDS_COUNT entries.
word_frequency_dict = dict(
    sorted(dictionary_test.items(), key=operator.itemgetter(1), reverse=True)[:TOP_WORDS_COUNT]
)

df = pd.DataFrame(list(word_frequency_dict.items()), columns=["word", "frequency"])

df.to_csv(r'C:\Users\altz7\Desktop\test_folder\Words_frequency.csv', index=False)

In [13]:
# In case the frequency data is also needed as plain text: convert the CSV file
# into a .txt file with the fields of each row separated by a single space.
#
# Both files are opened explicitly as UTF-8 (pandas writes CSV as UTF-8 by default)
# so the conversion does not depend on the platform's default encoding. newline=''
# is what the csv module asks for when it is given a file object to read.
# The input is opened first so that a missing CSV does not leave an empty .txt behind.
with open(r'C:\Users\altz7\Desktop\test_folder\Words_frequency.csv', encoding="utf-8", newline='') as my_input_file:
    with open(r'C:\Users\altz7\Desktop\test_folder\Words_frequency.txt', "w", encoding="utf-8") as my_output_file:
        for row in csv.reader(my_input_file):
            my_output_file.write(" ".join(row) + '\n')