### Installing Packages: 

In [1]:
%pip install pandas
%pip install numpy
%pip install pyphen
%pip install pronouncing

Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.
Note: you may need to restart the kernel to use updated packages.


### Importing packages : 

In [2]:
import pandas as pd
import numpy as np
import pronouncing 
import pyphen

### Importing datasets : 

In [3]:
# Both word lists are read with pandas' default NA-token parsing switched off.
# By default read_csv converts cells such as "null", "nan" or "NA" into a
# float NaN; here those are legitimate words, and a float cell would break the
# str.split() call made by the extraction functions further down.

# first dataframe
df1 = pd.read_csv('common_words.csv', keep_default_na=False)

# second dataframe
df2 = pd.read_csv('common_stopwords.csv', keep_default_na=False)

# collecting the frames together
frames = [df1, df2]

# stacking both datasets row-wise into a single dataframe
# (each file keeps its own row labels, so index labels repeat across the two parts)
df = pd.concat(frames)

### Important Function Creations : 

#### Phoneme extraction function :

<div class="alert alert-block alert-info">
Phoneme extraction is the process of converting written text into a sequence of phonemes, the basic units of sound in a language. It is a crucial step in text-to-speech (TTS) systems because it lets the TTS engine generate speech sounds more accurately.
</div>

In [4]:
def extract_phonemes(text):
    """Look up the pronunciation entries for every word in ``text``.

    The text is split on whitespace and each word is passed to
    ``pronouncing.phones_for_word``; every entry returned for a word is
    added to the result, keeping the order of the words.

    Parameters
    ----------
    text : str
        One or more whitespace-separated words.

    Returns
    -------
    list
        Flat list of the looked-up entries for all words. Empty when
        ``text`` contains no words.
    """
    # nested comprehension: outer loop over words, inner loop over the
    # entries the pronouncing lookup yields for that word
    return [
        phones
        for token in text.split()
        for phones in pronouncing.phones_for_word(token)
    ]

#### Syllable extraction function :

<div class="alert alert-block alert-info">
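Syllable extraction is the process of dividing a word into its syllables, the units of sound that make up a word. It is useful in text-to-speech (TTS) systems because it helps the engine generate speech sounds more accurately. The function below relies on Pyphen's hyphenation patterns, so the splits are hyphenation points, which approximate spoken syllables but do not always coincide with them.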
</div>

In [5]:
def extract_syllables(text):
    """Split every word in ``text`` into hyphenation pieces.

    A ``pyphen.Pyphen`` object for ``lang='en'`` is created on each call.
    The text is split on whitespace, each word is run through
    ``inserted`` and the result is cut at every ``"-"``.

    Parameters
    ----------
    text : str
        One or more whitespace-separated words.

    Returns
    -------
    list
        Flat list of the pieces of all words, in word order. Empty when
        ``text`` contains no words.
    """
    # pyphen initialization
    hyphenator = pyphen.Pyphen(lang='en')

    # outer loop over words, inner loop over the pieces of that word
    return [
        piece
        for token in text.split()
        for piece in hyphenator.inserted(token).split("-")
    ]

### Combined extraction function : 

In [6]:
def extract(word):
    """Run both feature extractors on ``word``.

    Parameters
    ----------
    word : str
        Text handed unchanged to ``extract_phonemes`` and
        ``extract_syllables``.

    Returns
    -------
    list
        Two-element list ``[phonemes, syllables]``: the result of
        ``extract_phonemes`` first, the result of ``extract_syllables``
        second.
    """
    # phonemes are computed first, syllables second
    return [extract_phonemes(word), extract_syllables(word)]

### Main function : 

In [7]:
# main function
if __name__ == "__main__":

    # every row of the combined dataframe as an array of cell values
    rows = df.iloc[:].values

    # one (word, phonemes, syllables) record per input row
    records = []
    for row in rows:
        # only the first cell of the row is analysed
        phonemes, syllables = extract(row[0])

        # NOTE(review): the 'Word' cell stores the whole row as a list
        # (row.tolist()), not just the bare string in row[0]
        records.append((row.tolist(), phonemes, syllables))

    # new dataframe holding the extracted features
    new_df = pd.DataFrame(records, columns=['Word', 'Phonemes', 'Syllables'])

    # writing the features out as a csv file, without the index column
    new_df.to_csv('text-preprocessed.csv', index=False)