In [6]:
#!/usr/bin/env python
# coding: utf-8

# In[2]:


import warnings
import pandas as pd
import numpy as np
import random
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from pandas.core.common import SettingWithCopyWarning
from pathlib import Path
# Suppress SettingWithCopyWarning for the rest of the session.
# NOTE(review): the class is imported from pandas.core.common above; newer pandas
# releases expose it under pandas.errors instead — confirm against the installed version.
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)
# Fetch the WordNet corpus into the local nltk_data directory (used by WordNetLemmatizer).
# NOTE(review): word_tokenize is used further down and needs NLTK's tokenizer models,
# which are not downloaded here — presumably already installed; verify on a fresh machine.
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /home/anthony/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

In [7]:
# Read the source CSV into a DataFrame; the path is relative to the working directory.
data = pd.read_csv(filepath_or_buffer='Downloads/MFTD_en_exploded.csv')

In [8]:
def clean_text(data, old_field, clean_field):
    """Store a normalised copy of a text column in ``clean_field``.

    The text is lower-cased, every character that is not an ASCII letter or
    ``#`` is replaced by a space, and words of one or two characters are
    removed (e.g. "I've" -> "i ve" -> both fragments dropped).

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``old_field``; it is modified in place.
    old_field : str
        Name of the column holding the raw text.
    clean_field : str
        Name of the column that receives the cleaned text.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object, with ``clean_field`` added or overwritten.
        Missing values in ``old_field`` stay missing in ``clean_field``.
    """
    # convert to lowercase
    cleaned = data[old_field].str.lower()
    # remove punctuation, numbers and special characters.
    # regex=True must be explicit: since pandas 2.0 the default is a literal
    # match, which would leave the text untouched.
    cleaned = cleaned.str.replace(r"[^a-zA-Z#]", " ", regex=True)
    # remove short words (length < 3); the vectorised form also lets missing
    # values pass through instead of failing inside re.sub
    cleaned = cleaned.str.replace(r"\b\w{1,2}\b", "", regex=True)
    data[clean_field] = cleaned
    return data


# In[9]:


# Derive the cleaned story text from the raw 'content' column.
data = clean_text(data, old_field='content', clean_field='clean_stories')

  data[clean_field] = data[clean_field].str.replace("[^a-zA-Z#]", " ")


In [10]:
# ### Variation 1: Tokenization, lemmatization, then TF-IDF (this takes longer because the text is tokenized for lemmatization and the tokens are joined back into strings for TF-IDF)

# ## Tokenization

# In[11]:


def tokenzie_words(data,field_name,new_field_name):
    """Tokenize a text column with ``word_tokenize``.

    Each entry of ``data[field_name]`` is passed to ``word_tokenize`` and the
    result is stored in ``data[new_field_name]``. ``data`` is modified in
    place and also returned.
    """
    token_lists = data[field_name].apply(word_tokenize)
    data[new_field_name] = token_lists
    return data


# In[12]:


# Tokenize the cleaned stories into the column used by variation 1.
data = tokenzie_words(data, field_name='clean_stories', new_field_name='type1_data')


# ## Lemmatization

# In[13]:


def word_lemmatizer(text):
    """Lemmatize every token in ``text`` with NLTK's ``WordNetLemmatizer``.

    Parameters
    ----------
    text : iterable of str
        Tokens to lemmatize.

    Returns
    -------
    list
        The result of ``lemmatize`` for each token, in input order.
    """
    # Build the lemmatizer once per call rather than once per token.
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in text]


# In[14]:


# Lemmatize each token list, then glue the tokens back into one
# space-separated string per row so the column can be fed to TF-IDF.
data['type1_data'] = data['type1_data'].apply(word_lemmatizer)
data['type1_data'] = data['type1_data'].apply(lambda tokens: ' '.join(str(tok) for tok in tokens))


# ### Variation 2: Not using Tokenization and Lemmatization

# In[16]:


# Variation 2 takes the cleaned text as-is, without tokenization or lemmatization.
data['type2_data'] = data['clean_stories']


def tfidf(data,field_name):
    """Replace each text in ``data[field_name]`` with a dict of its TF-IDF weights.

    A ``TfidfVectorizer`` (English stop words removed, lower-casing enabled)
    is fitted on the whole column. Every row is then overwritten, in place,
    with a ``{term: weight}`` dict holding that row's non-zero weights,
    ordered from the highest weight to the lowest.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding the text column; modified in place.
    field_name : str
        Column of strings to vectorise and overwrite.

    Returns
    -------
    None
    """
    vectorizer = TfidfVectorizer(use_idf=True, stop_words="english", lowercase=True)
    weights = vectorizer.fit_transform(data[field_name]).tocsr()

    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when available and fall back on older releases. Fetched once, not per row.
    if hasattr(vectorizer, "get_feature_names_out"):
        terms = vectorizer.get_feature_names_out()
    else:
        terms = vectorizer.get_feature_names()

    row_dicts = []
    # Walk the matrix by row *position*: the frame's index labels are not
    # guaranteed to be 0..n-1, so they must not be used to address matrix rows.
    for pos in range(weights.shape[0]):
        start, stop = weights.indptr[pos], weights.indptr[pos + 1]
        # Only the stored entries of the sparse row are read; the row is never
        # expanded to the full vocabulary.
        stored = zip(weights.indices[start:stop], weights.data[start:stop])
        ranked = sorted(
            ((col, val) for col, val in stored if val != 0),
            key=lambda pair: pair[1],
            reverse=True,
        )
        row_dicts.append({str(terms[col]): float(val) for col, val in ranked})

    # Assign the whole column at once, aligned on the frame's own index.
    data[field_name] = pd.Series(row_dicts, index=data.index, dtype=object)



# Turn both text variants into TF-IDF dicts (in place), then persist the frame.
tfidf(data, field_name='type1_data')
tfidf(data, field_name='type2_data')

data.to_pickle(path="Downloads/folktales-tfidf.pkl")

In [11]:
# Display the final frame; the rendered output elides the middle rows.
data

Unnamed: 0.1,Unnamed: 0,name,content,Translated_from.Author,Translated_from.Book Title,Translated_from.Publication Date,Translated_from.Language,ATU,Language,Origin,Author,Book Title,Publication Date,clean_stories,type1_data,type2_data
0,16,trans-1017.xml,"\nHill and vale do not come together, but the ...",Jacob & Wilhelm Grimm,Kinder- und Hausmärchen,1812.0,German,613,English,Germany,,,,hill and vale not come together but the chi...,"{'tailor': 0.6712439515140128, 'shoemaker': 0....","{'tailor': 0.6934414559899187, 'shoemaker': 0...."
1,23,trans-1026.xml,\nThere was once a countryman who had money an...,Jacob & Wilhelm Grimm,Kinder- und Hausmärchen,1812.0,German,441,English,Germany,Margaret Hunt,Grimm's Household Tales,1884.0,there was once countryman who had money and ...,"{'hedgehog': 0.6759177518728824, 'han': 0.5139...","{'hedgehog': 0.6960061818523211, 'hans': 0.511..."
2,28,trans-1036.xml,\nThere was once a mother who had a little boy...,Jacob & Wilhelm Grimm,Kinder- und Hausmärchen,1812.0,German,441,English,Germany,,Grimm's Household Tales,1884.0,there was once mother who had little boy s...,"{'shroud': 0.4750570897663063, 'mother': 0.385...","{'shroud': 0.4644993586035836, 'mother': 0.376..."
3,30,trans-104.xml,And the sister wept over her poor lost brother...,Jacob & Wilhelm Grimm,Kinder- und Hausmärchen,1812.0,German,450,English,Germany,,,,and the sister wept over her poor lost brother...,"{'fawn': 0.8002964731612497, 'soft': 0.1514392...","{'fawn': 0.7942240142108445, 'soft': 0.1502901..."
4,34,trans-1045.xml,"\nThere was once a rich man, who had a servant...",Jacob & Wilhelm Grimm,Kinder- und Hausmärchen,1812.0,German,592,English,Germany,,Grimm's Household Tales,1884.0,there was once rich man who had servant wh...,"{'jew': 0.6036372132762582, 'servant': 0.32042...","{'jew': 0.5929197604136036, 'servant': 0.37508..."
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
411,2536,trans-9938.xml,"\nOnce upon a time, a purdy little mouse sat i...",Francesc Maspons Labrós,Lo Rondallayre,1871.0,Catalan,2023,English,Spain,"Jason F. Quackenbush, Esq.",ibid,2016.0,once upon time purdy little mouse sat her...,"{'mouse': 0.4389333527500325, 'bouquet': 0.384...","{'mouse': 0.44735722402775197, 'bouquet': 0.37..."
412,2537,trans-9939.xml,"\n(Valencian version) \nLong, long ago, there ...",Fernán Caballero,,,Spanish,780,English,Spain,Amanda Cibulka,ibid,2017.0,valencian version long long ago there wa...,"{'flute': 0.5424355628457117, 'shepherd': 0.36...","{'flute': 0.5378709338430943, 'shepherd': 0.36..."
413,2538,trans-9941.xml,\nONCE in summer-time the bear and the wolf we...,Jacob & Wilhelm Grimm,Kinder- und Hausmärchen,1812.0,German,222,English,Germany,Jacob and Wilhelm Grimm,Jacob and Wilhelm Grimm. Household Tales. The ...,1909.0,once summer time the bear and the wolf were ...,"{'wren': 0.5071454109815293, 'bear': 0.3202121...","{'wren': 0.35089988495709845, 'bear': 0.338819..."
414,2540,trans-998.xml,\nThere was once a little child whose mother g...,Jacob & Wilhelm Grimm,Kinder- und Hausmärchen,1812.0,German,285,English,Germany,,Grimm's Household Tales,1884.0,there was once little child whose mother gav...,"{'snake': 0.714902966541677, 'huhu': 0.3128483...","{'snake': 0.7325109665096652, 'huhu': 0.298681..."
