In [20]:
import pandas as pd
import pandas, sys
import numpy as np
import re
import sklearn
import spacy
import nltk
from nltk import sent_tokenize, word_tokenize
nltk.download('punkt')
import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Carmen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Concatenating Data

In [35]:
df1 = pd.read_csv("lang_doc.csv")
df2 = pd.read_csv("lang_doc2.csv")
df3 = pd.read_csv("lang_doc3.csv")

lang_df = pd.concat([df1, df2, df3],  ignore_index=True)

In [36]:
lang_df.head()

Unnamed: 0,Spanish_text,English_text
0,Esta tesis quiere profundizar sobre el caso de...,This thesis wants to delve into the case of th...
1,La violencia en sus diversas variantes está pr...,Violence in its several variations is present ...
2,"Esta tesis doctoral es un ensayo clínico que, ...",This doctoral thesis is a clinical trial that ...
3,La frustración es un fenómeno que puede mitiga...,Frustration can be mitigated if there has been...
4,Las políticas lingüísticas europeas promueven ...,European language policies promote the trainin...


In [37]:
lang_df.shape

(122, 2)

### More Cleaning + tokenizing sentences

In [38]:
lang_df[['Spanish_text', 'English_text']] = lang_df[['Spanish_text', 'English_text']].replace(r"\(.*\)","")

In [39]:
# remove any text that contains 'Español' in English_text variable
lang_df['English_text'] = lang_df['English_text'].apply(lambda x: 0 if 'Español' in x else x)

# remove rows with 0
lang_df = lang_df[lang_df['English_text'] != 0]

In [40]:
# storing each sentence in a list using sent_tokenize()
lang_df["tokenized_spanish"] = lang_df["Spanish_text"].apply(sent_tokenize)
lang_df["tokenized_english"] = lang_df["English_text"].apply(sent_tokenize)

In [41]:
lang_df.head()

Unnamed: 0,Spanish_text,English_text,tokenized_spanish,tokenized_english
0,Esta tesis quiere profundizar sobre el caso de...,This thesis wants to delve into the case of th...,[Esta tesis quiere profundizar sobre el caso d...,[This thesis wants to delve into the case of t...
1,La violencia en sus diversas variantes está pr...,Violence in its several variations is present ...,[La violencia en sus diversas variantes está p...,[Violence in its several variations is present...
2,"Esta tesis doctoral es un ensayo clínico que, ...",This doctoral thesis is a clinical trial that ...,"[Esta tesis doctoral es un ensayo clínico que,...",[This doctoral thesis is a clinical trial that...
3,La frustración es un fenómeno que puede mitiga...,Frustration can be mitigated if there has been...,[La frustración es un fenómeno que puede mitig...,[Frustration can be mitigated if there has bee...
4,Las políticas lingüísticas europeas promueven ...,European language policies promote the trainin...,[Las políticas lingüísticas europeas promueven...,[European language policies promote the traini...


In [42]:
# create variables with len in spanish and english
lang_df['len_token_spanish'] = lang_df['tokenized_spanish'].apply(lambda x: len(x))
lang_df['len_token_english'] = lang_df['tokenized_english'].apply(lambda x: len(x))

In [43]:
# dropped unequal sentence lengths
lang_df = lang_df[lang_df['len_token_spanish'] == lang_df['len_token_english']]

In [44]:
# drop irrelevent variables
lang_df.drop(['Spanish_text', 'English_text', 'len_token_spanish', 'len_token_english'], axis=1, inplace=True)

In [45]:
lang_df.head()

Unnamed: 0,tokenized_spanish,tokenized_english
1,[La violencia en sus diversas variantes está p...,[Violence in its several variations is present...
2,"[Esta tesis doctoral es un ensayo clínico que,...",[This doctoral thesis is a clinical trial that...
5,[Los pabellones de conveniencia han tenido un ...,[Flags of convenience have had a deep impact o...
9,[Los seres humanos y los animales experimentan...,[Humans and animals undergo morphological deve...
12,[La irrupción de los sistemas de derechos huma...,[The irruption of human rights systems at a co...


### 'exploding' the lists of sentences into own rows

In [46]:
# make sentences in the lists it's own row
lang_df2 =lang_df.apply(pd.Series.explode).reset_index()
lang_df2.drop('index', axis=1, inplace=True)

# renaming variables
lang_df2.rename(columns = {'tokenized_spanish' : 'Spanish', 'tokenized_english' : 'English'}, inplace = True)

In [48]:
len(lang_df2)

505

### Example Sentences

In [49]:
lang_df2['Spanish'][355]

'Para evitar estos conflictos, se debe realizar una gestión adecuada del paisaje en los términos del Convenio Europeo del Paisaje.'

In [50]:
lang_df2['English'][355]

'To avoid these conflicts, proper landscape management must be carried out under the terms of the European Landscape Convention.'

In [51]:
lang_df2.to_csv('sent_translations.csv')