In [119]:
import pandas as pd
import pandas, sys
import numpy as np
import re
import sklearn
import spacy
import nltk
from nltk import sent_tokenize, word_tokenize
nltk.download('punkt')
import warnings
warnings.filterwarnings("ignore")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Carmen\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Concatenating Data

In [322]:
df1 = pd.read_csv("lang_doc.csv")
df2= pd.read_csv("lang_doc2.csv")

lang_df = pd.concat([df1, df2],  ignore_index=True)

In [323]:
lang_df.head()

Unnamed: 0,Spanish_text,English_text
0,Esta tesis quiere profundizar sobre el caso de...,This thesis wants to delve into the case of th...
1,La violencia en sus diversas variantes está pr...,Violence in its several variations is present ...
2,"Esta tesis doctoral es un ensayo clínico que, ...",This doctoral thesis is a clinical trial that ...
3,La frustración es un fenómeno que puede mitiga...,Frustration can be mitigated if there has been...
4,Las políticas lingüísticas europeas promueven ...,European language policies promote the trainin...


### More Cleaning + tokenizing sentences

In [324]:
lang_df[['Spanish_text', 'English_text']] = lang_df[['Spanish_text', 'English_text']].replace(r"\(.*\)","")

In [325]:
# remove any text that contains 'Español' in English_text variable
lang_df['English_text'] = lang_df['English_text'].apply(lambda x: 0 if 'Español' in x else x)

# remove rows with 0
lang_df = lang_df[lang_df['English_text'] != 0]

In [326]:
# storing each sentence in a list using sent_tokenize()
lang_df["tokenized_spanish"] = lang_df["Spanish_text"].apply(sent_tokenize)
lang_df["tokenized_english"] = lang_df["English_text"].apply(sent_tokenize)

In [327]:
lang_df.head()

Unnamed: 0,Spanish_text,English_text,tokenized_spanish,tokenized_english
0,Esta tesis quiere profundizar sobre el caso de...,This thesis wants to delve into the case of th...,[Esta tesis quiere profundizar sobre el caso d...,[This thesis wants to delve into the case of t...
1,La violencia en sus diversas variantes está pr...,Violence in its several variations is present ...,[La violencia en sus diversas variantes está p...,[Violence in its several variations is present...
2,"Esta tesis doctoral es un ensayo clínico que, ...",This doctoral thesis is a clinical trial that ...,"[Esta tesis doctoral es un ensayo clínico que,...",[This doctoral thesis is a clinical trial that...
3,La frustración es un fenómeno que puede mitiga...,Frustration can be mitigated if there has been...,[La frustración es un fenómeno que puede mitig...,[Frustration can be mitigated if there has bee...
4,Las políticas lingüísticas europeas promueven ...,European language policies promote the trainin...,[Las políticas lingüísticas europeas promueven...,[European language policies promote the traini...


In [328]:
# create variables with len in spanish and english
lang_df['len_token_spanish'] = lang_df['tokenized_spanish'].apply(lambda x: len(x))
lang_df['len_token_english'] = lang_df['tokenized_english'].apply(lambda x: len(x))

# dropped unequal sentence lengths
lang_df = lang_df[lang_df['len_token_spanish'] == lang_df['len_token_english']]

In [329]:
# drop irrelevent variables
lang_df.drop(['Spanish_text', 'English_text', 'len_token_spanish', 'len_token_english'], axis=1, inplace=True)

In [330]:
lang_df.head()

Unnamed: 0,tokenized_spanish,tokenized_english
1,[La violencia en sus diversas variantes está p...,[Violence in its several variations is present...
2,"[Esta tesis doctoral es un ensayo clínico que,...",[This doctoral thesis is a clinical trial that...
5,[Los pabellones de conveniencia han tenido un ...,[Flags of convenience have had a deep impact o...
9,[Los seres humanos y los animales experimentan...,[Humans and animals undergo morphological deve...
12,[La irrupción de los sistemas de derechos huma...,[The irruption of human rights systems at a co...


### 'exploding' the lists of sentences into own rows

In [331]:
# make sentences in the lists it's own row
lang_df2 =lang_df.apply(pd.Series.explode).reset_index()
lang_df2.drop('index', axis=1, inplace=True)

# renaming variables
lang_df2.rename(columns = {'tokenized_spanish' : 'Spanish', 'tokenized_english' : 'English'}, inplace = True)

In [336]:
lang_df2.head()

Unnamed: 0,Spanish,English
0,La violencia en sus diversas variantes está pr...,Violence in its several variations is present ...
1,Este fenómeno impacta a un alto por ciento de ...,This phenomenon impacts a high percentage of c...
2,La globalización y la continua búsqueda de las...,Globalization and the continued quest of natio...
3,"Asimismo, son estas nuevas tecnologías las que...",It is also these new technologies that can fos...
4,La educación está en constante transmutación c...,Education is constantly changing because of th...


In [335]:
len(lang_df2)

373

### Example Sentences

In [337]:
lang_df2['Spanish'][6]

'Reconociendo la aventajada postura que exhibe el maestro en los escenarios escolares para la transformación de la vida de sus alumnos y la prevención de los actos de agresión y violencia pretendemos establecer la posible relación entre la violencia escolar y el nivel de capacitación que tiene el futuro profesorado para enfrentar situaciones de violencias en los ambientes escolares.'

In [338]:
lang_df2['English'][6]

'Recognizing the valuable attitude that teacher exhibits in school settings for the transformation of the lives of their students and the prevention of hostility acts and violence and the level of training that the future teachers have to face situations of violence in school environments.'