In [1]:
# Import libraries
from collections import Counter
import operator
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud, STOPWORDS

import nltk
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
import re

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Dense, Flatten, TimeDistributed
from tensorflow.keras.layers import RepeatVector, Embedding, Input, LSTM
from tensorflow.keras.layers import Conv1D, MaxPool1D, Bidirectional
from tensorflow.keras.models import Model

import warnings
# Suppress every Python warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings from pandas/TensorFlow,
# so API changes will surface as hard errors instead of warnings.
warnings.filterwarnings('ignore')

import logging
# Configure the root logger so that messages at INFO level and above are emitted.
logging.basicConfig(level=logging.INFO)

In [2]:
# Download the dataset from Kaggle.
# The API token file kaggle.json is expected in the current working directory;
# it is copied into ~/.kaggle/ (presumably where the Kaggle CLI looks for it —
# confirm against the Kaggle API docs).
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json
# Fetch the dataset archive into the current working directory.
!kaggle datasets download -d harishreddy18/english-to-french

Downloading english-to-french.zip to /content
  0% 0.00/2.50M [00:00<?, ?B/s]
100% 2.50M/2.50M [00:00<00:00, 82.0MB/s]


In [3]:

# unzipping dataset
!unzip "english-to-french.zip"

Archive:  english-to-french.zip
  inflating: small_vocab_en.csv      
  inflating: small_vocab_fr.csv      


In [5]:
# Load both corpora from the current working directory, which is where the
# archive was extracted (no machine-specific absolute path).
# The sentences themselves contain commas, so a tab separator is used to keep
# each line in a single column; `names` supplies the column header.
df_english = pd.read_csv("small_vocab_en.csv", sep='\t', names=['english'])
df_french = pd.read_csv("small_vocab_fr.csv", sep='\t', names=['french'])

In [7]:
# Report (rows, columns) of each loaded frame.
print(f"Number of dataset English:  {df_english.shape}")
print(f"Number of dataset French:  {df_french.shape}")

Number of dataset English:  (137860, 1)
Number of dataset French:  (137860, 1)


In [8]:
# Put the two single-column frames side by side; rows are matched on their
# index, which is the default integer index produced by read_csv for both.
# `axis` is passed by keyword: passing it positionally was deprecated and is
# rejected by pandas 2.x.
df = pd.concat([df_english, df_french], axis=1)
df.head()

Unnamed: 0,english,french
0,"new jersey is sometimes quiet during autumn , ...",new jersey est parfois calme pendant l' automn...
1,the united states is usually chilly during jul...,les états-unis est généralement froid en juill...
2,"california is usually quiet during march , and...","california est généralement calme en mars , et..."
3,the united states is sometimes mild during jun...,"les états-unis est parfois légère en juin , et..."
4,"your least liked fruit is the grape , but my l...","votre moins aimé fruit est le raisin , mais mo..."


In [9]:
# Draw a random sample of 35,000 sentence pairs to work with.
# random_state pins the draw so the sample, the later train/test split and all
# displayed outputs are reproducible across kernel restarts.
df_sampel = df.sample(n=35000, random_state=11)

In [10]:
# Report (rows, columns) of the sampled frame.
print(f"Shape of dataset sampel: {df_sampel.shape}")

Shape of dataset sampel: (35000, 2)


In [11]:
# 1. remove punctuation function (CLEANING SECTION)
import string
def clean_string(info):
  """Return ``info`` with every character found in ``string.punctuation`` removed.

  Only ASCII punctuation is dropped; letters (including accented ones), digits
  and whitespace are kept as-is, so removing a punctuation mark that sat
  between two spaces leaves both spaces in place.

  Args:
    info: The text to clean; iterated character by character.

  Returns:
    A new string made of the characters that were kept, in original order.
  """
  kept_chars = []
  for symbol in info:
    if symbol in string.punctuation:
      continue
    kept_chars.append(symbol)
  return "".join(kept_chars)

In [12]:
# Strip punctuation from both language columns of the sample, in place.
for column in ('english', 'french'):
  df_sampel[column] = df_sampel[column].apply(clean_string)
df_sampel.head()

Unnamed: 0,english,french
101360,their least favorite fruit is the lemon but y...,leur fruit préféré est moins le citron mais v...
109017,he thinks its difficult to translate french to...,il pense quil est difficile de traduire frança...
91922,i dislike limes grapefruit and grapes,je naime pas citrons verts le pamplemousse et...
113547,china is never dry during november but it is ...,chine est jamais à sec en novembre mais il es...
86893,paris is sometimes busy during summer but it ...,paris est parfois occupé pendant l été mais i...


In [13]:
# 2. Split the sample into train and test sets
from sklearn.model_selection import train_test_split

# English sentences are the inputs (X) and French sentences the targets (y).
# Both columns are converted to plain Python lists; 20% of the pairs are held
# out for testing and the split is seeded.
X_train, X_test, y_train, y_test = train_test_split(
    df_sampel['english'].tolist(),
    df_sampel['french'].tolist(),
    test_size=0.2,
    random_state=11,
)

In [17]:
# Show one aligned English/French training pair as a sanity check.
print(f"Sampel sentence in \nEnglish: {X_train[3]} \nFrench: {y_train[3]}")

Sampel sentence in 
English: his most loved fruit is the lemon  but my most loved is the lime  
French: son fruit le plus aimé est le citron  mais mon plus aimé est la chaux 


In [18]:
# 3. Preprocessing NLP
# Characters removed by preprocess(). Written as a raw string so the backslash
# is a literal character instead of the invalid escape sequence "\,".
_PREPROCESS_PUNCTUATION = r'''!()-[];:'"\,<>./?@@$#%^&*_~'''
# Translation table that deletes every character listed above.
_PUNCTUATION_TABLE = str.maketrans('', '', _PREPROCESS_PUNCTUATION)
# Matches any character outside string.printable (i.e. anything non-ASCII);
# compiled once here rather than on every call.
_NON_PRINTABLE_RE = re.compile('[^%s]' % re.escape(string.printable))


def preprocess(sentences):
  """Normalise one sentence: strip punctuation, lowercase, drop non-ASCII.

  Steps, in order:
    1. Delete every character listed in ``_PREPROCESS_PUNCTUATION``.
    2. Split on whitespace (collapsing runs of spaces and trimming the ends).
    3. Lowercase each word.
    4. Remove from each word any character not in ``string.printable``.
    5. Re-join the words with single spaces.

  A word made up only of non-printable characters becomes an empty string and
  is still joined, so it leaves an extra space in the result.

  Args:
    sentences: A single sentence as a ``str`` (despite the plural name).

  Returns:
    The normalised sentence as a ``str``.
  """
  stripped = sentences.translate(_PUNCTUATION_TABLE)
  # Lowercase before filtering so the order of operations matches the
  # documented steps (lowercasing can itself produce non-ASCII characters).
  words = [_NON_PRINTABLE_RE.sub('', word.lower()) for word in stripped.split()]
  return " ".join(words)

In [19]:
# Three parallel lists (one entry per training pair): the English inputs, the
# French targets, and the French sequences used as inputs.
english_sentences, french_sentences, french_sentences_inputs = [], [], []

In [21]:
# Start from empty lists every time this cell runs; otherwise re-running it
# would append a second copy of every training sentence.
english_sentences = []
french_sentences = []
french_sentences_inputs = []

for english_text, output in zip(X_train, y_train):
  # English side: normalised with preprocess().
  input_sentence = preprocess(english_text)
  # French target: the sentence followed by the end-of-sequence marker.
  output_sentence = output + ' <eos>'
  # French input: the same sentence preceded by the start-of-sequence marker.
  # The space after '<sos>' keeps the marker a separate whitespace-delimited
  # token (mirroring ' <eos>') instead of fusing it with the first word.
  output_sentence_input = '<sos> ' + output

  english_sentences.append(input_sentence)
  french_sentences.append(output_sentence)
  french_sentences_inputs.append(output_sentence_input)

In [22]:
# The three lists are built in lockstep, so their lengths should be equal.
print(f"Number of English samples : {len(english_sentences)}")
print(f"Number of output samples: {len(french_sentences)}")
print(f"Number of input output samples: {len(french_sentences_inputs)}")

Number of English samples : 28000
Number of output samples: 28000
Number of input output samples: 28000


In [None]:
# Download NLTK's 'punkt' resource.
# NOTE(review): despite the name this is not a punctuation list; it is
# presumably the Punkt tokenizer data needed by the word_tokenize /
# sent_tokenize functions imported at the top of the notebook — confirm.
nltk.download('punkt')