In [1]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')

import string
import os
import itertools
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from keras.layers import Input, Embedding, Activation, Flatten, Dense , GlobalAveragePooling1D
from keras.layers import Conv1D, MaxPooling1D, Dropout
from keras.models import Model
import unicodedata
import tensorflow as tf
from keras.models import Sequential
from keras import layers
from keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn import preprocessing


print(os.listdir("C:/Users/Asus/data/"))

Using TensorFlow backend.


['english.txt', 'spanish.txt']


# Data Prepration

### Reading text data

In [2]:
all_letters = string.ascii_letters + " .,;'"
n_letters = len(all_letters)


def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
        and c in all_letters
    )

def readwords(filename):
    words = open(filename, encoding='utf-8').read().strip().split()
    return [unicodeToAscii(word) for word in words]


### preparation train & test set

In [3]:
def creat_data():
    
    global data
    
    sp_word_list=readwords('C:/Users/Asus/data/spanish.txt')
    eng_word_list=readwords('C:/Users/Asus/data/english.txt')
    label =list(itertools.repeat('english',len(eng_word_list))) + list(itertools.repeat('spanish',len(sp_word_list)))
    data = pd.DataFrame({'words': eng_word_list+ sp_word_list ,'label': label})

creat_data()

In [4]:
print(data.shape)
data.head(5)

(145588, 2)


Unnamed: 0,words,label
0,The,english
1,Project,english
2,Gutenberg,english
3,eBook,english
4,of,english


In [5]:
X=data.iloc[:,:-1]
Y=data.iloc[:,-1]

In [6]:
Y.value_counts()

spanish    127770
english     17818
Name: label, dtype: int64

#### numerization label

In [7]:
le = preprocessing.LabelEncoder()
labl= le.fit(Y)
le.classes_
Y=le.transform(Y)

### Train_Test_split

In [8]:
words=X['words'].values
words_train, words_test, y_train, y_test = train_test_split(
   words,Y, test_size=0.3, random_state=1000)

In [9]:
train_texts = [s.lower() for s in words_train]

test_texts = [s.lower() for s in words_test]

In [10]:
y_train = to_categorical(y_train)
y_test = to_categorical(y_test)

### Tokenization word into smaller units(letter)

In [11]:
tk = Tokenizer(num_words=None, char_level=True, oov_token='UNK')
tk.fit_on_texts(train_texts)

In [12]:
alphabet = "abcdefghijklmnopqrstuvwxyz0123456789,;.!?:'\"/\\|_@#$%^&*~`+-=<>()[]{}"
char_dict = {}
for i, char in enumerate(alphabet):
    char_dict[char] = i + 1

tk.word_index = char_dict.copy()
tk.word_index[tk.oov_token] = max(char_dict.values()) + 1

### Choose the classification of each letter or word

In [13]:
##############for character classification############
#train_sequences = tk.texts_to_sequences(train_texts)
#test_texts = tk.texts_to_sequences(test_texts)


##############for word classification############
train_sequences = tk.texts_to_sequences([train_texts])
test_texts = tk.texts_to_sequences([test_texts])



# Padding
train_data = pad_sequences(train_sequences, maxlen=120, padding='pre')
test_data = pad_sequences(test_texts, maxlen=120, padding='pre')

# Convert to numpy array
train_data = np.array(train_data, dtype='float32')
test_data = np.array(test_data, dtype='float32')

In [14]:
print(words_train[0:5]) 

print(train_data[0:5,108:120])

['Sevylla' 'Rey' 'portio' 'mas' 'qualesquier']
[[69. 69. 69.  5. 69. 69. 69.  5. 69. 69. 69. 69.]]


## Model Construction

#### Embedding weights

In [15]:
input_size = 120
word_size = len(tk.word_index)
embedding_dim = 69


embedding_weights = []  # (70, 69)
embedding_weights.append(np.zeros(word_size))  # (0, 69)

for char, i in tk.word_index.items():  # from index 1 to 69
    onehot = np.zeros(word_size)
    onehot[i - 1] = 1
    embedding_weights.append(onehot)

embedding_weights = np.array(embedding_weights)

In [16]:
model = Sequential()

model.add(layers.Embedding(word_size+1, embedding_dim, input_length=input_size , weights=[embedding_weights]))
model.add(layers.Conv1D(32, 3, activation='relu'))
model.add(layers.Conv1D(64, 3, activation='relu'))
model.add(layers.Conv1D(128, 3, activation='relu'))

model.add(layers.MaxPooling1D(3))

model.add(layers.Conv1D(256, 3, activation='relu'))
model.add(layers.Conv1D(256, 3, activation='relu'))

model.add(layers.GlobalMaxPooling1D())

model.add(layers.Dense(10, activation='relu'))
model.add(layers.Dense(2, activation='sigmoid'))

model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 120, 69)           4830      
_________________________________________________________________
conv1d_1 (Conv1D)            (None, 118, 32)           6656      
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 116, 64)           6208      
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 114, 128)          24704     
_________________________________________________________________
max_pooling1d_1 (MaxPooling1 (None, 38, 128)           0         
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 36, 256)           98560     
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 34, 256)          

### Training model

In [None]:
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

history = model.fit(train_data, y_train,epochs=10,verbose=1,validation_data=(test_data, y_test),batch_size=10)

loss, accuracy = model.evaluate(train_data, y_train, verbose=False)
print("Training Accuracy: {:.4f}".format(accuracy))
loss, accuracy = model.evaluate(test_data, y_test, verbose=False)
print("Testing Accuracy:  {:.4f}".format(accuracy))

In [None]:
import matplotlib.pyplot as plt
plt.style.use('ggplot')

def plot_history(history):
    acc = history.history['accuracy']
    val_acc = history.history['val_accuracy']
    loss = history.history['loss']
    val_loss = history.history['val_loss']
    x = range(1, len(acc) + 1)

    plt.figure(figsize=(12, 5))
    plt.subplot(1, 2, 1)
    plt.plot(x, acc, 'b', label='Training acc')
    plt.plot(x, val_acc, 'r', label='Validation acc')
    plt.title('Training and validation accuracy')
    plt.legend()
    plt.subplot(1, 2, 2)
    plt.plot(x, loss, 'b', label='Training loss')
    plt.plot(x, val_loss, 'r', label='Validation loss')
    plt.title('Training and validation loss')
    plt.legend()
    
plot_history(history)

## Test model on any word

In [None]:
word = input("Enter your word: ") 
print(word) 

In [None]:
def Test_word(word):
    
    word_test=tk.texts_to_sequences(word)
    word_test = pad_sequences(word_test, padding='pre', maxlen=120)
    word_test = np.array(word_test, dtype='float32')
    predict_classes=model.predict(word_test)
    predicted_classes=predict_classes.sum(axis=0)
    if predicted_classes[0]>predicted_classes[1]:
        print(word,'is a english word')
    else:
        print(word,'is a spanish word')
    

In [None]:
x=['wash', 'ola', 'portio']
for w in x:
    Test_word(w)