# Understanding the Data
The dataset acquired from [Kaggle](https://www.kaggle.com/code/martinkk5575/language-detection/data) contains words from several different languages. The noise contained in the dataset are duplicate words. To reduce this noise, the words will be broken down into single and double characters, then rated based on how often they show up in that respective language.

In [2]:
import pandas as pd
import numpy as np

# Import data*
fileName = "dataset.csv"
data = pd.read_csv(fileName)

data

Unnamed: 0,Text,language
0,klement gottwaldi surnukeha palsameeriti ning ...,Estonian
1,sebes joseph pereira thomas på eng the jesuit...,Swedish
2,ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...,Thai
3,விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...,Tamil
4,de spons behoort tot het geslacht haliclona en...,Dutch
...,...,...
21995,hors du terrain les années et sont des année...,French
21996,ใน พศ หลักจากที่เสด็จประพาสแหลมมลายู ชวา อินเ...,Thai
21997,con motivo de la celebración del septuagésimoq...,Spanish
21998,年月，當時還只有歲的她在美國出道，以mai-k名義推出首張英文《baby i like》，由...,Chinese


# Splitting the Data


In [3]:
from sklearn.model_selection import train_test_split

X=data['Text'] # Feature matrix
y=data['language'] # Label

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Save the languages into a DataFrame that we aren't modifying
languages = set(y)

In [4]:
import pandas as pd
cow = pd.DataFrame()
print(type(cow))
print(type(cow)==pd.DataFrame)

<class 'pandas.core.frame.DataFrame'>
True


# Creating Functions to Feature Engineer Our Data

In [5]:
def test_function(dataframe, chars):
    new_arr = np.zeros((1, len(chars)))
    j=0
    for char in chars:
        count = 0.0
        for letter in dataframe:
            if letter == char:
                count = count + 1.0
            fraction = count/len(dataframe)
        new_arr[0,j] = fraction
        j = j+1
    data_frame = pd.DataFrame(new_arr, columns = chars)
    return data_frame

chars_2 = ['e', 't', 'ä', 'ö', 'a', 'n', 'ก', 'ข', 'ค', 'ฅ', 'ฆ', 'ง', 'அ', 'ஆ', 'இ', 'ஈ', 'உ', 'ஊ', 
         'o', 'r', 'ー', '日', 'あ', 'ぁ', 'ぇ', 'ç', 'ğ', 'ı', 'İ', 'î', 'ö', 'ş', 'i', 'u', 'چ', 'ح', 'خ', 'ش',
         'â', 'ù', 'è', 's', 'î', 'ë', '胡', '童', '。', 'ᄁ', '알', '에', 'ᄃ', 'ऺ', 'त', 'ऻ', 'क', 'á', 'é', 'í', 
         'ó', 'ږ','ک', 'ﻑ', 'ی', 'م', 'ث', 'ţ', 'ă', 'ș', 'ş', 'б', 'в', 'г', 'д', 'ص', 'ف', 'ج', 'ر']

In [6]:
def feature_engineering_trial(dataframe, chars):
    if (type(type(dataframe)==str)):
        new_arr = np.zeros((1, len(chars)))
        j=0
        for char in chars:
            count = 0.0
            for letter in dataframe:
                if letter == char:
                    count = count + 1.0
                fraction = count/len(dataframe)
            new_arr[0,j] = fraction
            j = j+1
        data_frame = pd.DataFrame(new_arr, columns = chars)
        return data_frame
        
    if (type(type(dataframe)==pd.DataFrame)):
        arr = dataframe.to_numpy()
        new_arr = np.zeros((len(arr), len(chars)))
        i=0
        j=0
        for text in arr:
            sentence = text
            count = 0.0
            j=0
            for char in chars:
                count = 0.0
                for letter in sentence:
                    if letter == char:
                        count = count + 1.0
                    fraction = count/len(sentence)
                new_arr[i,j] = fraction
                j = j + 1
        
            i = i + 1
        data_frame = pd.DataFrame(new_arr, columns = chars)      
        return data_frame

In [7]:
def feature_engineering(dataframe, chars):
    arr = dataframe.to_numpy()
    new_arr = np.zeros((len(arr), len(chars)))
    i=0
    j=0
    for text in arr:
        sentence = text
        count = 0.0
        j=0
        for char in chars:
            count = 0.0
            for letter in sentence:
                if letter == char:
                    count = count + 1.0
                fraction = count/len(sentence)
            new_arr[i,j] = fraction
            j = j + 1
        
        i = i + 1
    data_frame = pd.DataFrame(new_arr, columns = chars)      
    return data_frame

In [8]:
def feature_engineering_2(dataframe, chars):
    arr = dataframe.to_numpy()
    new_arr = np.zeros((len(arr), len(chars)))
    i=0
    j=0
    for text in arr:
        sentence = text
        count = 0.0
        j = 0
        for list in chars:
            count = 0.0
            for char in list:
                for letter in sentence:
                    if letter == char:
                        count = count + 1.0
            fraction = count/len(sentence)
            new_arr[i,j] = fraction
            j = j+1
        i = i+1
    
    names = ['english', 'estonian', 'swedish', 'thai', 'tamil', 'dutch', 'japanese', 'turkish', 'latin', 'urdu',
             'indonesian', 'portuguese', 'french', 'chinese', 'korean', 'hindi', 'spanish', 'pushto', 'persian',
             'romanian', 'russian', 'arabic']
    
    data_frame = pd.DataFrame(new_arr, columns = names)
    return data_frame
                

# Testing Our Functions

In [9]:
new_chars = [['e', 't', 'a', 'i', 'o', 'n', 's', 'h', 'r'], ['a', 'e', 'i', 'ä', 'ö', 'õ', 'š', 'ü', 'ž'], 
             ['å', 'ä', 'ö', 'a', 'e', 't', 'n', 'r', 's', 'i'], ['ก', 'ข', 'ค', 'ฅ', 'ฆ', 'ง', 'จ', 'ฉ', 'ช', 'ฌ'],
             ['அ', 'ஆ', 'இ', 'ஈ', 'உ', 'ஊ', 'எ', 'ஏ', 'ஐ', 'ஒ'], ['a', 'e', 'i', 'o', 'h', 'n', 'r', 't', 's'], 
             ['㍿', '㍐', 'ヿ', 'ヾ', 'ヽ', 'ー', '・', 'ヺ', 'ヹ', 'ヸ'], ['ç', 'ğ', 'ı', 'İ', 'î', 'ö', 'ş', 'ü', 'a', 'e'],
             ['a', 'e', 'i', 'n', 'r', 's', 't', 'u', 'm', 'd'], ['چ', 'ح', 'خ', 'ش', 'ن', 'ٹ', 'ن', 'ث', 'گ', 'ج'],
             ['a', 'A', 'i', 'n', 'r', 'm', 's', 't', 'u', 'g'], ['â', 'ê', 'ô', 'ã', 'õ', 'à', 'è', 'ì', 'ò', 'ù'],
             ['ô', 'û', 'à', 'è', 'ì', 'ò', 'ù', 'ë', 'ï', 'ü'], ['主', '人', '公', '阿', '米', '尔', '一', '样', '都', '是'],
             ['응','의','이','익','인','일','임','입','잉','잎'], ['ः', 'ऺ', 'ऻ', 'ा', 'ि', 'ी', 'ॎ', 'ई', 'उ', 'ऊ'], 
             ['á', 'é', 'í', 'ó', 'ú', 'ñ', 'ü', 't', 'e', 'i'], ['ت', 'ا', 'ې', 'ښ', 'ن', 'ر', 'ع', 'ط', 'ړ', 'س'],
             ['ق', ' غ', 'ج', 'ت', ' ن ', 'ی', 'ل ', 'ظ', 'ص', 'ز'], ['ă', 'â', 'î', 'ș', 'ş', 'ț', 'ţ'], 
             ['б', 'в', 'г', 'д', 'ж', 'з', 'к', 'л', 'м', 'н'], ['م', 'ص', 'ظ', 'و', 'ر', 'م', 'ي', 'ج', 'ز', 'ق']]

panda = X_train.head()
some_data = feature_engineering_2(panda, new_chars)
some_data
    

Unnamed: 0,english,estonian,swedish,thai,tamil,dutch,japanese,turkish,latin,urdu,...,french,chinese,korean,hindi,spanish,pushto,persian,romanian,russian,arabic
0,0.0,0.0,0.0,0.118919,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.157277,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.523622,0.26378,0.523622,0.0,0.0,0.523622,0.0,0.192913,0.598425,0.0,...,0.0,0.0,0.0,0.0,0.208661,0.0,0.0,0.0,0.0,0.0
3,0.038062,0.013841,0.031142,0.0,0.034602,0.038062,0.0,0.00692,0.031142,0.0,...,0.0,0.0,0.0,0.0,0.013841,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.264471,0.0


# 1st Model: RandomForrestClassifier

This model is trained on a list of 44 characters and the first 3000 rows of our X_train data. Our performance metric is accuracy score.

In [None]:
X_train_ready = feature_engineering(X_train.head(9000), chars_2)
y_train_ready = y_train.head(9000)
X_test_ready = feature_engineering(X_test, chars_2)

In [12]:
#First Model Trained
#characters for first model
chars_2 = ['e', 't', 'ä', 'ö', 'a', 'n', 'ก', 'ข', 'ค', 'ฅ', 'ฆ', 'ง', 'அ', 'ஆ', 'இ', 'ஈ', 'உ', 'ஊ', 
         'o', 'r', 'ー', '日', 'あ', 'ぁ', 'ぇ', 'ç', 'ğ', 'ı', 'İ', 'î', 'ö', 'ş', 'i', 'u', 'چ', 'ح', 'خ', 'ش',
         'â', 'ù', 'è', 's', 'î', 'ë', '胡', '童', '。', 'ᄁ', '알', '에', 'ᄃ', 'ऺ', 'त', 'ऻ', 'क', 'á', 'é', 'í', 
         'ó', 'ږ','ک', 'ﻑ', 'ی', 'م', 'ث', 'ţ', 'ă', 'ș', 'ş', 'б', 'в', 'г', 'д', 'ص', 'ف', 'ج', 'ر']

X_train_2 = X_train.head(3000)
model_one = feature_engineering(X_train_2, chars_2)
model_one

y_train_one = y_train.head(3000)

from sklearn.ensemble import RandomForestClassifier
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, random_state=42)
rnd_clf.fit(model_one, y_train_one)

X_test_1 = feature_engineering(X_test, chars_2)

y_preds = rnd_clf.predict(X_test_1)

from sklearn.metrics import accuracy_score
acc_score = accuracy_score(y_test, y_preds)

print('Accuracy=%s' % (acc_score))

import pickle #ask about pickle

saved_model = pickle.dumps(rnd_clf)

rdf_from_pickle = pickle.loads(saved_model)

Accuracy=0.8822727272727273


Same model, but training on the whole X_train dataset

In [15]:
#First Model Trained
#characters for first model
chars_2 = ['e', 't', 'ä', 'ö', 'a', 'n', 'ก', 'ข', 'ค', 'ฅ', 'ฆ', 'ง', 'அ', 'ஆ', 'இ', 'ஈ', 'உ', 'ஊ', 
         'o', 'r', 'ー', '日', 'あ', 'ぁ', 'ぇ', 'ç', 'ğ', 'ı', 'İ', 'î', 'ö', 'ş', 'i', 'u', 'چ', 'ح', 'خ', 'ش',
         'â', 'ù', 'è', 's', 'î', 'ë', '胡', '童', '。', 'ᄁ', '알', '에', 'ᄃ', 'ऺ', 'त', 'ऻ', 'क', 'á', 'é', 'í', 
         'ó', 'ږ','ک', 'ﻑ', 'ی', 'م', 'ث', 'ţ', 'ă', 'ș', 'ş', 'б', 'в', 'г', 'д', 'ص', 'ف', 'ج', 'ر']

X_train_ready = feature_engineering(X_train, chars_2)

from sklearn.ensemble import RandomForestClassifier
rnd_clf_full = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, random_state=42)
rnd_clf_full.fit(X_train_ready, y_train)

X_test_ready = feature_engineering(X_test, chars_2)

y_predictions = rnd_clf_full.predict(X_test_ready)

from sklearn.metrics import accuracy_score
acc_score = accuracy_score(y_test, y_predictions)

print('Accuracy=%s' % (acc_score))

Accuracy=0.8809090909090909


# Predicting the language of a string

Using the predict method to send a string and predict what language it is

In [13]:
string = "Hej, jag heter Jesse Byler. Den här meningen är på engelska, så förhoppningsvis klassas den som engelska. Det skulle vara trevligt, eller hur?"
hello = test_function(string, chars_2)
pred_swed = rnd_clf.predict(hello)
pred_swed

array(['Swedish'], dtype=object)

# Second Model: RandomForrestClassifier
BUT: we will send a list of lists into our feature_engeering function instead of a list of characters! We will train the model on the first 6,000 rows of the X_train data

In [7]:
X_train_2 = X_train.head(6000)
X_train_2_eng = feature_engineering_2(X_train_2, new_chars)

y_train_2 = y_train.head(6000)

from sklearn.ensemble import RandomForestClassifier

rnd_clf_2 = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, random_state=42)
rnd_clf_2.fit(X_train_2_eng, y_train_2)

X_test_2 = feature_engineering_2(X_test, new_chars)
y_preds_2 = rnd_clf_2.predict(X_test_2)

from sklearn.metrics import accuracy_score

acc_score_2 = accuracy_score(y_test, y_preds_2)

print('Accuracy=%s' % (acc_score_2))



Accuracy=0.8252272727272727


# Third Model: Decision Tree Model
Now I will train a decision tree model, later a grid search will be conducted

In [14]:
X_train.size

17600

In [16]:
from sklearn.tree import DecisionTreeClassifier

tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(feature_engineering(X_train.head(9000), chars_2), y_train.head(9000))
preds = tree_clf.predict(feature_engineering(X_test, chars_2))

acc_score_tree = accuracy_score(y_test, preds)
print('Accuracy=%s' % (acc_score_tree))

Accuracy=0.8811363636363636


# Grid Searches for Decision Tree Hyperparameters

In [19]:
from sklearn.model_selection import GridSearchCV

param_grid = {'max_depth': [2,4,6,8,10,20,30,40,50], 'max_leaf_nodes': [2,4,6,8,10,20,30,40,50], 
                          'min_samples_split': [2,4,6,8,10,20,30,40,50]}
grid_search_cv = GridSearchCV(DecisionTreeClassifier(random_state=42),
                              param_grid,
                              verbose=1,
                              cv=3)
grid_search_cv.fit(feature_engineering(X_train.head(9000), chars_2), y_train.head(9000))

print("The best parameters are: ", grid_search_cv.best_params_)

Fitting 3 folds for each of 729 candidates, totalling 2187 fits
The best parameters are:  {'max_depth': 20, 'max_leaf_nodes': 50, 'min_samples_split': 40}


In [20]:
param_grid = {'max_depth': [15,17,19,20,21,23,25], 'max_leaf_nodes': [40,45,50,55,60,70,80], 
                          'min_samples_split': [35,37,39,40,41,43,45]}
grid_search_cv = GridSearchCV(DecisionTreeClassifier(random_state=42),
                              param_grid,
                              verbose=1,
                              cv=3)
grid_search_cv.fit(feature_engineering(X_train.head(9000), chars_2), y_train.head(9000))

print("The best parameters are: ", grid_search_cv.best_params_)

Fitting 3 folds for each of 343 candidates, totalling 1029 fits
The best parameters are:  {'max_depth': 21, 'max_leaf_nodes': 70, 'min_samples_split': 39}


In [21]:
param_grid = {'max_depth': [20,21,22], 'max_leaf_nodes': [65,67,69,70,71,73,75], 
                          'min_samples_split': [38,39,40]}
grid_search_cv = GridSearchCV(DecisionTreeClassifier(random_state=42),
                              param_grid,
                              verbose=1,
                              cv=3)
grid_search_cv.fit(feature_engineering(X_train.head(9000), chars_2), y_train.head(9000))

print("The best parameters are: ", grid_search_cv.best_params_)

Fitting 3 folds for each of 63 candidates, totalling 189 fits
The best parameters are:  {'max_depth': 22, 'max_leaf_nodes': 75, 'min_samples_split': 38}


In [22]:
param_grid = {'max_depth': [20,21,22,23,24], 'max_leaf_nodes': [73,74,75,76,77,78,79], 
                          'min_samples_split': [36,37,38,39,40]}
grid_search_cv = GridSearchCV(DecisionTreeClassifier(random_state=42),
                              param_grid,
                              verbose=1,
                              cv=3)
grid_search_cv.fit(feature_engineering(X_train.head(9000), chars_2), y_train.head(9000))

print("The best parameters are: ", grid_search_cv.best_params_)

Fitting 3 folds for each of 175 candidates, totalling 525 fits
The best parameters are:  {'max_depth': 22, 'max_leaf_nodes': 75, 'min_samples_split': 36}


# Fourth Model: Decision Tree with grid searched parameters

In [23]:
from sklearn.tree import DecisionTreeClassifier

tree_clf_opt = DecisionTreeClassifier(max_depth=22, max_leaf_nodes=75,
                                      min_samples_split=36, random_state=42)
tree_clf_opt.fit(feature_engineering(X_train.head(9000), chars_2), y_train.head(9000))
opt_preds = tree_clf_opt.predict(feature_engineering(X_test, chars_2))

acc_score_tree_opt = accuracy_score(y_test, opt_preds)
print('Accuracy=%s' % (acc_score_tree_opt))

Accuracy=0.8934090909090909


With the optimal hyperparameters, the accuracy score went up 1.23%

# Grid Searches for Random Forest Hyperparameters

In [32]:
from sklearn.model_selection import GridSearchCV

param_grid_2 = {'max_depth': [1,2,3,4,5,10,20,30,40,50], 'n_estimators': [10,100,200,300,400,500],
              'min_samples_split': [5,10,20,30,40,50]}

grid_search_cv_2 = GridSearchCV(RandomForestClassifier(random_state=42),
                              param_grid_2,
                              verbose=1,
                              cv=3)

grid_search_cv_2.fit(X_train_ready, y_train_ready)

print("The best parameters are: ", grid_search_cv_2.best_params_)

Fitting 3 folds for each of 360 candidates, totalling 1080 fits
The best parameters are:  {'max_depth': 40, 'min_samples_split': 10, 'n_estimators': 500}


In [33]:
param_grid_2 = {'max_depth': [35,37,39,40,41,43,45], 'n_estimators': [500,600,700],
              'min_samples_split': [5,7,9,10,11,13,15]}

grid_search_cv_2 = GridSearchCV(RandomForestClassifier(random_state=42),
                              param_grid_2,
                              verbose=1,
                              cv=3)

grid_search_cv_2.fit(X_train_ready, y_train_ready)

print("The best parameters are: ", grid_search_cv_2.best_params_)

Fitting 3 folds for each of 147 candidates, totalling 441 fits
The best parameters are:  {'max_depth': 45, 'min_samples_split': 7, 'n_estimators': 700}


In [34]:
param_grid_2 = {'max_depth': [40,42,44,45,46,48,50], 'n_estimators': [160,600,700,800],
              'min_samples_split': [5,6,7,8,9]}

grid_search_cv_2 = GridSearchCV(RandomForestClassifier(random_state=42),
                              param_grid_2,
                              verbose=1,
                              cv=3)

grid_search_cv_2.fit(X_train_ready, y_train_ready)

print("The best parameters are: ", grid_search_cv_2.best_params_)

Fitting 3 folds for each of 140 candidates, totalling 420 fits
The best parameters are:  {'max_depth': 45, 'min_samples_split': 7, 'n_estimators': 700}


In [35]:
param_grid_2 = {'max_depth': [45], 'n_estimators': [650,675,700,725,750],
              'min_samples_split': [7]}

grid_search_cv_2 = GridSearchCV(RandomForestClassifier(random_state=42),
                              param_grid_2,
                              verbose=1,
                              cv=3)

grid_search_cv_2.fit(X_train_ready, y_train_ready)

print("The best parameters are: ", grid_search_cv_2.best_params_)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
The best parameters are:  {'max_depth': 45, 'min_samples_split': 7, 'n_estimators': 700}


In [36]:
param_grid_2 = {'max_depth': [45], 'n_estimators': [685,695,700,705,715],
              'min_samples_split': [7]}

grid_search_cv_2 = GridSearchCV(RandomForestClassifier(random_state=42),
                              param_grid_2,
                              verbose=1,
                              cv=3)

grid_search_cv_2.fit(X_train_ready, y_train_ready)

print("The best parameters are: ", grid_search_cv_2.best_params_)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
The best parameters are:  {'max_depth': 45, 'min_samples_split': 7, 'n_estimators': 715}


In [None]:
param_grid_2 = {'max_depth': [45], 'n_estimators': [710,714,715,716,720],
              'min_samples_split': [7]}

grid_search_cv_2 = GridSearchCV(RandomForestClassifier(random_state=42),
                              param_grid_2,
                              verbose=1,
                              cv=3)

grid_search_cv_2.fit(X_train_ready, y_train_ready)

print("The best parameters are: ", grid_search_cv_2.best_params_)

In [38]:
param_grid_2 = {'max_depth': [45], 'n_estimators': [715,716,717,718],
              'min_samples_split': [7]}

grid_search_cv_2 = GridSearchCV(RandomForestClassifier(random_state=42),
                              param_grid_2,
                              verbose=1,
                              cv=3)

grid_search_cv_2.fit(X_train_ready, y_train_ready)

print("The best parameters are: ", grid_search_cv_2.best_params_)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
The best parameters are:  {'max_depth': 45, 'min_samples_split': 7, 'n_estimators': 716}


In [40]:
param_grid_2 = {'max_depth': [45], 'n_estimators': [716],
              'min_samples_split': [7], 'max_leaf_nodes': [10,20,30,40,50]}

grid_search_cv_2 = GridSearchCV(RandomForestClassifier(random_state=42),
                              param_grid_2,
                              verbose=1,
                              cv=3)

grid_search_cv_2.fit(X_train_ready, y_train_ready)

print("The best parameters are: ", grid_search_cv_2.best_params_)

Fitting 3 folds for each of 5 candidates, totalling 15 fits
The best parameters are:  {'max_depth': 45, 'max_leaf_nodes': 50, 'min_samples_split': 7, 'n_estimators': 716}


# Fifth Model: Random Forrest with Optimal Parameters

In [41]:
from sklearn.ensemble import RandomForestClassifier
rnd_clf_opt = RandomForestClassifier(n_estimators=716, min_samples_split=7, max_depth=45, max_leaf_nodes=50, random_state=42)
rnd_clf_opt.fit(X_train_ready, y_train_ready)

y_preds_opt = rnd_clf_opt.predict(X_test_ready)

from sklearn.metrics import accuracy_score
acc_score = accuracy_score(y_test, y_preds_opt)

print('Accuracy=%s' % (acc_score))

Accuracy=0.9113636363636364


# Sixth Model: Naive Bayes

In [26]:
X_train_ready = feature_engineering(X_train.head(9000), chars_2)
y_train_ready = y_train.head(9000)
X_test_ready = feature_engineering(X_test, chars_2)

In [25]:
from sklearn.naive_bayes import MultinomialNB

NB_clf = MultinomialNB()
NB_clf.fit(feature_engineering(X_train.head(9000), chars_2), y_train.head(9000))
y_predict = NB_clf.predict(feature_engineering(X_test, chars_2))

acc_score_NB = accuracy_score(y_test, y_predict)
print('Accuracy=%s' % (acc_score_NB))

Accuracy=0.4152272727272727


# Creating lists of alphabets for languages

In [None]:
# Create these arrays into dictonaries
english_alpha = [a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z]
estonian_alpha = [A, B, D, E, F, G, H, I, J, K, L, M, N, O, P, R, S, Š, Z, Ž, T, U, V, Õ, Ä, Ö, Ü]
swedish_alpha = [a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, å, ä, ö]
thai_alpha = [ก, ข, ค, ฅ, ฆ, ง, จ, ฉ, ช, ฌ, ญ, ฎ, ฏ, ฐ, ฑ, ฒ, ณ, ด, ต, ถ, ท, ธ, น, บ, บ, ผ, ฝ, พ, ฟ, ภ, 
               ม, ย, ร, ล, ว, ศ, ษ, ส, ห, ฬ, อ, ฮ] 
tamil_alpha = [அ, ஆ, இ, ஈ, உ, ஊ, எ, ஏ, ஐ, ஒ, ஓ, ஔ, க, ங, ச, ஞ, ட, ண, த, ந, ன, ப, ம, ய, ர, ற, ல, ள, ழ, வ]
dutch_alpha = english_alpha
japanese_alpha = [ぁ, あ, ぃ, い, ぅ, う, ぇ, え, ぉ, お, か, が, き, ぎ, く, ぐ, け, げ, こ, ご, さ, ざ, し, じ, す, ず,
                  せ, ぜ, そ, ぞ, た, だ, ち, ぢ, っ, つ, づ, て, で, と, ど, な, に, ぬ, ね, の, は, ば, ぱ, ひ, び, ぴ,
                  ふ, ぶ, ぷ, へ, べ, れ, る, り, ら, よ, ょ, ゆ, ゅ, や, ゃ, も, め, む, み, ま, ぽ, ぼ, ほ, ぺ, ろ, ゎ,
                  わ, ゐ, ゑ, を, ん, ゔ, ゕ, ゖ,  ゚, ゛, ゜, ゝ, ゞ, ゟ, ゠, ァ, ア, サ, ゴ, コ, ゲ, ケ, グ, ク, ギ, キ,
                  ガ, カ, オ, ォ, エ, ェ, ウ, ゥ, イ, ィ, ザ, シ, ジ, ス, ズ, セ, ゼ, ソ, ゾ, タ ,ダ ,チ ,ヂ, ッ, ツ, ヅ,
                  テ, デ, ト, ホ, ペ, ベ, ヘ, プ, ブ, フ, ピ, ビ, ヒ, パ, バ, ハ, ノ, ネ, ヌ, ニ, ナ, ド, ボ, ポ, マ, ミ, 
                  ム, メ, モ, ャ, ヤ, ュ, ユ, ョ, ヨ, ラ, リ, ル, レ, ロ, ヮ, ㍿, ㍐, ヿ, ヾ, ヽ, ー, ・, ヺ, ヹ, ヸ, ヷ,
                  ヶ, ヵ, ヴ, ン, ヲ, ヱ, ヰ, ワ]
turkish_alpha = [a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, r, s, t, u, v, y, z, ç, ğ, ı, İ, î, ö, ş, ü]
latin_alpha = english_alpha
urdu_alpha = [ش,س,ژ,ز,ڑ,ر,ذ,ڈ,د,خ,ح,چ,
              ج,ث,ٹ,ت,پ,ب,آ,ا,ے,ی,ھ,ہ,و,ں,ن,م,ل,گ,ک,ق,ف,غ,ع,ظ,ط,ض,ص]
indonesian_alpha = english_alpha
portuguese_alpha = [ç, á, é, í, ó, ú, â, ê, ô, ã, õ, à, è, ì, ò, ù, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z]
french_alpha = [ç, é, â, ê, î, ô, û, à, è, ì, ò, ù, ë, ï, ü, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z]
chinese_alpha = [胡, 赛, 尼, 本, 人, 和, 小, 说, 的, 主, 人, 公, 阿, 米, 尔, 一, 样, 都, 是, 出, 生, 在, 阿, 富, 汗, 首, 都, 
                 喀, 布, 尔, 少, 年, 时, 代, 便, 离, 开, 了, 这, 个, 国, 家, 。, 胡, 赛, 尼, 直, 到, 年, 小, 说, 出, 版, 之, 
                 后, 才, 首, 次, 回, 到, 已, 经, 离, 开, 年, 的, 祖, 国, 。, 他, 在, 苏, 联, 入, 侵, 时, 离, 开, 了, 阿, 富, 
                 汗, 而, 他, 的, 很, 多, 童, 年, 好, 友, 在, 阿, 富, 汗, 生, 活, 在, 他, 们, 出, 发, 之, 前, 罗, 伯, 特, 伊,
                 达, 尔, 文, 卷, 查, 尔, 斯, 赖, 尔, 所, 著, 地, 质, 学, 原, 理, 在, 南, 美, 他, 得, 到, 第, 卷, 该, 书, 将, 
                 地, 形, 地, 貌, 解, 释, 为, 漫, 长, 历, 史, 时, 间, 渐, 进, 演, 变, 的, 的, 结, 果, 当, 他, 旅, 程, 的, 第, 
                 站, 抵, 达, 圣, 地, 亚, 哥, 佛, 得, 角, 的, 时, 候, 达, 尔, 文]
korean_alpha = [ᄁ,ᄂ,ᄃ,ᄄ,ᄅᄆᄇ,ᄈ,ᄉ,ᄊ,ᄋ,ᄌᄍ,ᄎ,ᄏ,ᄐ,ᄑᄒ,아,악,안,알,암,압,앙,앞애,액,앵야,얀,약,양,얘,어,억,
                언,얼,엄,업,엉,에,여,역,연,열,염,엽,영,예,ᄀ,여,역,연,열,염,엽,영,예,오,옥,온,올,옴,옹,와,완,왈,왕,왜,외,왼,
                요,욕,용,우,욱,운,울,움,웅,워,원,월,위,유,육,윤,율,융,윷,으,은,을,음읍,응,의,이,익,인,일,임,입,잉,잎]
hindi_alpha = [ऄ, अ, आ, इ, ई, उ, ऊ, ऋ, ऌ, ऍ, ऎ, ए, ऐ, ऑ, ऒ, ओ, औ, क, ख, ग, घ, ङ, च, छ, ज, झ, प, ऩ, न, ध, द, 
               थ, त, ण, ढ, ड, ठ, ट, ञ, फ, ब, भ, म, य, र, ऱ, ल, ळ, ऴ, व, श, ष, ४, ३, २, १, ०, ॥, ।, ॡ, ॠ, ॐ, ऽ, 
               ह, स, ५, ६, ७, ॲ, ॳ, ॴ, ॵ, ॶ, ॷ, ॹ, ॺ, ॻ, ॼ, ॾ, ॿ, ೱऀँं, ः, ऺ, ऻ, ा, ि, ी, ॎ, ॏॕैेॣॢ, ॗ]
spanish_alpha = [á, é, í, ó, ú, ñ, ü, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z]
pushto_alpha = [ﺏ ,پ ,ﺕ ,ټ ,ﺙ ,ﺝ ,چ ,ﺡ ,ﺥ ,څ ,ځ ,ﺩ ,ډ ,ﺫ ,ﺭ ,ړ ,ﺯ ,ژ ,ږ ,ﺱ ,ﺵ ,ښ ,ﺹ ,ﺽ ,ﻁ ,ﻅ ,ﻉ ,ﻍ ,ﻑ ,ﻕ ,ک ,ګ ,ﻝ ,ﻡ ,ﻥ ,ڼ, ,ﻭ ,ه ,ۀ ,ي ,ې ,ی ,ۍ ,ئ]
persian_alpha = [,ش,س,ژ,ز,ر,ذ,د,خ,ح,چ,ج,ث,ت,پ,ب,آ,ا,ص,ض,ط,ظ,ع,غ,ف,ق,ک,گ,ل,م,ن,و,ه,ی]
romanian_alpha = [ă, â, î, ș, ş, ț, ţ, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z]
russian_alpha = [б, в, г, д, ж, з, к, л, м, н, п, р, с, т, ф, х, ц, ч, ш, щ, а, е, ё, и, о, у, ы, э, ю, я, й]
arabic_alpha = [ش,س,ز,ر,ذ,د,خ,ح,ج,ث,ت,ب,ا,ء,ي,و,ه,ن,م,ل,ك,ق,ف,غ,ع,ظ,ط,ض,ص]


In [None]:
#characters for first model
chars = ['e', 't', 'ä', 'ö', 'a', 'n', 'ก', 'ข', 'ค', 'ฅ', 'ฆ', 'ง', 'அ', 'ஆ', 'இ', 'ஈ', 'உ', 'ஊ', 
         'o', 'r', 'ー', '日', 'あ', 'ぁ', 'ぇ', 'ç', 'ğ', 'ı', 'İ', 'î', 'ö', 'ş', 'i', 'u', 'چ', 'ح', 'خ', 'ش',
         'â', 'ù', 'è', 's', 'î', 'ë', '胡', '童', '。', 'ᄁ', '알', '에', 'ᄃ', 'ऺ', 'त', 'ऻ', 'क', 'á', 'é', 'í', 
         'ó', 'ږ','ک', 'ﻑ', 'ی', 'م', 'ث', 'ţ', 'ă', 'ș', 'ş', 'б', 'в', 'г', 'д', 'ص', 'ف', 'ج', 'ر']

new_chars = [['e', 't', 'a', 'i', 'o', 'n', 's', 'h', 'r'], ['a', 'e', 'i', 'ä', 'ö', 'õ', 'š', 'ü', 'ž'], 
             ['å', 'ä', 'ö', 'a', 'e', 't', 'n', 'r', 's', 'i'], ['ก', 'ข', 'ค', 'ฅ', 'ฆ', 'ง', 'จ', 'ฉ', 'ช', 'ฌ'],
             ['அ', 'ஆ', 'இ', 'ஈ', 'உ', 'ஊ', 'எ', 'ஏ', 'ஐ', 'ஒ'], ['a', 'e', 'i', 'o', 'h', 'n', 'r', 't', 's'], 
             ['㍿', '㍐', 'ヿ', 'ヾ', 'ヽ', 'ー', '・', 'ヺ', 'ヹ', 'ヸ'], ['ç', 'ğ', 'ı', 'İ', 'î', 'ö', 'ş', 'ü', 'a', 'e'],
             ['a', 'e', 'i', 'n', 'r', 's', 't', 'u', 'm', 'd'], ['چ', 'ح', 'خ', 'ش', 'ن', 'ٹ', 'ن', 'ث', 'گ', 'ج'],
             ['a', 'A', 'i', 'n', 'r', 'm', 's', 't', 'u', 'g'], ['â', 'ê', 'ô', 'ã', 'õ', 'à', 'è', 'ì', 'ò', 'ù'],
             ['ô', 'û', 'à', 'è', 'ì', 'ò', 'ù', 'ë', 'ï', 'ü'], ['主', '人', '公', '阿', '米', '尔', '一', '样', '都', '是'],
             ['응','의','이','익','인','일','임','입','잉','잎'], ['ः', 'ऺ', 'ऻ', 'ा', 'ि', 'ी', 'ॎ', 'ई', 'उ', 'ऊ'], 
             ['á', 'é', 'í', 'ó', 'ú', 'ñ', 'ü', 't', 'e', 'i'], ['ت', 'ا', 'ې', 'ښ', 'ن', 'ر', 'ع', 'ط', 'ړ', 'س'],
             ['ق', ' غ', 'ج', 'ت', ' ن ', 'ی', 'ل ', 'ظ', 'ص', 'ز'], ['ă', 'â', 'î', 'ș', 'ş', 'ț', 'ţ'], 
             ['б', 'в', 'г', 'д', 'ж', 'з', 'к', 'л', 'м', 'н'], ['م', 'ص', 'ظ', 'و', 'ر', 'م', 'ي', 'ج', 'ز', 'ق']]