# Understanding the Kaggle Data
The dataset acquired from [Kaggle](https://www.kaggle.com/code/martinkk5575/language-detection/data) contains words from several different languages. The noise contained in the dataset are duplicate words. To reduce this noise, the words will be broken down into single and double characters, then rated based on how often they show up in that respective language.

In [3]:
import pandas as pd
import numpy as np

# Import data*
fileName = "dataset.csv"
data = pd.read_csv(fileName)

data

Unnamed: 0,Text,language
0,klement gottwaldi surnukeha palsameeriti ning ...,Estonian
1,sebes joseph pereira thomas på eng the jesuit...,Swedish
2,ถนนเจริญกรุง อักษรโรมัน thanon charoen krung เ...,Thai
3,விசாகப்பட்டினம் தமிழ்ச்சங்கத்தை இந்துப் பத்திர...,Tamil
4,de spons behoort tot het geslacht haliclona en...,Dutch
...,...,...
21995,hors du terrain les années et sont des année...,French
21996,ใน พศ หลักจากที่เสด็จประพาสแหลมมลายู ชวา อินเ...,Thai
21997,con motivo de la celebración del septuagésimoq...,Spanish
21998,年月，當時還只有歲的她在美國出道，以mai-k名義推出首張英文《baby i like》，由...,Chinese


In [4]:
from sklearn.model_selection import train_test_split

X=data['Text'] # Feature matrix
y=data['language'] # Label

# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Save the languages into a DataFrame that we aren't modifying
language_list = set(y)

In [5]:
X_train

5207     สัมประสิทธิ์ฮอลล์ ฟิสิกส์ไฟฟ้า เกี่ยวกับสนามแม...
4450     เกิดวันที่  พฤศจิกายน ภาคอะนิเมะ คดีฆาตกรรมบนจ...
7033     i omgivningarna runt manigotagan river park re...
487      நிஞ்சா ஹட்டோரி 忍者ハットリくん ninja hattori என்பது க...
19537    эта страница деятельности м в ломоносова — ярк...
                               ...                        
11964    باباجان غفورف تاریخ‌دان و نویسندهٔ کتاب تاریخ ...
21575    en  fue invitado por fernando ii para ocupar l...
5390     doğu kanada atabasklarına geleneksel olarak dü...
860      پژواک د يوې ځانگړې پروژې په توگه د اساسي قانون...
15795    テンサイについては糖分を高度に精製する必要があることからサトウキビと同じような黒糖を作るのは...
Name: Text, Length: 17600, dtype: object

In [6]:
def feature_engineering(dataframe, chars):
    arr = dataframe.to_numpy()
    new_arr = np.zeros((len(arr), len(chars)))
    i=0
    j=0
    for text in arr:
        sentence = text
        j=0
        for char in chars:
            count = 0
            for letter in sentence:
                if letter == char:
                    count = count + 1
                fraction = count/len(sentence)
            new_arr[i,j] = fraction
            j = j + 1
        
        i = i + 1
            
    data_frame = pd.DataFrame(new_arr, columns = chars)
    return data_frame

In [None]:
def feature_engineering_2(dataframe, chars):
    arr = dataframe.to_numpy()
    new_arr = np.zeros((len(arr), len(chars)))
    i=0
    j=0
    for text in arr:
        sentence = text
        count = 0.0
        j = 0
        for list in chars:
            count = 0.0
            for char in list:
                for letter in sentence:
                    if letter == char:
                        count = count + 1.0
            fraction = count/len(sentence)
            new_arr[i,j] = fraction
            j = j+1
        i = i+1
    
    names = ['english', 'estonian', 'swedish', 'thai', 'tamil', 'dutch', 'japanese', 'turkish', 'latin', 'urdu',
             'indonesian', 'portuguese', 'french', 'chinese', 'korean', 'hindi', 'spanish', 'pushto', 'persian',
             'romanian', 'russian', 'arabic']
    
    data_frame = pd.DataFrame(new_arr, columns = names)
    return data_frame

In [None]:
new_chars = [['e', 't', 'a', 'i', 'o', 'n', 's', 'h', 'r'], ['a', 'e', 'i', 'ä', 'ö', 'õ', 'š', 'ü', 'ž'], 
             ['å', 'ä', 'ö', 'a', 'e', 't', 'n', 'r', 's', 'i'], ['ก', 'ข', 'ค', 'ฅ', 'ฆ', 'ง', 'จ', 'ฉ', 'ช', 'ฌ'],
             ['அ', 'ஆ', 'இ', 'ஈ', 'உ', 'ஊ', 'எ', 'ஏ', 'ஐ', 'ஒ'], ['a', 'e', 'i', 'o', 'h', 'n', 'r', 't', 's'], 
             ['㍿', '㍐', 'ヿ', 'ヾ', 'ヽ', 'ー', '・', 'ヺ', 'ヹ', 'ヸ'], ['ç', 'ğ', 'ı', 'İ', 'î', 'ö', 'ş', 'ü', 'a', 'e'],
             ['a', 'e', 'i', 'n', 'r', 's', 't', 'u', 'm', 'd'], ['چ', 'ح', 'خ', 'ش', 'ن', 'ٹ', 'ن', 'ث', 'گ', 'ج'],
             ['a', 'A', 'i', 'n', 'r', 'm', 's', 't', 'u', 'g'], ['â', 'ê', 'ô', 'ã', 'õ', 'à', 'è', 'ì', 'ò', 'ù'],
             ['ô', 'û', 'à', 'è', 'ì', 'ò', 'ù', 'ë', 'ï', 'ü'], ['主', '人', '公', '阿', '米', '尔', '一', '样', '都', '是'],
             ['응','의','이','익','인','일','임','입','잉','잎'], ['ः', 'ऺ', 'ऻ', 'ा', 'ि', 'ी', 'ॎ', 'ई', 'उ', 'ऊ'], 
             ['á', 'é', 'í', 'ó', 'ú', 'ñ', 'ü', 't', 'e', 'i'], ['ت', 'ا', 'ې', 'ښ', 'ن', 'ر', 'ع', 'ط', 'ړ', 'س'],
             ['ق', ' غ', 'ج', 'ت', ' ن ', 'ی', 'ل ', 'ظ', 'ص', 'ز'], ['ă', 'â', 'î', 'ș', 'ş', 'ț', 'ţ'], 
             ['б', 'в', 'г', 'д', 'ж', 'з', 'к', 'л', 'м', 'н'], ['م', 'ص', 'ظ', 'و', 'ر', 'م', 'ي', 'ج', 'ز', 'ق']]

panda = X_train.head()
some_data = feature_engineering_2(panda, new_chars)
some_data
    

In [None]:
X_train_2 = X_train.head(6000)
X_train_2_eng = feature_engineering_2(X_train_2, new_chars)

y_train_2 = y_train.head(6000)

from sklearn.ensemble import RandomForestClassifier

rnd_clf_2 = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, random_state=42)
rnd_clf_2.fit(X_train_2_eng, y_train_2)

X_test_2 = feature_engineering_2(X_test, new_chars)
y_preds_2 = rnd_clf_2.predict(X_test_2)

from sklearn.metrics import accuracy_score

acc_score_2 = accuracy_score(y_test, y_preds_2)

print('Accuracy=%s' % (acc_score_2))

### First Test Model
----
This is a test model to experiment how to implement the finalized model onto the flask website (via pickle file)

In [None]:
#characters for first model
chars_2 = ['e', 't', 'ä', 'ö', 'a', 'n', 'ก', 'ข', 'ค', 'ฅ', 'ฆ', 'ง', 'அ', 'ஆ', 'இ', 'ஈ', 'உ', 'ஊ', 
         'o', 'r', 'ー', '日', 'あ', 'ぁ', 'ぇ', 'ç', 'ğ', 'ı', 'İ', 'î', 'ö', 'ş', 'i', 'u', 'چ', 'ح', 'خ', 'ش',
         'â', 'ù', 'è', 's', 'î', 'ë', '胡', '童', '。', 'ᄁ', '알', '에', 'ᄃ', 'ऺ', 'त', 'ऻ', 'क', 'á', 'é', 'í', 
         'ó', 'ږ','ک', 'ﻑ', 'ی', 'م', 'ث', 'ţ', 'ă', 'ș', 'ş', 'б', 'в', 'г', 'д', 'ص', 'ف', 'ج', 'ر']

X_train_2 = X_train.head(3000)
model_one = feature_engineering(X_train_2, chars_2)
model_one

y_train_one = y_train.head(3000)

from sklearn.ensemble import RandomForestClassifier
rnd_clf = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, random_state=42)
rnd_clf.fit(model_one, y_train_one)

X_test_1 = feature_engineering(X_test, chars_2)

y_preds = rnd_clf.predict(X_test_1)

from sklearn.metrics import accuracy_score
acc_score = accuracy_score(y_test, y_preds)

print('Accuracy=%s' % (acc_score))

import pickle #ask about pickle

saved_model = pickle.dumps(rnd_clf)

rdf_from_pickle = pickle.loads(saved_model)

In [8]:
panda = X_train.head()
panda

5207     สัมประสิทธิ์ฮอลล์ ฟิสิกส์ไฟฟ้า เกี่ยวกับสนามแม...
4450     เกิดวันที่  พฤศจิกายน ภาคอะนิเมะ คดีฆาตกรรมบนจ...
7033     i omgivningarna runt manigotagan river park re...
487      நிஞ்சா ஹட்டோரி 忍者ハットリくん ninja hattori என்பது க...
19537    эта страница деятельности м в ломоносова — ярк...
Name: Text, dtype: object

In [9]:
first_22_features = ['e', 't', 'a', 'i', 'o', 'á', 'é', 'í']

test_data = feature_engineering(panda, first_22_features)
test_data_2 = numpy_into_dataframe(test_data, first_22_features)

In [10]:
test_data_2

Unnamed: 0,e,t,a,i,o,á,é,í
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.106299,0.051181,0.086614,0.051181,0.023622,0.0,0.0,0.0
3,0.0,0.00692,0.00692,0.00692,0.00346,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Creating Features: Most Used Characters
To train the models to determine which language is being used by the user, we first need to know which characters are used in each language. The best approach for this is to used the dataset, _which is already using the characters from each language_, and find the **most used characters** in them.

### Kaggle Resources
----
The following code is taken from the [Kaggle](https://www.kaggle.com/code/martinkk5575/language-detection/notebook) to understand how to process characters for each language. Kaggle uses the `CountVectorizor` Method from the `sklearn` module to tokenize the characters into readable 1's and 0's. Then, it counts how many times that character has been used in the sample data provided.

This method reduces the necessity of locating alphabets for each language and creating custom functions to find the most used characters in the dataset.
 
**To summarize Kaggle's findings: Languages based off the Latin Alphabet are easier to differentiate from each other in the data set while Languages with their own Alphabet, _like Chinese and Japanese_, can be differentiated by single characters alone.**


This code here fits and transforms the X_train and X_test data sets into readable 1's and 0's and counts the number of times a specific character shows up in the datasets.The `min_df` parameter tells `CountVectorizor` to save any characters that are used **at least 1% of the time** in the dataset.

These matrices are saved as `X_top1Percent_train_raw` and `X_top1Percent_test_raw`.

In [11]:
# This will create a list of single and double characters from the top 1% of to be used as features
from sklearn.feature_extraction.text import CountVectorizer

top1PrecentMixtureVectorizer = CountVectorizer(analyzer='char', ngram_range=(1,2), min_df=1e-2)

X_top1Percent_train_raw = top1PrecentMixtureVectorizer.fit_transform(X_train)
X_top1Percent_test_raw = top1PrecentMixtureVectorizer.transform(X_test)

The `train_lang_dict()` function takes in the raw vectorized X_train and y_train data sets and converts them into readable dictionaries.

In [12]:
# This Command from Kaggle connects the character features to their specific language

# Aggregate Unigrams per language
def train_lang_dict(X_raw_counts, y_train):
    lang_dict = {}
    for i in range(len(y_train)):
        lang = y_train[i]
        v = np.array(X_raw_counts[i])
        if not lang in lang_dict:
            lang_dict[lang] = v
        else:
            lang_dict[lang] += v
            
    # to relative
    for lang in lang_dict:
        v = lang_dict[lang]
        lang_dict[lang] = v / np.sum(v)
        
    return lang_dict

In [13]:
top1PrecentMixtureVectorizer = CountVectorizer(analyzer='char', ngram_range=(1,2), min_df=1e-2, max_df=.9)

X_top1Percent_train_raw = top1PrecentMixtureVectorizer.fit_transform(X_train)
X_top1Percent_test_raw = top1PrecentMixtureVectorizer.transform(X_test)

In [14]:
language_dict_top1Percent = train_lang_dict(X_top1Percent_train_raw.toarray(), y_train.values)

top1PercentFeatures = top1PrecentMixtureVectorizer.get_feature_names_out()

pd.DataFrame(top1PercentFeatures)

Unnamed: 0,0
0,""""
1,-
2,[
3,a
4,b
...,...
3073,（
3074,）
3075,）。
3076,，


The `getRelevantGramsPerLanguage()` function processes the dictionary and returns a dictionary with only the top 50 **most used** characters for **each** language. This number _can_ be changed by setting `top=x` when you call `getRelevantGramsPerLanguage()`

In [15]:
def getRelevantGramsPerLanguage(features, language_dict, top=50):
    relevantGramsPerLanguage = {}
    for lang in language_list:
        chars = []
        relevantGramsPerLanguage[lang] = chars
        v = language_dict[lang]
        sortIndex = (-v).argsort()[:top]
        for i in range(len(sortIndex)):
            chars.append(features[sortIndex[i]])
    return relevantGramsPerLanguage

Below Displays the top 8 and 10 Characters from each language in a DataFrame 

In [16]:
top8PerLanguage_dict = getRelevantGramsPerLanguage(top1PercentFeatures, language_dict_top1Percent, top=8)
top8PerLanguage_dict

{'Urdu': ['ا', 'ی', 'ر', 'و', 'ک', 'م', 'ن', 'ہ'],
 'French': ['e', 'a', 'n', 's', 'i', 'r', 't', 'l'],
 'Pushto': ['و', 'ا', 'ه', 'ي', 'ه ', 'د', 'ر', 'ل'],
 'Spanish': ['e', 'a', 'o', 'n', 's', 'r', 'i', 'l'],
 'Romanian': ['e', 'a', 'i', 'r', 'n', 't', 'u', 'l'],
 'Estonian': ['a', 'i', 'e', 's', 't', 'l', 'n', 'u'],
 'Indonesian': ['a', 'n', 'e', 'i', 'r', 'u', 't', 's'],
 'Persian': ['ا', 'ی', 'ر', 'د', 'ن', 'ه', 'و', 'م'],
 'Dutch': ['e', 'n', 'a', 'i', 'r', 't', 'o', 'd'],
 'Japanese': ['の', '、', 'に', 'た', 'る', '。', 'は', 'と'],
 'Korean': ['이', '의', '다', '의 ', '에', '는', '는 ', '하'],
 'Tamil': ['்', 'க', 'ு', 'ி', 'த', '் ', 'ப', 'ம'],
 'English': ['e', 'a', 't', 'i', 'o', 'n', 's', 'r'],
 'Swedish': ['e', 'r', 'a', 'n', 't', 'i', 's', 'd'],
 'Thai': ['า', 'น', 'ร', 'ก', 'อ', '่', 'เ', 'ง'],
 'Turkish': ['a', 'e', 'i', 'n', 'r', 'l', 'ı', 'd'],
 'Portugese': ['a', 'e', 'o', 's', 'i', 'r', 'd', 'n'],
 'Arabic': ['ا', 'ل', 'ي', 'ال', 'م', 'و', ' ا', 'ن'],
 'Chinese': ['，', '的', '。', 

In [17]:
top10PerLanguage_dict = getRelevantGramsPerLanguage(top1PercentFeatures, language_dict_top1Percent, top=10)
top10PerLanguage_dict

{'Urdu': ['ا', 'ی', 'ر', 'و', 'ک', 'م', 'ن', 'ہ', 'ے', 'ل'],
 'French': ['e', 'a', 'n', 's', 'i', 'r', 't', 'l', 'e ', 'u'],
 'Pushto': ['و', 'ا', 'ه', 'ي', 'ه ', 'د', 'ر', 'ل', 'ن', ' د'],
 'Spanish': ['e', 'a', 'o', 'n', 's', 'r', 'i', 'l', 'd', 't'],
 'Romanian': ['e', 'a', 'i', 'r', 'n', 't', 'u', 'l', 'o', 'c'],
 'Estonian': ['a', 'i', 'e', 's', 't', 'l', 'n', 'u', 'o', 'k'],
 'Indonesian': ['a', 'n', 'e', 'i', 'r', 'u', 't', 's', 'an', 'k'],
 'Persian': ['ا', 'ی', 'ر', 'د', 'ن', 'ه', 'و', 'م', 'ت', 'ب'],
 'Dutch': ['e', 'n', 'a', 'i', 'r', 't', 'o', 'd', 's', 'n '],
 'Japanese': ['の', '、', 'に', 'た', 'る', '。', 'は', 'と', 'ー', 'を'],
 'Korean': ['이', '의', '다', '의 ', '에', '는', '는 ', '하', '을', '을 '],
 'Tamil': ['்', 'க', 'ு', 'ி', 'த', '் ', 'ப', 'ம', 'ட', 'ர'],
 'English': ['e', 'a', 't', 'i', 'o', 'n', 's', 'r', 'h', 'l'],
 'Swedish': ['e', 'r', 'a', 'n', 't', 'i', 's', 'd', 'l', 'o'],
 'Thai': ['า', 'น', 'ร', 'ก', 'อ', '่', 'เ', 'ง', 'ม', 'ั'],
 'Turkish': ['a', 'e', 'i', 'n', 'r', 

In [18]:
def dictToArray(dict, languages=language_list):
    '''Converts a Language dictionary to an array and removes the duplicate values'''
    char = []

    for lang in languages:
        arr = dict[lang]
        char = char + arr
    
    dict_array = list(set(char))

    return dict_array

In [19]:
dict8_array = dictToArray(top8PerLanguage_dict)

print(dict8_array)
print("Total Number of Characters:", len(dict8_array))

['의', ' ا', '하', '、', 'د', 'த', 'l', 'स', 'ال', '는', 'க', '年', 'ه ', 'า', 'u', 'ر', 'о', 'م', 'и', '，', 'น', 'ه', 'ு', '는 ', 'ي', 'र', 'a', 'क', '一', 'ن', 'ा', 'に', '。', 'ہ', '的', 'ک', 'る', 'เ', 'ி', 'а', 'ı', '이', 'ம', 'ل', 's', 'ப', ' क', 'ا', '் ', 'ی', 'อ', 'н', '다', 'e', '中', 'ก', 'т', '่', 'i', 'े', 'с', 'ि', 'た', 'は', 'の', 't', 'n', 'と', 'r', '在', '의 ', '에', 'و', 'ร', 'е', 'o', 'р', 'ง', '்', '्', 'd']
Total Number of Characters: 81


In [20]:
dict10_array = dictToArray(top10PerLanguage_dict)

print(dict10_array)
print("Total Number of Characters:", len(dict10_array))

['의', ' ا', '하', '、', 'د', 'த', 'l', 'h', 'ے', 'स', 'ال', '을 ', '는', 'க', '年', 'ه ', 'า', 'u', 'ر', 'о', 'م', 'и', 'ت', '，', 'n ', 'น', 'ه', 'л', 'ு', '는 ', 'd', 'ي', 'र', 'a', 'क', '一', 'ن', 'ा', 'に', '。', 'ہ', '的', 'ک', 'る', 'เ', 'ி', 'а', 'ı', ' د', '이', 'त', 'ம', 'ー', 'न', 'ب', 'ل', 'ม', 'm', 's', 'ப', 'e ', 'を', 'c', ' क', 'k', 'ا', '் ', 'ی', 'อ', 'ั', 'н', '다', 'e', '中', 'ர', 'ก', 'т', '่', 'i', 'े', 'с', 'ि', 'た', 'は', 'の', 't', 'n', 'と', 'r', 'an', '在', 'в', '을', '의 ', '에', 'و', 'ร', 'е', 'o', 'р', 'ง', '்', '्', 'ட']
Total Number of Characters: 104


## Training the Model: RandomForests
Now that we know the top 8 and 10 used characters for each language, we can use them to train our model.

First, we'll use our `feature engineering` function to find the number of times our characters are used in the test set.

In [21]:
dictVectorizor = feature_engineering(X_train, dict8_array)
dict8Frame = numpy_into_dataframe(dictVectorizor, dict8_array)


In [22]:
dict8Frame

Unnamed: 0,의,ا,하,、,د,த,l,स,ال,는,...,에,و,ร,е,o,р,ง,்,्,d
0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.021622,0.00000,0.000000,0.000000,0.027027,0.000000,0.0,0.000000
1,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.039906,0.00000,0.000000,0.000000,0.035211,0.000000,0.0,0.000000
2,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.031496,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.00000,0.023622,0.000000,0.000000,0.000000,0.0,0.039370
3,0.0,0.0,0.0,0.000000,0.000000,0.079585,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.00000,0.003460,0.000000,0.000000,0.148789,0.0,0.000000
4,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.07984,0.000000,0.042914,0.000000,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17595,0.0,0.0,0.0,0.000000,0.027304,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.051195,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
17596,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.040359,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.00000,0.076233,0.000000,0.000000,0.000000,0.0,0.053812
17597,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.078283,0.0,0.0,0.0,...,0.0,0.000000,0.000000,0.00000,0.025253,0.000000,0.000000,0.000000,0.0,0.035354
17598,0.0,0.0,0.0,0.000000,0.057018,0.000000,0.000000,0.0,0.0,0.0,...,0.0,0.061404,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000


Next, a `RandomForestClassifier` model will be trained on the created dictionary.

In [23]:
from sklearn.ensemble import RandomForestClassifier

rfc = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, random_state=42)

rfc.fit(dict8Frame, y_train)

RandomForestClassifier(max_leaf_nodes=16, n_estimators=500, random_state=42)

The test set will be vectorized and predicted by the model.

In [24]:
testVectorizor = feature_engineering(X_test, dict8_array)

In [25]:
y_8pred = rfc.predict(testVectorizor)



Now, to display our accuracy score.

In [39]:
from sklearn.metrics import accuracy_score

print("Accuracy Score of top 8 used characters:",accuracy_score(y_test, y_8pred))

Accuracy Score of top 8 used characters: 0.8393181818181819


Now for the Top 10 Characters

In [27]:
dictVectorizor = feature_engineering(X_train, dict10_array)
dict10Frame = numpy_into_dataframe(dictVectorizor, dict10_array)

In [28]:
dict10Frame

Unnamed: 0,의,ا,하,、,د,த,l,h,ے,स,...,에,و,ร,е,o,р,ง,்,्,ட
0,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,...,0.0,0.000000,0.021622,0.00000,0.000000,0.000000,0.027027,0.000000,0.0,0.000000
1,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,...,0.0,0.000000,0.039906,0.00000,0.000000,0.000000,0.035211,0.000000,0.0,0.000000
2,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.031496,0.011811,0.0,0.0,...,0.0,0.000000,0.000000,0.00000,0.023622,0.000000,0.000000,0.000000,0.0,0.000000
3,0.0,0.0,0.0,0.000000,0.000000,0.079585,0.000000,0.003460,0.0,0.0,...,0.0,0.000000,0.000000,0.00000,0.003460,0.000000,0.000000,0.148789,0.0,0.069204
4,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,0.0,...,0.0,0.000000,0.000000,0.07984,0.000000,0.042914,0.000000,0.000000,0.0,0.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17595,0.0,0.0,0.0,0.000000,0.027304,0.000000,0.000000,0.000000,0.0,0.0,...,0.0,0.051195,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000
17596,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.040359,0.004484,0.0,0.0,...,0.0,0.000000,0.000000,0.00000,0.076233,0.000000,0.000000,0.000000,0.0,0.000000
17597,0.0,0.0,0.0,0.000000,0.000000,0.000000,0.078283,0.005051,0.0,0.0,...,0.0,0.000000,0.000000,0.00000,0.025253,0.000000,0.000000,0.000000,0.0,0.000000
17598,0.0,0.0,0.0,0.000000,0.057018,0.000000,0.000000,0.000000,0.0,0.0,...,0.0,0.061404,0.000000,0.00000,0.000000,0.000000,0.000000,0.000000,0.0,0.000000


In [34]:
rfc2 = RandomForestClassifier(n_estimators=500, max_leaf_nodes=16, random_state=42)

rfc2.fit(dict10Frame, y_train)

RandomForestClassifier(max_leaf_nodes=16, n_estimators=500, random_state=42)

In [35]:
test2Vectorizor = feature_engineering(X_test, dict10_array)

In [37]:
y_10pred = rfc2.predict(test2Vectorizor)



In [40]:
print("Accuracy Score of top 10 used characters:",accuracy_score(y_test, y_10pred))

Accuracy Score of top 10 used characters: 0.8872727272727273


In [None]:
# Create these arrays into dictonaries
english_alpha = [a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z]
estonian_alpha = [A, B, D, E, F, G, H, I, J, K, L, M, N, O, P, R, S, Š, Z, Ž, T, U, V, Õ, Ä, Ö, Ü]
swedish_alpha = [a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z, å, ä, ö]
thai_alpha = [ก, ข, ค, ฅ, ฆ, ง, จ, ฉ, ช, ฌ, ญ, ฎ, ฏ, ฐ, ฑ, ฒ, ณ, ด, ต, ถ, ท, ธ, น, บ, บ, ผ, ฝ, พ, ฟ, ภ, 
               ม, ย, ร, ล, ว, ศ, ษ, ส, ห, ฬ, อ, ฮ] 
tamil_alpha = [அ, ஆ, இ, ஈ, உ, ஊ, எ, ஏ, ஐ, ஒ, ஓ, ஔ, க, ங, ச, ஞ, ட, ண, த, ந, ன, ப, ம, ய, ர, ற, ல, ள, ழ, வ]
dutch_alpha = english_alpha
japanese_alpha = [ぁ, あ, ぃ, い, ぅ, う, ぇ, え, ぉ, お, か, が, き, ぎ, く, ぐ, け, げ, こ, ご, さ, ざ, し, じ, す, ず,
                  せ, ぜ, そ, ぞ, た, だ, ち, ぢ, っ, つ, づ, て, で, と, ど, な, に, ぬ, ね, の, は, ば, ぱ, ひ, び, ぴ,
                  ふ, ぶ, ぷ, へ, べ, れ, る, り, ら, よ, ょ, ゆ, ゅ, や, ゃ, も, め, む, み, ま, ぽ, ぼ, ほ, ぺ, ろ, ゎ,
                  わ, ゐ, ゑ, を, ん, ゔ, ゕ, ゖ,  ゚, ゛, ゜, ゝ, ゞ, ゟ, ゠, ァ, ア, サ, ゴ, コ, ゲ, ケ, グ, ク, ギ, キ,
                  ガ, カ, オ, ォ, エ, ェ, ウ, ゥ, イ, ィ, ザ, シ, ジ, ス, ズ, セ, ゼ, ソ, ゾ, タ ,ダ ,チ ,ヂ, ッ, ツ, ヅ,
                  テ, デ, ト, ホ, ペ, ベ, ヘ, プ, ブ, フ, ピ, ビ, ヒ, パ, バ, ハ, ノ, ネ, ヌ, ニ, ナ, ド, ボ, ポ, マ, ミ, 
                  ム, メ, モ, ャ, ヤ, ュ, ユ, ョ, ヨ, ラ, リ, ル, レ, ロ, ヮ, ㍿, ㍐, ヿ, ヾ, ヽ, ー, ・, ヺ, ヹ, ヸ, ヷ,
                  ヶ, ヵ, ヴ, ン, ヲ, ヱ, ヰ, ワ]
turkish_alpha = [a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, r, s, t, u, v, y, z, ç, ğ, ı, İ, î, ö, ş, ü]
latin_alpha = english_alpha
urdu_alpha = [ش,س,ژ,ز,ڑ,ر,ذ,ڈ,د,خ,ح,چ,
              ج,ث,ٹ,ت,پ,ب,آ,ا,ے,ی,ھ,ہ,و,ں,ن,م,ل,گ,ک,ق,ف,غ,ع,ظ,ط,ض,ص]
indonesian_alpha = english_alpha
portuguese_alpha = [ç, á, é, í, ó, ú, â, ê, ô, ã, õ, à, è, ì, ò, ù, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z]
french_alpha = [ç, é, â, ê, î, ô, û, à, è, ì, ò, ù, ë, ï, ü, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z]
chinese_alpha = [胡, 赛, 尼, 本, 人, 和, 小, 说, 的, 主, 人, 公, 阿, 米, 尔, 一, 样, 都, 是, 出, 生, 在, 阿, 富, 汗, 首, 都, 
                 喀, 布, 尔, 少, 年, 时, 代, 便, 离, 开, 了, 这, 个, 国, 家, 。, 胡, 赛, 尼, 直, 到, 年, 小, 说, 出, 版, 之, 
                 后, 才, 首, 次, 回, 到, 已, 经, 离, 开, 年, 的, 祖, 国, 。, 他, 在, 苏, 联, 入, 侵, 时, 离, 开, 了, 阿, 富, 
                 汗, 而, 他, 的, 很, 多, 童, 年, 好, 友, 在, 阿, 富, 汗, 生, 活, 在, 他, 们, 出, 发, 之, 前, 罗, 伯, 特, 伊,
                 达, 尔, 文, 卷, 查, 尔, 斯, 赖, 尔, 所, 著, 地, 质, 学, 原, 理, 在, 南, 美, 他, 得, 到, 第, 卷, 该, 书, 将, 
                 地, 形, 地, 貌, 解, 释, 为, 漫, 长, 历, 史, 时, 间, 渐, 进, 演, 变, 的, 的, 结, 果, 当, 他, 旅, 程, 的, 第, 
                 站, 抵, 达, 圣, 地, 亚, 哥, 佛, 得, 角, 的, 时, 候, 达, 尔, 文]
korean_alpha = [ᄁ,ᄂ,ᄃ,ᄄ,ᄅᄆᄇ,ᄈ,ᄉ,ᄊ,ᄋ,ᄌᄍ,ᄎ,ᄏ,ᄐ,ᄑᄒ,아,악,안,알,암,압,앙,앞애,액,앵야,얀,약,양,얘,어,억,
                언,얼,엄,업,엉,에,여,역,연,열,염,엽,영,예,ᄀ,여,역,연,열,염,엽,영,예,오,옥,온,올,옴,옹,와,완,왈,왕,왜,외,왼,
                요,욕,용,우,욱,운,울,움,웅,워,원,월,위,유,육,윤,율,융,윷,으,은,을,음읍,응,의,이,익,인,일,임,입,잉,잎]
hindi_alpha = [ऄ, अ, आ, इ, ई, उ, ऊ, ऋ, ऌ, ऍ, ऎ, ए, ऐ, ऑ, ऒ, ओ, औ, क, ख, ग, घ, ङ, च, छ, ज, झ, प, ऩ, न, ध, द, 
               थ, त, ण, ढ, ड, ठ, ट, ञ, फ, ब, भ, म, य, र, ऱ, ल, ळ, ऴ, व, श, ष, ४, ३, २, १, ०, ॥, ।, ॡ, ॠ, ॐ, ऽ, 
               ह, स, ५, ६, ७, ॲ, ॳ, ॴ, ॵ, ॶ, ॷ, ॹ, ॺ, ॻ, ॼ, ॾ, ॿ, ೱऀँं, ः, ऺ, ऻ, ा, ि, ी, ॎ, ॏॕैेॣॢ, ॗ]
spanish_alpha = [á, é, í, ó, ú, ñ, ü, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z]
pushto_alpha = [,ﺏ ,پ ,ﺕ ,ټ ,ﺙ ,ﺝ ,چ ,ﺡ ,ﺥ ,څ ,ځ ,ﺩ ,ډ ,ﺫ ,ﺭ ,ړ ,ﺯ ,ژ ,ږ ,ﺱ ,ﺵ ,ښ ,ﺹ ,ﺽ ,ﻁ ,ﻅ ,ﻉ ,ﻍ ,ﻑ ,ﻕ ,ک ,ګ ,ﻝ ,ﻡ ,ﻥ ,ڼ, ,ﻭ ,ه ,ۀ ,ي ,ې ,ی ,ۍ ,ئ]
persian_alpha = [,ش,س,ژ,ز,ر,ذ,د,خ,ح,چ,ج,ث,ت,پ,ب,آ,ا,ص,ض,ط,ظ,ع,غ,ف,ق,ک,گ,ل,م,ن,و,ه,ی]
romanian_alpha = [ă, â, î, ș, ş, ț, ţ, a, b, c, d, e, f, g, h, i, j, k, l, m, n, o, p, q, r, s, t, u, v, w, x, y, z]
russian_alpha = [б, в, г, д, ж, з, к, л, м, н, п, р, с, т, ф, х, ц, ч, ш, щ, а, е, ё, и, о, у, ы, э, ю, я, й]
arabic_alpha = [ش,س,ز,ر,ذ,د,خ,ح,ج,ث,ت,ب,ا,ء,ي,و,ه,ن,م,ل,ك,ق,ف,غ,ع,ظ,ط,ض,ص]


In [None]:
# Function to calculate Average Word Length of a single string
def avg_word_len(string):
    # Split string up and find total amount of words present
    words = string.split()
    wordCount = len(words)
    
    # Calculate actual average  
    ch = 0
    for word in words:
        ch += len(word) # Add up all chars
    avg = ch / wordCount # Divide sum of chars by amount of words present
    return avg

In [None]:
# Function to calculate the Average Sentence Length across a piece of text
def avg_sent_len(text):
  sentences = text.split(".") # Split the text into a list of sentences.
  words = text.split(" ") # split the input text into a list of separate words
  if(sentences[len(sentences)-1]==""): # if the last value in sentences is an empty string
    average_sentence_length = len(words) / len(sentences)-1
  else:
    average_sentence_length = len(words) / len(sentences)
  return average_sentence_length # returning avg length of sentence
  
ans = avg_sent_len("I am going.to see you later") # function call
print(ans)

In [None]:
# Lets test this function (avg_sent_len)          #Mr., Dr., Ms., etc. are words that may be a problem for this function
avg_sent_len(input("Provide a body of text: "))

In [None]:
# One Hot Encoding                                #Attempt failed lol
from sklearn.preprocessing import OneHotEncoder

cat_encoder = OneHotEncoder() # instantiate one hot encoder
cat_encoder2 = cat_encoder.fit_transform(data_trans)
cat_encoder2 #sparse matrix

cat_encoder2.toarray()

array([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 

In [None]:
# One Hot Encoding
features = pd.get_dummies(data_trans)

In [None]:
features

Unnamed: 0,0_ أ,1_ ہ,2_या,3_ د,4_行,5_》,6_こ,7_ng,8_े,9_अ,...,554_ட,555_்க,556_ा,557_서,558_을,559_த்,560_ु,561_t,562_も,563_ं
Chars,1,1,1,1,1,1,1,1,1,1,...,1,1,1,1,1,1,1,1,1,1


In [None]:
data_trans

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,554,555,556,557,558,559,560,561,562,563
Chars,ed,た。,m,ی,nd,d,と,د,ار,和,...,ใ,ไ,h,d,่,क,อง,‌,有,و
