### Detect language 

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
import pandas as pd
import numpy as np
import re
import matplotlib.pyplot as plt
import pickle
from sklearn.preprocessing import OneHotEncoder
import seaborn as sns

In [10]:
data = pd.read_csv("../data/Language Detection.csv")

In [11]:
data.Language.value_counts()

English       1385
French        1014
Spanish        819
Portugeese     739
Italian        698
Russian        692
Sweedish       676
Malayalam      594
Dutch          546
Arabic         536
Turkish        474
German         470
Tamil          469
Danish         428
Kannada        369
Greek          365
Hindi           63
Name: Language, dtype: int64

In [58]:
data.iloc[0].Text

' Nature, in the broadest sense, is the natural, physical, material world or universe.'

In [12]:
# 'English'
data = data[data.Language.isin(['French', 'Russian'])]

In [13]:
data.shape

(1706, 2)

In [14]:
data.Language.value_counts()

French     1014
Russian     692
Name: Language, dtype: int64

In [15]:
def length(text):
    words = text.split()
    return len(words)

In [16]:
def is_ru(col):
    return 1 if col == "Russian" else 0

In [17]:
def clean(text):
    # remove numbers in text
    text = re.sub(r'[!@#$(),\n"%^*?\:;~`0-9]', ' ', text)
    # remove other symbols in text
    text = re.sub(r'[[]]', ' ', text)
    text = text.lower()
    return text

In [18]:
data['clean'] = data['Text'].apply(clean)

  """


In [19]:
data['lang_code'] = data['Language'].apply(is_ru)

In [20]:
data['len'] = data['Text'].apply(length)

In [21]:
data.head()

Unnamed: 0,Text,Language,clean,lang_code,len
3250,Si vous disposez d'ouvrages ou d'articles de r...,French,si vous disposez d'ouvrages ou d'articles de r...,0,54
3251,Comment ajouter mes sources ?,French,comment ajouter mes sources,0,5
3252,Cette page ou section est en train d'être trad...,French,cette page ou section est en train d'être trad...,0,40
3253,Vous pouvez aider au développement de Wikipédi...,French,vous pouvez aider au développement de wikipédi...,0,16
3254,Le mot nature est un terme polysémique (c’est-...,French,le mot nature est un terme polysémique c’est-...,0,50


In [22]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

In [23]:
X = cv.fit_transform(data["clean"])

In [24]:
y = data["lang_code"]

In [25]:
from sklearn.model_selection import train_test_split
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, y, test_size = 0.20)

In [26]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()

In [27]:
clf.fit(Xtrain, Ytrain)
Ypred = clf.predict(Xtest)

In [28]:
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report

In [29]:
accr = accuracy_score(Ytest, Ypred)
print(accr)

0.9941520467836257


In [30]:
print(classification_report(Ytest, Ypred))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       215
           1       0.99      0.99      0.99       127

    accuracy                           0.99       342
   macro avg       0.99      0.99      0.99       342
weighted avg       0.99      0.99      0.99       342



In [31]:
languages = {0:"french", 1:"russian"}

In [82]:
x = cv.transform(["Eh bien, mon prince"])
print(clf.predict_proba(x))
print(clf.predict(x))

[[0.99777482 0.00222518]]
[0]


In [33]:
def predict(text, verbose=False):
    x = cv.transform([text])
    lang = clf.predict(x)
    probs = clf.predict_proba(x)
    if verbose:
        print("Probability is: ", probs[0][0])
        print("The language is", languages[lang[0]])

In [35]:
predict("Eh bien, mon prince", verbose=True)

Probability is:  0.9987415039229451
The language is french


In [85]:
predict("Анна Павловна кашляла несколько дней, у нее был  грипп , как она говорила  (грипп  был тогда новое слово, употреблявшееся только редкими)")

Probability is:  1.514648429106703e-12
The language is russian


In [86]:
predict("В записочках, разосланных утром с красным лакеем, было написано без различия во всех:   «Si vous n’avez rien de mieux à faire, M. le comte (или mon prince), et si la perspective de passer la soirée chez une pauvre malade ne vous effraye pas trop, je serai charmée de vous voir chez moi entre 7 et 10 heures.")

Probability is:  1.0
The language is french


In [87]:
predict("Je vois que je vous fais peur,  садитесь и рассказывайте.")

Probability is:  0.999999999733447
The language is french


In [88]:
predict("— Вы весь вечер у меня, надеюсь?")

Probability is:  6.069362600857917e-06
The language is russian


In [89]:
predict("— Le général Koutouzoff, — сказал Болконский, ударяя на последнем слоге  zoff , как француз, — a bien voulu de moi pour aide-de-camp...   — Et Lise, votre femme?")

Probability is:  0.9999999999982663
The language is french


In [90]:
import spacy
from spacy_langdetect import LanguageDetector
from spacy.language import Language

def create_lang_detector(nlp, name):
    return LanguageDetector()

Language.factory("language_detector", func=create_lang_detector)

# nlp = spacy.load("en_core_web_sm")
nlp = spacy.load("fr_core_news_sm")
nlp.add_pipe('language_detector')

<spacy_langdetect.spacy_langdetect.LanguageDetector at 0x104b734d0>

In [91]:
text = 'This is an english text.'
doc = nlp(text)
print(doc._.language)
for sent in doc.sents:
    print(sent, sent._.language)

{'language': 'en', 'score': 0.9999955493241687}
This is an english text. {'language': 'en', 'score': 0.9999964420341838}


In [92]:
text2 = "Eh bien, mon prince"
doc = nlp(text2)
print(doc._.language)

{'language': 'es', 'score': 0.9999938889604822}


In [93]:
text3 = "Анна Павловна кашляла несколько дней, у нее был  грипп , как она говорила  (грипп  был тогда новое слово, употреблявшееся только редкими)"

In [94]:
doc = nlp(text3)
print(doc._.language)

{'language': 'ru', 'score': 0.9999954284182743}


In [95]:
text4 = "Je vois que je vous fais peur,  садитесь и рассказывайте."
doc = nlp(text4)
print(doc._.language)

{'language': 'ru', 'score': 0.7142841147198921}


In [96]:
text5 = "Je vois que je vous fais peur"
doc = nlp(text5)
print(doc._.language)

{'language': 'fr', 'score': 0.9999963567385901}


In [97]:
from custom_mnb import CustomMNB
mnb = CustomMNB()

In [98]:
X, y = data["Text"], data["lang_code"]

In [99]:
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, y, test_size = 0.20)

In [100]:
mnb.fit(Xtrain, Ytrain)

In [101]:
Ypred = mnb.predict(Xtest)

In [108]:
Ypred = np.asarray(Ypred)
print(classification_report(Ytest, Ypred))

              precision    recall  f1-score   support

           0       0.97      1.00      0.99       195
           1       1.00      0.97      0.98       147

    accuracy                           0.99       342
   macro avg       0.99      0.98      0.99       342
weighted avg       0.99      0.99      0.99       342



#### Create own CountVectorizor and test it!

In [5]:
data = [
    "I don't care, go on and tear me apart",
    "I don't care if you do,",
    "Cause in a sky, cause in sky full of stars",
    "I think I see you",
    "I think I see you"
]

In [6]:
import sys 
sys.path.append('../')

In [7]:
from custom_count_vectorizer import CustomCV

In [8]:
my_cv = CustomCV()

In [10]:
my_cv.fit_transform(data).toarray()

array([[0, 1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0],
       [0, 0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 2, 0, 1, 0, 0, 2, 1, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1]])

In [11]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
X = cv.fit_transform(data)
X.todense().shape

(5, 19)

In [12]:
X.todense()

matrix([[1, 1, 1, 0, 0, 1, 0, 1, 0, 0, 1, 0, 1, 0, 0, 0, 1, 0, 0],
        [0, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
        [0, 0, 0, 2, 0, 0, 1, 0, 0, 2, 0, 1, 0, 0, 2, 1, 0, 0, 0],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1],
        [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1]])

In [49]:
fr = [
    "Bonjour, je me suis réveillé le matin,",
    "Brossé mes dents,",
    "Je suis allé à l'école",
    "Fini la journée",
    "J'écoutais de la musique",
     "rassemblé tous les livres",
     "Rebonjour",
     "J'ai fait mon lit",
     "Ils m'ont écouté",
     "Je me suis couché tot",
]

In [50]:
ru = [
    "Привет. Я проснулся утром,",
    "Почистил зубы,",
    "Отправились в школу",
    "Закончил день",
    "я слушал музыку",
    "Собрал все книги",
    "привет еще раз",
    "я заправил свою постель",
    "Они слушали меня",
    "я рано лег спать"
]

In [51]:
X = fr + ru
y = [0]*10 + [1]*10

In [52]:
X_test = [
    "Bonjour, je m'appelle Junior",
    "Привет, меня зовут Джуниор",
]

In [53]:
from custom_mnb import CustomMNB
mnb = CustomMNB()

In [54]:
mnb.fit(X, y)

In [55]:
mnb.predict(X_test)

[0, 1]

In [141]:
SYMBOLS = '''0123456789!()-[]{};:'"\,<>./?@#$%^&*_~'''

In [142]:
 def tokenize(sentence):
    tokens = ""
    for char in sentence.lower():
        if char not in SYMBOLS:
            tokens += char
    return tokens.split()

In [179]:
def count_words(X, y):
    counts = {}
    for doc, label in zip(X, y):
        for word in tokenize(doc):
            # avoid KeyError
            if word not in counts:
                counts[word] = [0, 0]
            counts[word][label] += 1
    return counts

In [186]:
counts = count_words(X,y)

In [193]:
def prior_probas(counts):
    words_a, words_b = 0, 0
    for word, val in counts.items():
        words_a += val[0]
        words_b += val[1]
    total = words_a + words_b
    prior_a = words_a / total
    prior_b = words_b / total
    return prior_a, prior_b

In [211]:
cat0_count, cat1_count = words_a, words_b

In [244]:
def word_probabilities(counts):
    """ word, p(w | cat0), p(w | cat1)"""
    k = 0.5
    vocab = [word for word, (cat0, cat1) in counts.items()]
    return [(word, (cat0 + k) / (cat0_count + 2 * k),(cat1 + k) / (cat1_count + 2 * k))
            for word, (cat0, cat1) in counts.items()]

In [245]:
word_probabilities(counts)

[('je', 0.11904761904761904, 0.0625),
 ('me', 0.07142857142857142, 0.0625),
 ('suis', 0.11904761904761904, 0.0625),
 ('réveillé', 0.07142857142857142, 0.0625),
 ('le', 0.07142857142857142, 0.0625),
 ('matin', 0.07142857142857142, 0.0625),
 ('brossé', 0.07142857142857142, 0.0625),
 ('mes', 0.07142857142857142, 0.0625),
 ('dents', 0.07142857142857142, 0.0625),
 ('allé', 0.07142857142857142, 0.0625),
 ('à', 0.07142857142857142, 0.0625),
 ('lécole', 0.07142857142857142, 0.0625),
 ('fini', 0.07142857142857142, 0.0625),
 ('la', 0.07142857142857142, 0.0625),
 ('journée', 0.07142857142857142, 0.0625),
 ('я', 0.07142857142857142, 0.0625),
 ('проснулся', 0.07142857142857142, 0.0625),
 ('утром', 0.07142857142857142, 0.0625),
 ('почистил', 0.023809523809523808, 0.1875),
 ('зубы', 0.023809523809523808, 0.1875),
 ('отправились', 0.023809523809523808, 0.1875),
 ('в', 0.023809523809523808, 0.1875),
 ('школу', 0.023809523809523808, 0.1875),
 ('закончил', 0.023809523809523808, 0.1875),
 ('день', 0.02380

In [247]:
def word_probas(counts):
    word_probas = []
    k = 0.5
    for word, val in counts.items():
        pw_a = (val[0] + k) / (words_a + 2*k)
        pw_b = (val[1] + k) / (words_b + 2*k)
        word_probas.append((word, pw_a, pw_b))
    return word_probas

In [217]:
words_a, words_b = 0, 0
for word, val in counts.items():
    words_a += val[0]
    words_b += val[1]

In [7]:
with open("../data/tolstoy_full.txt") as fh:
    text = fh.read()

In [8]:
tolstoy = text.split("\n")

In [9]:
len(tolstoy)

30183

In [38]:
print(tolstoy[0])
predict(tolstoy[0], verbose=True)

— Eh bien, mon prince.
Probability is:  0.9987415039229451
The language is french


In [39]:
print(tolstoy[10])
predict(tolstoy[10], verbose=True)

— отвечал, нисколько не смутясь такою встречей, вошедший князь, в придворном, шитом мундире, в чулках, башмаках, и звездах, с светлым выражением плоского лица.
Probability is:  0.006340336594678441
The language is russian
