In [1]:
import pandas as pd

In [2]:
df  = pd.read_excel("Frases.xlsx")

In [3]:
df.head()

Unnamed: 0,Idioma,Texto
0,Aleman,Dies geht aus dem Jahresbericht des Beobachtun...
1,Aleman,Bei zwei der Geiseln handelt es sich demnach u...
2,Aleman,Zuvor hatte die Nachrichtenagentur Reuters übe...
3,Aleman,Selten verliefen die Halbfinals in Champions u...
4,Aleman,Dennoch dürfte das Pro-Militärlager keine Schw...


In [4]:
df.shape

(60, 2)

In [5]:
df['Chars'] = df.Texto.str.replace('[^\w]|\\d','')

In [6]:
df.head()

Unnamed: 0,Idioma,Texto,Chars
0,Aleman,Dies geht aus dem Jahresbericht des Beobachtun...,DiesgehtausdemJahresberichtdesBeobachtungszent...
1,Aleman,Bei zwei der Geiseln handelt es sich demnach u...,BeizweiderGeiselnhandeltessichdemnachumfranzös...
2,Aleman,Zuvor hatte die Nachrichtenagentur Reuters übe...,ZuvorhattedieNachrichtenagenturReutersüberents...
3,Aleman,Selten verliefen die Halbfinals in Champions u...,SeltenverliefendieHalbfinalsinChampionsundEuro...
4,Aleman,Dennoch dürfte das Pro-Militärlager keine Schw...,DennochdürftedasProMilitärlagerkeineSchwierigk...


## Train

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
X = df['Chars']
y = df['Idioma']

In [9]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=.2, stratify=y )

## Vectorizar

In [10]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [11]:
vect = TfidfVectorizer(analyzer='char', strip_accents='ascii') #, strip_accents='ascii'

In [12]:
modelo_tfidf = vect.fit(X_train)

In [13]:
X_train_tfidf = modelo_tfidf.transform(X_train).todense()

In [14]:
X_train_tfidf.shape

(48, 26)

In [15]:
X_train_tfidf

matrix([[0.28208537, 0.07037599, 0.1379084 , ..., 0.01188575, 0.        ,
         0.0204822 ],
        [0.41768707, 0.06558454, 0.1285191 , ..., 0.03046044, 0.02624559,
         0.        ],
        [0.39742826, 0.01726055, 0.16911841, ..., 0.01603317, 0.01381464,
         0.02762928],
        ...,
        [0.23469914, 0.01596923, 0.12517288, ..., 0.        , 0.        ,
         0.        ],
        [0.16082024, 0.07849994, 0.0908984 , ..., 0.        , 0.0114233 ,
         0.11423298],
        [0.54051798, 0.0513175 , 0.16341241, ..., 0.02383418, 0.        ,
         0.        ]])

In [17]:
modelo_tfidf.get_feature_names()

['a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z']

## KNN

In [18]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler

In [19]:
knn = KNeighborsClassifier(n_neighbors=7)

In [20]:
scale = StandardScaler().fit(X_train_tfidf)

In [21]:
#X_norm = scale.transform(X_train_tfidf)
X_norm = X_train_tfidf

In [22]:
knn.fit(X_norm, y_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=7, p=2,
           weights='uniform')

### Test

In [23]:
X_test_tfidf = modelo_tfidf.transform(X_test).todense()

In [24]:
#X_test_norm = scale.transform(X_test_tfidf)
X_test_norm = X_test_tfidf

In [25]:
y_pred = knn.predict(X_test_norm)

In [26]:
pd.crosstab(y_test, y_pred)

col_0,Aleman,Frances,Ingles,Italiano,Portugues,Spanish
Idioma,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Aleman,2,0,0,0,0,0
Frances,0,2,0,0,0,0
Ingles,0,0,2,0,0,0
Italiano,0,0,0,2,0,0
Portugues,0,0,0,0,1,1
Spanish,0,0,0,0,0,2


### Cross validation

In [27]:
datos = TfidfVectorizer(analyzer='char', strip_accents=None).fit_transform(X).todense()
objetivo = y

In [28]:
from sklearn.model_selection import cross_val_score
from sklearn import metrics

In [29]:
vecinos = 4
modelo = KNeighborsClassifier(n_neighbors=vecinos, weights="uniform")

In [30]:
scores = cross_val_score(modelo, datos, objetivo, cv=5, scoring='accuracy')
scores

array([0.91666667, 1.        , 1.        , 1.        , 0.83333333])

In [31]:
y.value_counts()

Ingles       10
Portugues    10
Aleman       10
Spanish      10
Frances      10
Italiano     10
Name: Idioma, dtype: int64

In [32]:
modelo.fit(datos,objetivo)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=4, p=2,
           weights='uniform')

In [33]:
y_pred = modelo.predict(datos)

In [34]:
pd.crosstab(objetivo, y_pred)

col_0,Aleman,Frances,Ingles,Italiano,Portugues,Spanish
Idioma,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
Aleman,10,0,0,0,0,0
Frances,0,10,0,0,0,0
Ingles,0,0,10,0,0,0
Italiano,0,0,0,10,0,0
Portugues,0,0,0,0,9,1
Spanish,0,0,0,0,0,10


## Nuevos datos

In [35]:
texto_nuevo = "Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum"

In [36]:
texto_nuevo

"Lorem Ipsum is simply dummy text of the printing and typesetting industry. Lorem Ipsum has been the industry's standard dummy text ever since the 1500s, when an unknown printer took a galley of type and scrambled it to make a type specimen book. It has survived not only five centuries, but also the leap into electronic typesetting, remaining essentially unchanged. It was popularised in the 1960s with the release of Letraset sheets containing Lorem Ipsum passages, and more recently with desktop publishing software like Aldus PageMaker including versions of Lorem Ipsum"

In [37]:
tn = modelo_tfidf.transform([texto_nuevo]).todense()

In [38]:
tn

matrix([[0.23799728, 0.04188011, 0.08206803, 0.13130884, 0.48420136,
         0.05343406, 0.09403683, 0.12467946, 0.3118585 , 0.        ,
         0.09385354, 0.18054966, 0.15592925, 0.3118585 , 0.20517007,
         0.1591444 , 0.        , 0.19696327, 0.32006531, 0.35289252,
         0.13951565, 0.04452838, 0.09855296, 0.03112164, 0.17429943,
         0.        ]])

In [39]:
knn.predict(tn)

array(['Ingles'], dtype=object)

In [40]:
texto_nuevo = "Ut et leo quis orci fringilla lobortis. Suspendisse vel vehicula enim, vel gravida mi. In ut elit finibus, sagittis velit quis, tempus justo. Quisque ultricies ornare tortor. Pellentesque congue libero est, vel laoreet nisl lobortis non. Morbi id elit velit. Cras varius id nunc ut viverra. Praesent dapibus ut nisi eu bibendum. Aliquam erat volutpat. Sed dignissim, nibh ac scelerisque posuere, justo leo ornare elit, ac varius orci massa dictum tellus."

In [41]:
tn = modelo_tfidf.transform([texto_nuevo]).todense()

knn.predict(tn)

array(['Frances'], dtype=object)

In [42]:
texto_latin = 'Hoc libello "cibus" linguae Latinae conditur et gratiorem saporem accipit. Tredecim capitibus ars Latine dicendi exercetur. Ita non solum Latine leges sed etiam loqueris. Appendice grammaticali PIPER et SAL tibi secreta sua patefacient. Certe etiam vocabularium tibi erit auxilio. Praeterea lectiones maxima ex parte auctoribus antiquis delibatae continentur, quibus vires Latinas tuas temptare poteris.'

In [43]:
tn = modelo_tfidf.transform([texto_latin]).todense()

knn.predict(tn)

array(['Italiano'], dtype=object)

In [44]:
txt = 'suben bajando'

In [45]:
tn = modelo_tfidf.transform([txt]).todense()

knn.predict(tn)

array(['Spanish'], dtype=object)