In [None]:
import string
from pathlib import Path

import pandas as pd

In [4]:
import kagglehub

# Download latest version
# `path` is the local folder kagglehub reports for the downloaded dataset files
path = kagglehub.dataset_download("basilb2s/language-detection")

print("Path to dataset files:", path)

  from .autonotebook import tqdm as notebook_tqdm


Downloading from https://www.kaggle.com/api/v1/datasets/download/basilb2s/language-detection?dataset_version_number=1...


100%|██████████| 542k/542k [00:00<00:00, 1.12MB/s]

Extracting files...
Path to dataset files: C:\Users\monae\.cache\kagglehub\datasets\basilb2s\language-detection\versions\1





In [7]:
#read the CSV from the folder kagglehub just downloaded (`path`) rather than
#from the current working directory, so the notebook also runs on a machine
#where the file has not been copied next to it by hand
df = pd.read_csv(Path(path) / 'Language Detection.csv')
df.head()

Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


In [8]:
#string.punctuation is a plain string holding the ASCII punctuation characters (not a regex)
string.punctuation

'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'

In [17]:
#translation table that deletes every ASCII punctuation character; built once and reused on every call
_PUNCT_TABLE = str.maketrans("", "", string.punctuation)


def punc_remove(text):
    """Return ``text`` lower-cased with all ASCII punctuation removed.

    Only the characters in ``string.punctuation`` are stripped; whitespace,
    digits and non-ASCII characters (accents, other scripts) are kept as-is.

    Parameters
    ----------
    text : str
        The raw sentence to clean.

    Returns
    -------
    str
        The cleaned, lower-cased sentence.
    """
    #a single pass over the string instead of one str.replace pass per punctuation character
    return text.translate(_PUNCT_TABLE).lower()

In [10]:
#check how str.replace behaves with an empty replacement: every "d" is deleted and nothing is put in its place
"my name is madame monae melodie carson florencia de valhosa".replace("d", "")

'my name is maame monae meloie carson florencia e valhosa'

In [18]:
#try the helper on a sentence with quotes, exclamation marks, a question mark, a comma, a full stop and capital letters
punc_remove("'My name is madame monae melodie carson florencia de valhosa!! What did we not understand about that?', she yelled at the top of her lungs.")

'my name is madame monae melodie carson florencia de valhosa what did we not understand about that she yelled at the top of her lungs'

In [19]:
#confirmed to work so can move on to the actual data
#replacing the text column with the cleaned and lower-cased data
df['Text'] = df['Text'].apply(punc_remove)
df.head()

Unnamed: 0,Text,Language
0,nature in the broadest sense is the natural p...,English
1,nature can refer to the phenomena of the physi...,English
2,the study of nature is a large if not the only...,English
3,although humans are part of nature human activ...,English
4,1 the word nature is borrowed from the old fre...,English


In [20]:
#divide data into test and train
from sklearn.model_selection import train_test_split

In [21]:
#select by column name rather than by position, so a change in the CSV layout
#(e.g. a saved index column) cannot silently swap in the wrong column
X = df['Text'] #the sentences (model input)
Y = df['Language'] #the language label of each sentence (target)

In [23]:
#hold out 20% of the rows for testing; fixing random_state makes the split
#(and therefore every score computed from it) repeatable across runs
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=.2, random_state=42)

In [24]:
X_train
#proof of random split: rows from several different languages are interleaved
#(note: Swahili is not one of the labels in this dataset)

4090    ce serait génial quil y ait un bouton rouge là...
8458    57 den 24 oktober 2005 publicerade brittiska t...
3661    toutefois les études ne concluent pas sur le f...
6479    ты как сидишь можешь сказать что мне все равно...
3513    ce projet est décrit par son cofondateur jimmy...
                              ...                        
7608                                              dio mio
5541                  no puedo agradecerles lo suficiente
3416    dans les mots et les choses michel foucault dé...
1381    narcisa changed her ways she struggled at firs...
4282    daar verwijst het eerder genoemde woord wiki o...
Name: Text, Length: 8269, dtype: object

In [25]:
#unigrams and bigrams, whatever the n-gram is, can provide information about the relationship between
#one or more tokens (words or characters) and the tokens around them;
#but of course some are far more frequent than others (term frequency, TF) without "adding" anything that helps differentiate
#there is also IDF, inverse document frequency, which down-weights terms that appear in many other documents

#one resolution is to just weigh them differently
#together, TF and IDF help determine how important a term is to a document

In [26]:
from sklearn import feature_extraction

In [27]:
vec = feature_extraction.text.TfidfVectorizer(ngram_range=(1,2), analyzer= 'char')
#1,2 is unigrams and bigrams; analyzer='char' means these are n-grams of characters, not of words

In [31]:
from sklearn import pipeline, linear_model

#chain the TF-IDF vectorizer ('vec') and a logistic-regression classifier ('clf')
#into a single estimator, so one fit/predict call runs both steps in order
model_pipe = pipeline.Pipeline(
    steps=[
        ('vec', vec),
        ('clf', linear_model.LogisticRegression()),
    ]
)

In [32]:
#fit the vectorizer and the classifier on the training sentences and their language labels
model_pipe.fit(X_train, Y_train)

In [33]:
#labels the classifier learned; 'Portugeese' and 'Sweedish' are spelled this way in the dataset's own labels
model_pipe.classes_

array(['Arabic', 'Danish', 'Dutch', 'English', 'French', 'German',
       'Greek', 'Hindi', 'Italian', 'Kannada', 'Malayalam', 'Portugeese',
       'Russian', 'Spanish', 'Sweedish', 'Tamil', 'Turkish'], dtype=object)

In [37]:
#predicted language for every held-out sentence
predict_val = model_pipe.predict(X_test)

In [40]:
from sklearn import metrics
#share of held-out sentences labelled correctly, expressed as a percentage
metrics.accuracy_score(Y_test, predict_val)*100

97.43713733075435

In [41]:
#can also look at the confusion matrix
#rows are the true languages and columns the predicted ones, so off-diagonal counts are mistakes
metrics.confusion_matrix(Y_test, predict_val)

array([[101,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,  81,   1,   2,   1,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   2,   0,   0],
       [  0,   2, 115,   2,   1,   1,   0,   0,   0,   0,   0,   1,   0,
          2,   0,   0,   0],
       [  0,   1,   1, 294,   2,   0,   0,   0,   2,   0,   0,   1,   0,
          0,   0,   0,   0],
       [  0,   0,   0,   2, 206,   1,   0,   0,   2,   0,   0,   0,   0,
          1,   0,   0,   2],
       [  0,   1,   0,   0,   0,  89,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,  67,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,  12,   0,   0,   0,   0,   0,
          0,   0,   0,   0],
       [  0,   0,   0,   0,   2,   0,   0,   0, 128,   0,   0,   0,   0,
          3,   0,   0,   0],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,  90,   0,   0,   0,
         

In [42]:
model_pipe.predict(["my name is madame monae melodie carson florencia de valhosa"])
#one where it predicts incorrectly (the sentence is English), probably due to the mix of languages but the name sounding very spanish

array(['Spanish'], dtype=object)

In [43]:
model_pipe.predict(["zina na mono limon"])
#that was kikongo, which is not one of the trained labels, so the model can only answer with a language it knows

array(['Italian'], dtype=object)

In [44]:
model_pipe.predict(["orukọ mi ni lẹmọọn"])
#that was yoruba, again a language the model has no label for

array(['Italian'], dtype=object)

In [45]:
model_pipe.predict(["שמי לימון"])
#hebrew, also absent from the trained labels

array(['Danish'], dtype=object)

In [46]:
model_pipe.predict(["私の名前はレモンです"])
#japanese, also absent from the trained labels

array(['Arabic'], dtype=object)

In [48]:
#ok, seriously, using languages I know it actually has, off the dome silly sentence
#clean the input with punc_remove first: the model was trained on lower-cased,
#punctuation-free text, so a raw comma or apostrophe is a character it never saw in training
model_pipe.predict([punc_remove("ce nom est limon, je vais arriver parce qu'il y a des emergencies")])

array(['French'], dtype=object)

In [49]:
import pickle

In [52]:
#save the fitted pipeline to disk; the with-block closes the file even if pickle.dump raises part-way through
with open('model.pkl', 'wb') as pickle_file:
    pickle.dump(model_pipe, pickle_file)

In [53]:
#check that pickling happened correctly
#(listing the folder only shows that model.pkl now exists, not that it loads back)
import os
os.listdir()

['.ipynb_checkpoints',
 'Basic Text Processing.ipynb',
 'Language Detection.csv',
 'model.pkl']