In [1]:
%%bash
# Fetch the language-detection corpus from Kaggle and unpack it into the
# working directory. The API token is read from a kaggle.json stored on the
# mounted Drive, so no credential is embedded in the notebook.

# Abort on the first failing command instead of carrying on with a missing
# token or a partial download.
set -euo pipefail

pip install -q kaggle
mkdir -p ~/.kaggle
cp drive/MyDrive/kaggle.json ~/.kaggle/
# Restrict the token to the current user; use the same ~-relative path as the
# copy above so the cell does not silently depend on running as root.
chmod 600 ~/.kaggle/kaggle.json
kaggle datasets download -d chazzer/big-language-detection-dataset
# -o overwrites without prompting, so re-running the cell is non-interactive.
unzip -o big-language-detection-dataset.zip

Downloading big-language-detection-dataset.zip to /content

Archive:  big-language-detection-dataset.zip
  inflating: lan_to_language.json    
  inflating: sentences.csv           


  0%|          | 0.00/208M [00:00<?, ?B/s]  2%|▏         | 5.00M/208M [00:00<00:05, 41.5MB/s]  4%|▍         | 9.00M/208M [00:00<00:05, 38.0MB/s] 16%|█▌        | 33.0M/208M [00:00<00:03, 54.6MB/s] 27%|██▋       | 57.0M/208M [00:01<00:02, 63.3MB/s] 35%|███▌      | 73.0M/208M [00:01<00:01, 80.0MB/s] 43%|████▎     | 89.0M/208M [00:01<00:01, 69.1MB/s] 47%|████▋     | 99.0M/208M [00:01<00:01, 74.8MB/s] 58%|█████▊    | 121M/208M [00:01<00:01, 66.9MB/s]  66%|██████▌   | 137M/208M [00:02<00:00, 78.6MB/s] 73%|███████▎  | 153M/208M [00:02<00:00, 73.1MB/s] 85%|████████▍ | 177M/208M [00:02<00:00, 96.3MB/s] 91%|█████████ | 189M/208M [00:02<00:00, 91.1MB/s]100%|██████████| 208M/208M [00:02<00:00, 81.1MB/s]


# Big Language Detection

## Importing required libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.naive_bayes import BernoulliNB

## Getting dataset

In [2]:
# Read the sentence corpus extracted from the Kaggle archive and preview
# the first rows.
SENTENCES_CSV = "sentences.csv"
df = pd.read_csv(SENTENCES_CSV)
df.head(n=5)

Unnamed: 0,id,lan_code,sentence
0,1,cmn,我們試試看！
1,2,cmn,我该去睡觉了。
2,3,cmn,你在干什麼啊？
3,4,cmn,這是什麼啊？
4,5,cmn,今天是６月１８号，也是Muiriel的生日！


In [3]:
import json

# Load the mapping from language code to language name.
# The context manager closes the file handle as soon as parsing finishes
# (the original left it open for the lifetime of the kernel), and the
# explicit encoding keeps non-ASCII language names independent of the
# platform's default locale.
with open("lan_to_language.json", encoding="utf-8") as f:
    data = json.load(f)

In [4]:
# Spot-check one entry of the code -> language-name mapping.
data["syc"]

'Classical Syriac'

## Exploring the dataset

In [5]:
# Number of sentences (rows) in the corpus.
df.shape[0]

10341812

In [6]:
# Print the frame's index range, column names, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10341812 entries, 0 to 10341811
Data columns (total 3 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   id        int64 
 1   lan_code  object
 2   sentence  object
dtypes: int64(1), object(2)
memory usage: 236.7+ MB


In [7]:
# The numeric `id` column carries no signal for language detection, so drop it.
# Rebinding `df` (rather than mutating in place) together with
# errors="ignore" makes this cell safe to re-run: the original in-place drop
# raised KeyError on a second execution because `id` was already gone.
df = df.drop(columns="id", errors="ignore")

## Cleaning dataset

In [8]:
# Count missing values per column.
df.isnull().sum()

lan_code    0
sentence    0
dtype: int64

## Preprocessing the dataset

In [9]:
# Encode the language codes as integer class ids and hold out 20% of the
# sentences for evaluation.
#
# `le` is kept so predictions can be mapped back to language codes with
# `le.inverse_transform`.
RANDOM_STATE = 42  # fixed seed: without it every run produces a different split

le = LabelEncoder()

X = df["sentence"]
y = le.fit_transform(df["lan_code"])

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=RANDOM_STATE
)


In [13]:
# Number of distinct language codes seen by the label encoder.
le.classes_.shape[0]

404

In [10]:
from sklearn.feature_extraction.text import CountVectorizer

# Turn the raw sentences into token-count matrices. The vocabulary is learned
# from the training sentences only and then reused for the test sentences.
# NOTE: X_train / X_test are rebound from text to count matrices here, so this
# cell must run exactly once after the split cell.
cv = CountVectorizer()

train_counts = cv.fit_transform(X_train)
test_counts = cv.transform(X_test)

X_train, X_test = train_counts, test_counts


## Training model

In [11]:
!pip install xgboost==1.7.1

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting xgboost==1.7.1
  Downloading xgboost-1.7.1-py3-none-manylinux2014_x86_64.whl (193.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 MB[0m [31m7.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: xgboost
  Attempting uninstall: xgboost
    Found existing installation: xgboost 0.90
    Uninstalling xgboost-0.90:
      Successfully uninstalled xgboost-0.90
Successfully installed xgboost-1.7.1


In [None]:
from sklearn.linear_model import LogisticRegression

# Fit a logistic-regression classifier, with scikit-learn's default settings,
# on the token-count features; `fit` returns the estimator itself.
model = LogisticRegression().fit(X_train, y_train)
model