In [1]:
!pip install kaggle



In [2]:
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json

In [3]:
!kaggle datasets download -d 'basilb2s/language-detection'

Dataset URL: https://www.kaggle.com/datasets/basilb2s/language-detection
License(s): CC0-1.0
Downloading language-detection.zip to /content
  0% 0.00/542k [00:00<?, ?B/s]
100% 542k/542k [00:00<00:00, 587MB/s]


In [4]:
import zipfile

# Unpack the downloaded archive into /content. The `with` block closes the
# archive handle even when extraction raises, which the manual close() did not.
with zipfile.ZipFile('/content/language-detection.zip', 'r') as zip_ref:
    zip_ref.extractall('/content')

In [5]:
# 1. Import library
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report


In [6]:
# 2. Upload the CSV file — only when it is not already present at the path the
# read cell uses. An unconditional upload blocks "Run All" on a manual step, and
# a re-upload of an existing file is stored under a different name (see the
# "Language Detection (1).csv" message in the recorded output), which is never read.
import os
from google.colab import files

# `uploaded` stays a dict either way so later code can rely on its type.
uploaded = {} if os.path.exists('/content/Language Detection.csv') else files.upload()

Saving Language Detection.csv to Language Detection (1).csv


In [7]:
# 3. Read the dataset and show its first rows
df = pd.read_csv('/content/Language Detection.csv')
preview = df.head()
print("Data Awal:\n", preview)

Data Awal:
                                                 Text Language
0   Nature, in the broadest sense, is the natural...  English
1  "Nature" can refer to the phenomena of the phy...  English
2  The study of nature is a large, if not the onl...  English
3  Although humans are part of nature, human acti...  English
4  [1] The word nature is borrowed from the Old F...  English


In [8]:
# 4. Preprocessing (optional): add `text_clean`, a lower-cased copy of `Text`
df = df.assign(text_clean=df['Text'].str.lower())

In [9]:
# 5. Vectorisasi Teks
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(df['text_clean'])
y = df['Language']

In [10]:
# 6. Split data latih & uji
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)

In [11]:
# 7. Create the Naive Bayes classifier and train it on the training split
model = MultinomialNB()
model.fit(X=X_train, y=y_train)

In [12]:
# 8. Evaluate the model on the test split
y_pred = model.predict(X_test)
report = classification_report(y_test, y_pred)
print("\nHasil Evaluasi:\n")
print(report)


Hasil Evaluasi:

              precision    recall  f1-score   support

      Arabic       1.00      0.98      0.99       165
      Danish       0.98      0.95      0.97       123
       Dutch       0.99      0.97      0.98       175
     English       0.90      1.00      0.95       435
      French       0.98      0.99      0.99       309
      German       1.00      0.97      0.98       135
       Greek       1.00      0.98      0.99       102
       Hindi       1.00      1.00      1.00        20
     Italian       0.99      0.99      0.99       217
     Kannada       1.00      0.99      1.00       101
   Malayalam       1.00      0.98      0.99       184
  Portugeese       0.99      0.98      0.98       205
     Russian       1.00      0.98      0.99       200
     Spanish       0.99      0.98      0.98       246
    Sweedish       0.98      0.97      0.98       193
       Tamil       1.00      0.98      0.99       145
     Turkish       1.00      0.93      0.96       147

    accu

In [13]:
# 9. Predict the language of a new sentence
def prediksi_bahasa(kalimat):
    """Print the predicted language label for a single sentence.

    The sentence is lower-cased, converted to a one-row feature matrix with the
    notebook-level `vectorizer`, and classified with the notebook-level `model`.
    The prediction is printed; nothing is returned.

    Args:
        kalimat: The sentence (a string) to classify.
    """
    # transform() expects an iterable of documents, hence the one-element list.
    fitur = vectorizer.transform([kalimat.lower()])
    # predict() returns one label per row; there is exactly one row here.
    hasil = model.predict(fitur)[0]
    print(f"Kalimat: '{kalimat}' → Bahasa: {hasil}")

In [14]:
# 10. Example predictions
# NOTE(review): in the recorded output the last sentence (German) is labelled
# 'Sweedish' — presumably very short inputs give the word-count model too little
# evidence; confirm before relying on single-sentence predictions.
contoh_kalimat = [
    "Ho molta fame",
    "Where are you going?",
    "Je ne comprends pas",
    "إلى أين أنت ذاهب",
    "Hvor skal du hen",
    "Wohin gehst du?",
]
for kalimat in contoh_kalimat:
    prediksi_bahasa(kalimat)

Kalimat: 'Ho molta fame' → Bahasa: Italian
Kalimat: 'Where are you going?' → Bahasa: English
Kalimat: 'Je ne comprends pas' → Bahasa: French
Kalimat: 'إلى أين أنت ذاهب' → Bahasa: Arabic
Kalimat: 'Hvor skal du hen' → Bahasa: Danish
Kalimat: 'Wohin gehst du?' → Bahasa: Sweedish
