In [15]:
pip install pandas numpy scikit-learn nltk

Note: you may need to restart the kernel to use updated packages.


In [16]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
import nltk

In [17]:
# Load the dataset (adjust the file path if needed)
df = pd.read_csv('langdetect.csv')

# Inspect the first few rows
print("First 5 rows of the dataset:")
print(df.head())

# Check language distribution
print("\nLanguage distribution:")
print(df['Language'].value_counts())

# Check for missing values
print("\nMissing values:")
print(df.isnull().sum())

First 5 rows of the dataset:
                                                Text Language
0   Nature, in the broadest sense, is the natural...  English
1  "Nature" can refer to the phenomena of the phy...  English
2  The study of nature is a large, if not the onl...  English
3  Although humans are part of nature, human acti...  English
4  [1] The word nature is borrowed from the Old F...  English

Language distribution:
Language
English       1385
French        1014
Spanish        819
Portugeese     739
Italian        698
Russian        692
Sweedish       676
Malayalam      594
Dutch          546
Arabic         536
Turkish        474
German         470
Tamil          469
Danish         428
Kannada        369
Greek          365
Hindi           63
Name: count, dtype: int64

Missing values:
Text        0
Language    0
dtype: int64


In [19]:
def preprocess_text(text, language='English'):
    # List of Latin-based languages where lowercasing applies
    latin_languages = ['English', 'French', 'Spanish', 'Portugeese', 'Italian', 
                       'Sweedish', 'Dutch', 'German', 'Danish']
    # Lowercase only for Latin-based languages
    if language in latin_languages:
        text = text.lower()
    # Remove numbers
    text = re.sub(r'[0-9]+', '', text)
    # Remove punctuation
    text = re.sub(r'[^\w\s]', '', text)
    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()
    return text

In [20]:
# Apply preprocessing to the 'Text' column
df['Text'] = df.apply(lambda row: preprocess_text(row['Text'], row['Language']), axis=1)

# Drop any rows with missing values (if any)
df = df.dropna()

# Verify preprocessing by checking a few samples
print("\nPreprocessed samples:")
for lang in df['Language'].unique():
    sample = df[df['Language'] == lang]['Text'].iloc[0]
    print(f"Language: {lang}, Sample: {sample}")


Preprocessed samples:
Language: English, Sample: nature in the broadest sense is the natural physical material world or universe
Language: Malayalam, Sample: ഭതകപരപഞചതത മതതതതൽ സചപപകകനന പദമണ പരകത ജർമൻ Natur ഫരഞച ഇഗലഷ Nature സപനഷ Naturaleza പർചചഗസ Natureza
Language: Hindi, Sample: वकशबदकष एक मकत शबदकष एव समनतर कष वकतब मफत कतब और उपयग समगर वककवट वभनन सभषत क सकलन वकसरत मकत सतरत समगर कमस वकमडय परकलप क मडय फइल भडर वकसमचर मकत समचर यगदनकरतओ क लए यह लख इटरनट इणटरनट वशवकश क बर म बतलत ह क वकपडय क मखय पषठ क लए वकपडय क मखय पषठ दख वकपडय क आगतक परचय क लए वकपडय क बर म पषठ दख वकपडय एक मफत वब आधरत और सहयग बहभष वशवकश ह ज गरलभ वकमडय फउनडशन स सहयग परपत परयजन म उतपनन हआ इसक नम द शबद वक wiki यह सहयग वबसइट क नरमण क एक तकनक ह यह एक हवई शबद वक ह जसक अरथ ह जलद और एनसइकलपडय encyclopedia क सयजन ह दनय भर म सवयसवक क दवर सहयग स वकपडय क करड लख लख अगरज वकपडय म लख गए ह और इसक लगभग सभ लख क वह कई भ वयकत सपदत कर सकत ह ज वकपडय वबसईट क उपयग कर सकत ह इस जनवर म जमम वलस और लर सगर क दवर पररभ पररमभ कय गय यह वरतमन म इटरनट इणटरनट 

In [21]:
# Initialize TF-IDF Vectorizer
tfidf = TfidfVectorizer(max_features=5000)  # Limit to 5000 features for efficiency

# Fit and transform the 'Text' column
X = tfidf.fit_transform(df['Text']).toarray()
y = df['Language']

# Check the shape of the feature matrix
print("\nFeature matrix shape:", X.shape)  # Should be (10337, 5000) based on your data


Feature matrix shape: (10337, 5000)


In [22]:
import joblib
joblib.dump(tfidf, 'tfidf_vectorizer.pkl')

['tfidf_vectorizer.pkl']

In [23]:
# Step 5: Split the data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Training set size:", X_train.shape)
print("Testing set size:", X_test.shape)

Training set size: (8269, 5000)
Testing set size: (2068, 5000)


In [24]:
# Step 6: Train the model
model = MultinomialNB()
model.fit(X_train, y_train)
print("\nModel training completed.")


Model training completed.


In [25]:
# Step 7: Evaluate the model
y_pred = model.predict(X_test)
print("\nModel Evaluation:")
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))


Model Evaluation:
Accuracy: 0.9545454545454546
Classification Report:
               precision    recall  f1-score   support

      Arabic       1.00      0.93      0.97       106
      Danish       1.00      0.92      0.96        73
       Dutch       0.98      0.95      0.97       111
     English       0.80      1.00      0.89       291
      French       0.97      0.98      0.98       219
      German       1.00      0.95      0.97        93
       Greek       1.00      0.93      0.96        68
       Hindi       1.00      1.00      1.00        10
     Italian       1.00      0.97      0.98       145
     Kannada       1.00      0.98      0.99        66
   Malayalam       1.00      0.89      0.94       121
  Portugeese       1.00      0.93      0.96       144
     Russian       1.00      0.94      0.97       136
     Spanish       0.96      0.97      0.97       160
    Sweedish       0.96      0.98      0.97       133
       Tamil       1.00      0.97      0.98        87
     Turk

In [26]:
# Step 8: Define prediction function
def predict_language(text):
    processed_text = preprocess_text(text, 'English')  # Default to English for new input
    text_vector = tfidf.transform([processed_text]).toarray()
    prediction = model.predict(text_vector)
    return prediction[0]

In [27]:
# Step 9: Test the model
print("\nPredictions:")
test_texts = [
    "hello how are you",           # English
    "നിനക്ക് സുഖമാണോ",            # Malayalam: "Are you well?"
    "привет как дела",             # Russian: "Hi, how are you?"
    "hola cómo estás",             # Spanish: "Hello, how are you?"
    "ನಮಸ್ಕಾರ ನೀವು ಹೇಗಿದ್ದೀರಿ"     # Kannada: "Hello, how are you?"
]
for text in test_texts:
    print(f"Text: '{text}' -> Predicted Language: {predict_language(text)}")


Predictions:
Text: 'hello how are you' -> Predicted Language: English
Text: 'നിനക്ക് സുഖമാണോ' -> Predicted Language: English
Text: 'привет как дела' -> Predicted Language: Russian
Text: 'hola cómo estás' -> Predicted Language: Spanish
Text: 'ನಮಸ್ಕಾರ ನೀವು ಹೇಗಿದ್ದೀರಿ' -> Predicted Language: Kannada


In [29]:
# Step 10: Save the model and vectorizer
joblib.dump(model, 'language_model.pkl')
print("\nModel saved.")


Model saved.


In [30]:
import sklearn
print(sklearn.__version__)  # Should output 1.3.2 based on your logs

1.5.1
