In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report
from sklearn.feature_extraction.text import TfidfVectorizer
from imblearn.over_sampling import SMOTE
from sklearn.ensemble import RandomForestClassifier
from imblearn.under_sampling import RandomUnderSampler
from collections import Counter
from sklearn.metrics import classification_report, roc_auc_score, confusion_matrix
# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the read-only Kaggle input tree and print the full path of every file found.
input_file_paths = (
    os.path.join(folder, name)
    for folder, _subfolders, names in os.walk('/kaggle/input')
    for name in names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

<div style="
    background-color: #fffbea;
    border: 2px solid #d4a373;
    border-radius: 10px;
    padding: 20px;
    box-shadow: 3px 3px 10px rgba(0,0,0,0.1);
    font-family: 'Courier New', monospace;
    line-height: 1.6;
    max-width: 800px;
    margin: auto;">
    <p style="border-left: 3px solid #d4a373; padding-left: 10px;">
        "Every two miles, the water changes; every four miles, the language." (Translation of a Hindi proverb: "Kos kos par badle paani, chaar kos par baani.")
— Indian Proverb (Signifying the immense linguistic diversity of India.)
    </p>
</div>

In [None]:
# Load the three ready-made splits (train / validation / test) of the
# language-identification dataset from the Kaggle input directory.
train_df, val_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in (
        "/kaggle/input/indian-language-identification/language_identification_dataset/train.csv",
        "/kaggle/input/indian-language-identification/language_identification_dataset/validation.csv",
        "/kaggle/input/indian-language-identification/language_identification_dataset/test.csv",
    )
)

<div style="
    width: 100%;
    max-width: 900px;
    margin: 20px auto;
    padding: 15px;
    border-radius: 10px;
    background: #ffffff;
    color: #333;
    font-family: 'Arial', sans-serif;
    font-size: 20px;
    font-weight: bold;
    text-align: center;
    text-transform: uppercase;
    letter-spacing: 2px;
    border: 2px solid #ddd;
    box-shadow: 0px 4px 10px rgba(0, 0, 0, 0.1);
">
    text preprocessing
</div>

In [None]:
# Remove every row that has a missing value in any column. The frames are
# modified in place, so the existing names keep pointing at the cleaned data.
for split_frame in (train_df, val_df, test_df):
    split_frame.dropna(inplace=True)

<div style="width: 100%; max-width: 1000px; margin: 20px auto; padding: 10px; border-radius: 30px; background: linear-gradient(90deg, #007bff, #00c6ff); box-shadow: 0px 4px 8px rgba(0, 0, 0, 0.3); font-family: Arial, sans-serif; text-align: center; color: white; font-size: 16px; font-weight: bold; transform: skew(-10deg);">
    Let's look at each language's share of the complete training data (in percent)
</div>

In [None]:
# Relative frequency of each language in the training split, scaled to percent.
language_share = train_df["Language"].value_counts(normalize=True)
language_distribution = language_share.mul(100)
print(language_distribution)

In [None]:
# Bar chart of how many training rows each language has (before any balancing).
language_axis = sns.countplot(x=train_df['Language'])
language_axis.set_title("Class Distribution Before Balancing")
# Language names are long; rotate them so they do not overlap.
plt.xticks(rotation=90)
plt.show()

<div style="
    background-color: #ffffff;
    border-radius: 12px;
    padding: 25px;
    box-shadow: 0px 5px 15px rgba(0, 0, 0, 0.1);
    font-family: 'Inter', sans-serif;
    max-width: 750px;
    margin: auto;">
    <h2 style="color: #333; text-align: center;">Dataset Inspection</h2>
    <p>Finding: Dataset is Unbalanced</p>
    <div style="
        background-color: #f7f7f7;
        padding: 15px;
        border-left: 4px solid #007bff;">
        <strong>Tip:</strong> We use SMOTE to resample (oversample) the minority classes for better accuracy
    </div>
</div>

<div style="width: 100%; max-width: 600px; margin: 20px auto; padding: 15px; border-radius: 15px; background: radial-gradient(circle, #f0fff4, #c6f6d5); box-shadow: 0px 4px 10px rgba(0, 0, 0, 0.3); font-family: Arial, sans-serif; text-align: center; animation: fadein 2s;">
    <div style="font-size: 18px; font-weight: bold; color: #008000; margin-bottom: 10px;">
        Comparison Summary
    </div>
    <div style="font-size: 14px; color: #333; line-height: 1.5;">
        <p>📌 ML Classification Using MultinomialNB with Non-Resampled Data</p>
        <p>📌 ML Classification Using RandomForestClassifier with Resampled Data</p>
    </div>
</div>

<style>
@keyframes fadein {
    from { opacity: 0; transform: scale(0.95); }
    to { opacity: 1; transform: scale(1); }
}
</style>

<div style="width: 100%; max-width: 1000px; margin: 20px auto; padding: 10px; border-radius: 10px; background: linear-gradient(90deg, #ff4d4d, #ff9999); box-shadow: 0px 4px 8px rgba(0, 0, 0, 0.3); font-family: Arial, sans-serif; text-align: center; color: white; font-size: 16px; font-weight: bold; text-transform: uppercase;">
    TF-IDF for converting text to numeric
</div>

In [None]:
# Turn headlines into TF-IDF vectors, keeping at most 5000 vocabulary terms.
# The vocabulary and IDF weights are learned from the training headlines only;
# validation and test headlines are projected into that same feature space.
# NOTE(review): the default analyzer is a regex word tokenizer; it may split
# Indic-script words at combining vowel signs. For language identification,
# character n-grams (analyzer="char_wb") are worth evaluating — TODO confirm.
vectorizer = TfidfVectorizer(max_features=5000)
X_train = vectorizer.fit_transform(train_df['Headline'])
X_val, X_test = (
    vectorizer.transform(split_frame['Headline'])
    for split_frame in (val_df, test_df)
)

In [None]:
# Target labels (the language name) for each split, in the same row order as the features.
y_train, y_val, y_test = (
    split_frame['Language'] for split_frame in (train_df, val_df, test_df)
)

<div style="width: 100%; max-width: 1000px; margin: 20px auto; padding: 10px; border-radius: 50px; background: radial-gradient(circle, #4CAF50, #81C784); box-shadow: inset 0px 2px 10px rgba(0, 0, 0, 0.2); font-family: Arial, sans-serif; text-align: center; color: white; font-size: 16px; font-weight: bold; letter-spacing: 1px;">
    Training MultinomialNB Model | Validate Model | Test the Model for New Data
</div>

In [None]:
# Fit a multinomial Naive Bayes classifier on the TF-IDF training features.
# fit() returns the estimator itself, so the bare name on the last line
# displays the fitted estimator as the cell output, as before.
model = MultinomialNB().fit(X_train, y_train)
model

In [None]:
# Score the Naive Bayes model on the validation split: overall accuracy first,
# then the per-class precision / recall / F1 table.
val_preds = model.predict(X_val)
val_accuracy = accuracy_score(y_val, val_preds)
print("Validation Accuracy:", val_accuracy)
val_report = classification_report(y_val, val_preds)
print(val_report)

In [None]:
# Same evaluation as for validation, this time on the held-out test split.
test_preds = model.predict(X_test)
test_accuracy = accuracy_score(y_test, test_preds)
print("Test Accuracy:", test_accuracy)
test_report = classification_report(y_test, test_preds)
print(test_report)

In [None]:
def predict_language_with_normal(text):
    """Predict the language of a single text.

    Uses the notebook-level ``vectorizer`` to featurise the text and the
    notebook-level ``model`` to classify it.

    Args:
        text: One raw string to classify.

    Returns:
        The first (and only) element of ``model.predict`` for that string.
    """
    # transform() expects an iterable of documents, hence the one-item list.
    features = vectorizer.transform([text])
    return model.predict(features)[0]
# Quick smoke test of the helper on one Bengali-script sentence.
input_text = "নমস্কার, তুমি কেমন আছো?"
predicted_lang = predict_language_with_normal(text=input_text)
print(f"Predicted Language: {predicted_lang}")

In [None]:
# Expected label for each probe sentence below. The two lists are parallel:
# the label at position i belongs to the sentence at position i.
language_labels = [
    "Odia", "Nepali", "Hindi", "Assamese", "Sanskrit", "Malayalam",
    "Konkani", "English", "Kannada", "Telugu", "Marathi", "Gujarati",
    "Urdu", "Sindhi", "Punjabi", "Bengali", "Kashmiri", "Tamil"
]
# One short probe sentence per label, in the same order as `language_labels`.
# NOTE(review): the "Kashmiri" entry (second to last) is exactly the same string
# as the "Urdu" entry, so at most one of the two can ever be marked correct.
# Replace it with a genuine Kashmiri sentence.
# NOTE(review): the "Assamese" entry (4th) looks like standard Bengali rather
# than Assamese — TODO confirm with a native speaker.
language_sentences = [
    "ଏହି ଜଗତ ଅତି ସୁନ୍ଦର |",  
    "यो संसार धेरै सुन्दर छ।",  
    "यह दुनिया बहुत सुंदर है।",  
    "এই বিশ্ব খুব সুন্দর।",    
    "संसारः अतीव सुन्दरः अस्ति।",    
    "ഈ ലോകം അതിവളരെയോ മനോഹരമാണ്.",  
    "हें जग सुंदर आसा।",  
    "This world is very beautiful.",   
    "ಈ ಜಗತ್ತು ತುಂಬಾ ಸುಂದರವಾಗಿದೆ.",  
    "ఈ ప్రపంచం చాలా అందంగా ఉంది.", 
    "ही दुनिया खूप सुंदर आहे.",  
    "આ દુનિયા ખૂબ સુંદર છે.",    
    "یہ دنیا بہت خوبصورت ہے۔",  
    "اها دنيا تمام سهڻي آهي.",  
    "ਇਹ ਦੁਨਿਆ ਬਹੁਤ ਸੋਹਣੀ ਹੈ।",  
    "এই পৃথিবী খুব সুন্দর।",  
    "یہ دنیا بہت خوبصورت ہے۔", 
    "இந்த உலகம் மிகவும் அழகானது.", 
]

In [None]:
# Run every probe sentence through the Naive Bayes helper and report whether
# the prediction matches the expected label.
# The two lists are parallel; fail loudly if they ever drift out of sync
# instead of silently pairing a sentence with the wrong label.
assert len(language_sentences) == len(language_labels), "sentences and labels must have the same length"

for sentence, actual_label in zip(language_sentences, language_labels):
    predicted_lang = predict_language_with_normal(sentence)
    status = "✅ Correct" if predicted_lang == actual_label else "❌ Wrong"
    print(f"Sentence: {sentence}")
    print(f"Actual Language: {actual_label} | Predicted Language: {predicted_lang} | Status: {status}")
    print("-" * 80)

<div style="
    background-color: #fffbea;
    border: 2px solid #d4a373;
    border-radius: 10px;
    padding: 20px;
    box-shadow: 3px 3px 10px rgba(0,0,0,0.1);
    font-family: 'Courier New', monospace;
    line-height: 1.6;
    max-width: 800px;
    margin: auto;">
    <h2 style="text-align: center; color: #d4a373;">FINDING</h2>
    <p style="border-left: 3px solid #d4a373; padding-left: 10px;">
        Model trained on non-resampled (unbalanced) data with MultinomialNB
    </p>
    <ul>
        <li>1. Accuracy Score: 94</li>
        <li>2. Predicted Wrong: Nepali, Assamese, Sanskrit, Kannada, Marathi, Kashmiri </li>
    </ul>
</div>

In [None]:
# Rebuild the TF-IDF features for the resampling experiment.
# Any missing headline is replaced by an empty string before vectorising, and
# the `vectorizer` name is rebound to this freshly fitted instance.
headlines_filled = train_df["Headline"].fillna("")
train_df["Headline"] = headlines_filled
vectorizer = TfidfVectorizer(max_features=5000)
X_tfidf = vectorizer.fit_transform(train_df["Headline"])
y = train_df["Language"]

<div style="
    width: 100%;
    max-width: 900px;
    margin: 20px auto;
    padding: 25px;
    border-radius: 15px;
    background: #ffffff;
    color: #222;
    font-family: 'Georgia', serif;
    font-size: 22px;
    font-weight: bold;
    text-align: center;
    text-transform: uppercase;
    letter-spacing: 1px;
    border: 1px solid #ccc;
    box-shadow: 2px 2px 8px rgba(0, 0, 0, 0.1);
">
    Apply smote
</div>

In [None]:
# Oversample with SMOTE (seeded for reproducibility), then print the per-class
# counts of the resampled labels to check the result.
smote = SMOTE(random_state=42, sampling_strategy='auto')
resampled_pair = smote.fit_resample(X_tfidf, y)
X_resampled, y_resampled = resampled_pair
resampled_counts = Counter(y_resampled)
print(resampled_counts)

In [None]:
# Train on the full SMOTE-balanced training set and evaluate on the original,
# untouched test split.
# Splitting *after* SMOTE leaks information: synthetic rows interpolated from
# samples that end up in the "test" part are trained on, and the model is
# scored on synthetic data instead of real headlines, which inflates metrics.
# Using test_df also makes the scores comparable with the MultinomialNB run.
X_train, y_train = X_resampled, y_resampled
# Project the real test headlines into the feature space of the vectorizer
# that produced X_tfidf (the one currently bound to `vectorizer`).
X_test = vectorizer.transform(test_df["Headline"])
y_test = test_df["Language"]

<div style="width: 100%; max-width: 1000px; margin: 20px auto; padding: 10px; border-radius: 50px; background: radial-gradient(circle, #4CAF50, #81C784); box-shadow: inset 0px 2px 10px rgba(0, 0, 0, 0.2); font-family: Arial, sans-serif; text-align: center; color: white; font-size: 16px; font-weight: bold; letter-spacing: 1px;">
    Training Random Forest Model | Validate Model | Test the Model for New Data
</div>

In [None]:
# Random forest with 100 trees and balanced class weights, seeded so the run
# is repeatable. fit() returns the estimator itself, so `clf` is the fitted model.
forest = RandomForestClassifier(random_state=42, class_weight="balanced", n_estimators=100)
clf = forest.fit(X_train, y_train)
y_pred = clf.predict(X_test)

In [None]:
# Per-class precision / recall / F1 for the random forest predictions.
report_text = classification_report(y_test, y_pred)
print("Classification Report:\n", report_text)
# One-vs-rest ROC-AUC computed from the predicted class probabilities.
class_probabilities = clf.predict_proba(X_test)
roc_auc = roc_auc_score(y_test, class_probabilities, multi_class='ovr')
print("ROC-AUC Score:", roc_auc)

In [None]:
# Confusion matrix of the random forest predictions.
# Pin the label order once so matrix rows/columns and tick labels always agree;
# without tick labels the cells cannot be mapped back to languages.
class_names = clf.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_names)

# A larger canvas keeps the per-cell counts legible with this many classes.
fig, ax = plt.subplots(figsize=(12, 10))
sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    xticklabels=class_names,
    yticklabels=class_names,
    ax=ax,
)
ax.set_xlabel("Predicted language")
ax.set_ylabel("Actual language")
ax.set_title("Confusion Matrix")
plt.show()

In [None]:
def predict_language_with_sample(text):
    """Predict the language of a single text with the random forest.

    Uses the notebook-level ``vectorizer`` to featurise the text and the
    notebook-level ``clf`` to classify it.

    Args:
        text: One raw string to classify.

    Returns:
        The first (and only) element of ``clf.predict`` for that string.
    """
    # transform() expects an iterable of documents, hence the one-item list.
    features = vectorizer.transform([text])
    return clf.predict(features)[0]
# Quick smoke test of the random forest helper on one Bengali-script sentence.
input_text = "নমস্কার, তুমি কেমন আছো?"
predicted_lang = predict_language_with_sample(text=input_text)
print(f"Predicted Language: {predicted_lang}")

In [None]:
# Run every probe sentence through the random forest helper and report whether
# the prediction matches the expected label.
# The two lists are parallel; fail loudly if they ever drift out of sync
# instead of silently pairing a sentence with the wrong label.
assert len(language_sentences) == len(language_labels), "sentences and labels must have the same length"

for sentence, actual_label in zip(language_sentences, language_labels):
    predicted_lang = predict_language_with_sample(sentence)
    status = "✅ Correct" if predicted_lang == actual_label else "❌ Wrong"
    print(f"Sentence: {sentence}")
    print(f"Actual Language: {actual_label} | Predicted Language: {predicted_lang} | Status: {status}")
    print("-" * 80)

<div style="
    background-color: #fffbea;
    border: 2px solid #d4a373;
    border-radius: 10px;
    padding: 20px;
    box-shadow: 3px 3px 10px rgba(0,0,0,0.1);
    font-family: 'Courier New', monospace;
    line-height: 1.6;
    max-width: 800px;
    margin: auto;">
    <h2 style="text-align: center; color: #d4a373;">FINDING</h2>
    <p style="border-left: 3px solid #d4a373; padding-left: 10px;">
        Model trained on resampled (balanced) data with RandomForestClassifier
    </p>
    <ul>
        <li>1. Accuracy Score: 95 </li>
        <li>2. Predicted Wrong: Nepali, Assamese, Sanskrit, Kannada, Gujarati, Urdu, Konkani, Hindi </li>
    </ul>
</div>