In [1]:
# Import necessary libraries
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

In [3]:
# Location of the labelled corpus; update this with your dataset path
file_path = '/content/dataset.csv'
# Read the comma-separated, UTF-8 encoded file into a DataFrame
languages = pd.read_csv(file_path, sep=',', encoding='utf-8')

In [None]:
# Preview the first 30 rows of the languages DataFrame
languages.head(n=30)

Unnamed: 0,Text,language
0,klement gottwaldi surnukeha palsameeriti ning ...,Estonian
1,sebes joseph pereira thomas p eng the jesuits...,Swedish
2,thanon charoen krung,Thai
3,,Tamil
4,de spons behoort tot het geslacht haliclona en...,Dutch
5,,Japanese
6,tsutinalar ingilizce tsuutina kanadada alberta...,Turkish
7,mller mox figura centralis circulorum doctorum...,Latin
8,electric charge conserved ...,Urdu
9,rt,Japanese


In [4]:
# Trim leading/trailing whitespace from every column header
languages.columns = languages.columns.map(str.strip)

In [5]:
# Normalise the text column: lowercase it, then drop every character that is
# not an ASCII letter a-z or whitespace.
# NOTE: this also removes all non-Latin characters, so texts written in other
# scripts are reduced to whitespace only (visible in the sample predictions
# further down) and carry no signal for the classifiers.
languages['Text'] = (
    languages['Text']
    # Missing texts become empty strings; CountVectorizer rejects NaN documents
    .fillna('')
    .str.lower()
    # Raw string so that \s reaches the regex engine without relying on
    # Python's handling of an invalid string escape
    .str.replace(r'[^a-z\s]', '', regex=True)
)

In [6]:
# Step 2: Build n-gram count features from the cleaned text
# Target labels: the language name of each row
y = languages['language']

# Count every unigram, bigram and trigram (ngram_range 1 to 3)
vectorizer = CountVectorizer(ngram_range=(1, 3))
# Learn the vocabulary and produce the document-term matrix in one pass
X = vectorizer.fit_transform(languages['Text'])

In [7]:
# Step 3: Hold out 20% of the rows for evaluation (fixed seed for repeatability)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [8]:
# Step 4: Fit a Multinomial Naive Bayes classifier on the training split
nb_classifier = MultinomialNB().fit(X_train, y_train)
# Echo the fitted estimator as the cell output
nb_classifier

In [9]:
# Step 5: Predict on the test data with the Naive Bayes classifier
y_pred = nb_classifier.predict(X_test)

# Keep a dedicated reference and score it here: `y_pred` is reassigned by the
# SVM cell below, so without this the Naive Bayes model (the one that is later
# pickled and used by predict_language) would never be evaluated.
y_pred_nb = y_pred
nb_accuracy = accuracy_score(y_test, y_pred_nb)
print(f"Naive Bayes Accuracy using n-grams: {nb_accuracy * 100:.2f}%")

In [10]:
# Fit a linear-kernel support vector classifier on the same n-gram features
svm_classifier = SVC(kernel='linear').fit(X_train, y_train)

# Score the held-out split with the SVM; this rebinds y_pred
y_pred = svm_classifier.predict(X_test)


In [11]:
# Step 6: Fraction of test rows whose predicted label matches the true label
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
# Report as a percentage with two decimals
print(f"SVM Accuracy using n-grams: {accuracy * 100:.2f}%")

SVM Accuracy using n-grams: 53.86%


In [12]:
# Step 7: Display sample predictions (Optional)
# Take up to the first five rows; head() does not fail on a shorter frame.
sample_rows = languages.head(5)

# Vectorise and classify all sample texts in one batch instead of once per row.
sample_predictions = svm_classifier.predict(
    vectorizer.transform(sample_rows['Text'])
)

for sample_text, actual_language, predicted_language in zip(
    sample_rows['Text'], sample_rows['language'], sample_predictions
):
    print(f"Text: {sample_text}")
    print(f"Predicted Language: {predicted_language}")
    print(f"Actual Language: {actual_language}")
    print('-' * 80)

Text: klement gottwaldi surnukeha palsameeriti ning paigutati mausoleumi surnukeha oli aga liiga hilja ja oskamatult palsameeritud ning hakkas ilmutama lagunemise tundemrke  aastal viidi ta surnukeha mausoleumist ra ja kremeeriti zlni linn kandis aastatel  nime gottwaldov ukrainas harkivi oblastis kandis zmiivi linn aastatel  nime gotvald
Predicted Language: Estonian
Actual Language: Estonian
--------------------------------------------------------------------------------
Text: sebes joseph pereira thomas  p eng the jesuits and the sinorussian treaty of nerchinsk  the diary of thomas pereira bibliotheca instituti historici s i    rome libris 
Predicted Language: Swedish
Actual Language: Swedish
--------------------------------------------------------------------------------
Text:   thanon charoen krung         
Predicted Language: Thai
Actual Language: Thai
--------------------------------------------------------------------------------
Text:                              
Predicted Lan

In [13]:
# Import necessary libraries
import pickle

# Step 8: Persist the fitted Naive Bayes classifier and the vectorizer to disk
artifacts = (
    ('nb_language_model.pkl', nb_classifier),
    ('vectorizer.pkl', vectorizer),
)
for artifact_path, artifact in artifacts:
    # One pickle file per object, written in binary mode
    with open(artifact_path, 'wb') as artifact_file:
        pickle.dump(artifact, artifact_file)

print("Model and Vectorizer saved successfully!")


Model and Vectorizer saved successfully!


In [14]:
# Import necessary libraries
import pickle
from sklearn.feature_extraction.text import CountVectorizer
import re

# Function to load the model and vectorizer, and predict the language
def predict_language(text, model_path='nb_language_model.pkl', vectorizer_path='vectorizer.pkl'):
    """Predict the language of ``text`` with the pickled classifier.

    The text is lowercased and stripped of every character that is not an
    ASCII letter a-z or whitespace (the same cleaning applied to the training
    column), vectorised with the pickled vectorizer and classified.

    Args:
        text: The sentence to classify.
        model_path: Path of the pickled classifier. Defaults to the file name
            written by the save step, resolved against the working directory.
        vectorizer_path: Path of the pickled vectorizer. Defaults to the file
            name written by the save step.

    Returns:
        The first (and only) element of the classifier's prediction.

    Raises:
        OSError: If either pickle file cannot be opened.
    """
    # SECURITY: pickle.load can execute arbitrary code; only load files that
    # were produced by this notebook or another trusted source.
    with open(model_path, 'rb') as model_file:
        nb_classifier = pickle.load(model_file)

    with open(vectorizer_path, 'rb') as vec_file:
        vectorizer = pickle.load(vec_file)

    # Preprocess the input text. Raw string so that \s reaches the regex
    # engine without relying on Python's handling of an invalid string escape.
    text = re.sub(r'[^a-z\s]', '', text.lower())

    # The vectorizer expects an iterable of documents, hence the one-item list
    text_features = vectorizer.transform([text])

    # Predict the language using the loaded model
    prediction = nb_classifier.predict(text_features)

    return prediction[0]

# Interface for user input
# The guard wraps the whole loop so that importing this code as a module never
# blocks on input(); when run directly (including in a notebook) the prompt
# behaves as before.
if __name__ == "__main__":
    while True:
        user_input = input("Enter a sentence to predict the language (or 'exit' to quit): ")

        # Tolerate stray whitespace and any letter case around the command
        command = user_input.strip().lower()
        if command == 'exit':
            break

        # Nothing to classify on a blank line; ask again
        if not command:
            continue

        predicted_language = predict_language(user_input)
        print(f"Predicted Language: {predicted_language}")

Enter a sentence to predict the language (or 'exit' to quit): sebes joseph pereira thomas p eng the jesuits
Predicted Language: Swedish
Enter a sentence to predict the language (or 'exit' to quit): de spons behoort tot het geslacht haliclona en.
Predicted Language: Dutch
Enter a sentence to predict the language (or 'exit' to quit): exit
