Development of a Language Detection System using Machine Learning


In [37]:
#IMPORT LIBRARIES


import os
import string

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn import pipeline
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC



In [38]:
#LOADING DATASET

# The CSV location defaults to the original author's local path. Set the
# LANGUAGE_DETECTION_CSV environment variable to point at the file on another
# machine instead of editing this cell.
DATA_PATH = os.environ.get(
    "LANGUAGE_DETECTION_CSV",
    "C:\\Users\\abhis\\Desktop\\Language Detection.csv",
)

df = pd.read_csv(DATA_PATH)
df.head()


Unnamed: 0,Text,Language
0,"Nature, in the broadest sense, is the natural...",English
1,"""Nature"" can refer to the phenomena of the phy...",English
2,"The study of nature is a large, if not the onl...",English
3,"Although humans are part of nature, human acti...",English
4,[1] The word nature is borrowed from the Old F...,English


In [39]:
# Dimensions of the loaded DataFrame as (rows, columns)
df.shape

(10337, 2)

In [40]:
# Number of samples per language, i.e. the class distribution of the target
df["Language"].value_counts()

Language
English       1385
French        1014
Spanish        819
Portugeese     739
Italian        698
Russian        692
Sweedish       676
Malayalam      594
Dutch          546
Arabic         536
Turkish        474
German         470
Tamil          469
Danish         428
Kannada        369
Greek          365
Hindi           63
Name: count, dtype: int64

In [41]:
# Checking for missing values

print(df['Text'].isna().sum())  # Count of NaN/None entries in the Text column


0


In [42]:
# Helper that removes punctuation so the text is easier to classify

# Translation table that deletes every character in string.punctuation.
# It is built once so each call makes a single pass over the text.
_PUNCTUATION_TABLE = str.maketrans("", "", string.punctuation)


def remove_pun(Text):
    """Strip ASCII punctuation from ``Text`` and convert it to lowercase.

    Only the characters listed in ``string.punctuation`` are removed; other
    punctuation (for example the Spanish ``¿``) is kept.

    Args:
        Text: The value to clean. Only ``str`` values are processed.

    Returns:
        The cleaned, lowercased string when ``Text`` is a ``str``; any other
        value is returned unchanged.
    """
    if not isinstance(Text, str):
        return Text
    return Text.translate(_PUNCTUATION_TABLE).lower()

In [43]:
# Clean every entry of the 'Text' column with remove_pun
# (punctuation removed, text lowercased) and store the result back in df

df['Text'] = df['Text'].map(remove_pun)

In [44]:
# Assigning the independent (X) and dependent (Y) variables.
# Columns are selected by name rather than by position so the assignment
# does not depend on the column order of the CSV.

X = df["Text"]      # X is the Text column (model input)
Y = df["Language"]  # Y is the Language column (target label)

In [45]:
# Splitting the data:
# 25% of the rows are held out for testing and the remaining 75% are used for training.
# random_state=42 fixes the shuffle so the split is reproducible.
# NOTE(review): the split is not stratified although the class sizes shown by
# value_counts() range from 63 to 1385 - consider stratify=Y; confirm before changing.

X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.25,
    random_state=42,
)

In [46]:
# Display the training texts produced by the split (test_size=0.25, random_state=42)

X_train

9651                            ich werde in ordnung sein
5196    el análisis por agrupamiento clustering en ing...
5396    puede decirle que no vale la pena como ¿por qu...
8871                                        nej inte alls
5012    es una medida de economía de esfuerzos que per...
                              ...                        
5734    σε μια αγγλόφωνη χώρα μπορείτε να το χρησιμοπο...
5191    las conexiones tienen pesos numéricos que se a...
5390    si alguien te pregunta si estás cansado y quie...
860     due to its generality the field is studied in ...
7270    sono disponibili inoltre applicazioni dedicate...
Name: Text, Length: 7752, dtype: object

In [54]:
# Define the vectorizer:
# a TF-IDF vectorizer over character n-grams of length 1 and 2,
# used to capture language-specific character patterns.

vec = TfidfVectorizer(analyzer='char', ngram_range=(1, 2))

In [56]:
# Display the vectorizer object defined above
vec

In [57]:
# Print every parameter of the vectorizer, including those left at their defaults
print(vec.get_params())


{'analyzer': 'char', 'binary': False, 'decode_error': 'strict', 'dtype': <class 'numpy.float64'>, 'encoding': 'utf-8', 'input': 'content', 'lowercase': True, 'max_df': 1.0, 'max_features': None, 'min_df': 1, 'ngram_range': (1, 2), 'norm': 'l2', 'preprocessor': None, 'smooth_idf': True, 'stop_words': None, 'strip_accents': None, 'sublinear_tf': False, 'token_pattern': '(?u)\\b\\w\\w+\\b', 'tokenizer': None, 'use_idf': True, 'vocabulary': None}


In [58]:
# Fit the vectorizer on the text and transform it into a TF-IDF matrix.
# NOTE(review): this fits on the full X (training and test rows together).
# If `vec` or this matrix is later used to evaluate a model, fit on X_train
# only to avoid leaking test data - confirm against the downstream cells.

Tfid_Matrix = vec.fit_transform(X)

In [59]:
# Display the shape of the resulting TF-IDF matrix: (number of texts, number of n-gram features)
Tfid_Matrix.shape

(10337, 9368)

In [60]:
# Show the feature names: the character 1-grams and 2-grams learned by the vectorizer.
# NOTE(review): the vocabulary shown below contains newline and '\ufeff' (BOM)
# characters, so the text still holds such artefacts after remove_pun -
# consider stripping them during preprocessing.
vec.get_feature_names_out()

array(['\n', '\n1', '\n2', ..., '\ufeff', '\ufeff ', '\ufeff3'],
      dtype=object)

In [61]:
# Cast the text of both splits to str so that every entry (including any
# numbers) is a string. The test split gets the same cast as the training
# split so that both are preprocessed identically.

X_train = X_train.astype(str)
X_test = X_test.astype(str)

In [63]:
# Define the individual base models

# Multinomial Naive Bayes with default settings
naive_bayes = MultinomialNB()

# Linear-kernel SVM; probability=True enables probability estimates
# (presumably for soft voting in a VotingClassifier - confirm downstream)
svm = SVC(
    kernel='linear',
    probability=True,
    random_state=42,
)

# Random forest of 100 trees with a fixed seed for reproducibility
random_forest = RandomForestClassifier(
    n_estimators=100,
    random_state=42,
)