In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import style
%matplotlib inline
import seaborn as sns

import itertools
import re
import string
import pickle

import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, f1_score, make_scorer
from sklearn.model_selection import KFold

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from collections import Counter
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.corpus import stopwords, wordnet 
from wordcloud import WordCloud
from copy import deepcopy

from IPython.display import (
    Markdown as md,
    Latex,
    HTML,
)
from tqdm.auto import tqdm

# set plot style
sns.set()

[nltk_data] Downloading package stopwords to C:\Users\BUYPC
[nltk_data]     COMPUTERS\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\BUYPC
[nltk_data]     COMPUTERS\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\BUYPC
[nltk_data]     COMPUTERS\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\BUYPC COMPUTERS\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [3]:
df_train = pd.read_csv("train_set.csv")
df_test = pd.read_csv("test_set.csv")

In [4]:
df_train.head()

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [5]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33000 entries, 0 to 32999
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   lang_id  33000 non-null  object
 1   text     33000 non-null  object
dtypes: object(2)
memory usage: 515.8+ KB


In [23]:
def change_case(word):
    return word.str.lower()
df_train['nt_text'] = change_case(df_train['text'])
df_test['nt_text'] = change_case(df_test['text'])

In [24]:
#Remove Stopwords

stop_words = set(stopwords.words('english'))

def remove_stopword(tokens):
    # Remove stop words from the token list
    filtered_tokens = [token for token in tokens if token not in stop_words]
    return filtered_tokens
df_train['sw_text'] = remove_stopword(df_train['nt_text'])
df_test['sw_text'] = remove_stopword(df_test['nt_text'])

In [32]:
# replace \n and \t with " "
def remove_newlines(word):
    # Replace newlines and tabs with spaces
    word = re.sub(r"[\n\t]", " ", word)
    return word
# Apply newline removal to the 'text' column
df_train['nl_text'] = df_train['sw_text'].apply(remove_newlines)
df_test['nl_text'] = df_test['sw_text'].apply(remove_newlines)

In [27]:
def remove_punctuation(text):
    alphabet = string.ascii_lowercase
    return ''.join([x for x in text if x in alphabet + " "])
df_train['no_punc'] = df_train['nl_text'].apply(remove_punctuation)
df_test['no_punc'] = df_test['nl_text'].apply(remove_punctuation)

In [28]:
df_train.head()

Unnamed: 0,lang_id,text,cleaned,nt_text,sw_message,sw_text,nl_text,no_punc
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...,"[u, g, q, e, k, , w, e, n, z, , l, u, n, g, ...",umgaqo-siseko wenza amalungiselelo kumaziko ax...,umgaqo-siseko wenza amalungiselelo kumaziko ax...,umgaqo-siseko wenza amalungiselelo kumaziko ax...,umgaqo-siseko wenza amalungiselelo kumaziko ax...,umgaqosiseko wenza amalungiselelo kumaziko axh...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...,"[h, , , k, u, b, , n, b, u, l, u, k, , b, ...",i-dha iya kuba nobulumko bokubeka umsebenzi na...,i-dha iya kuba nobulumko bokubeka umsebenzi na...,i-dha iya kuba nobulumko bokubeka umsebenzi na...,i-dha iya kuba nobulumko bokubeka umsebenzi na...,idha iya kuba nobulumko bokubeka umsebenzi nap...
2,eng,the province of kwazulu-natal department of tr...,"[h, e, , p, r, v, n, c, e, , f, , k, w, z, ...",the province of kwazulu-natal department of tr...,the province of kwazulu-natal department of tr...,the province of kwazulu-natal department of tr...,the province of kwazulu-natal department of tr...,the province of kwazulunatal department of tra...
3,nso,o netefatša gore o ba file dilo ka moka tše le...,"[ , n, e, e, f, , g, r, e, , , b, , f, l, ...",o netefatša gore o ba file dilo ka moka tše le...,o netefatša gore o ba file dilo ka moka tše le...,o netefatša gore o ba file dilo ka moka tše le...,o netefatša gore o ba file dilo ka moka tše le...,o netefata gore o ba file dilo ka moka te le d...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...,"[k, h, h, n, , , n, n, g, n, , , b, e, u, ...",khomishini ya ndinganyiso ya mbeu yo ewa maana...,khomishini ya ndinganyiso ya mbeu yo ewa maana...,khomishini ya ndinganyiso ya mbeu yo ewa maana...,khomishini ya ndinganyiso ya mbeu yo ewa maana...,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [33]:
def _analyzer (x):
    """
    Function combines all the cleaning operations
    """
    x = remove_punctuation(x)
    x = normalize_text(x)
    x = x.apply(remove_newlines)
    x = remove_stopword(x)
    return x

In [35]:
df_train.head()

Unnamed: 0,lang_id,text,cleaned,nt_text,sw_message,sw_text,nl_text,no_punc
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...,"[u, g, q, e, k, , w, e, n, z, , l, u, n, g, ...",umgaqo-siseko wenza amalungiselelo kumaziko ax...,umgaqo-siseko wenza amalungiselelo kumaziko ax...,umgaqo-siseko wenza amalungiselelo kumaziko ax...,umgaqo-siseko wenza amalungiselelo kumaziko ax...,umgaqosiseko wenza amalungiselelo kumaziko axh...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...,"[h, , , k, u, b, , n, b, u, l, u, k, , b, ...",i-dha iya kuba nobulumko bokubeka umsebenzi na...,i-dha iya kuba nobulumko bokubeka umsebenzi na...,i-dha iya kuba nobulumko bokubeka umsebenzi na...,i-dha iya kuba nobulumko bokubeka umsebenzi na...,idha iya kuba nobulumko bokubeka umsebenzi nap...
2,eng,the province of kwazulu-natal department of tr...,"[h, e, , p, r, v, n, c, e, , f, , k, w, z, ...",the province of kwazulu-natal department of tr...,the province of kwazulu-natal department of tr...,the province of kwazulu-natal department of tr...,the province of kwazulu-natal department of tr...,the province of kwazulunatal department of tra...
3,nso,o netefatša gore o ba file dilo ka moka tše le...,"[ , n, e, e, f, , g, r, e, , , b, , f, l, ...",o netefatša gore o ba file dilo ka moka tše le...,o netefatša gore o ba file dilo ka moka tše le...,o netefatša gore o ba file dilo ka moka tše le...,o netefatša gore o ba file dilo ka moka tše le...,o netefata gore o ba file dilo ka moka te le d...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...,"[k, h, h, n, , , n, n, g, n, , , b, e, u, ...",khomishini ya ndinganyiso ya mbeu yo ewa maana...,khomishini ya ndinganyiso ya mbeu yo ewa maana...,khomishini ya ndinganyiso ya mbeu yo ewa maana...,khomishini ya ndinganyiso ya mbeu yo ewa maana...,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [36]:
# Splitting  X (indepedent) and Y (target/dependent) variables
X = df_train['no_punc']
y = df_train['lang_id']

In [37]:
from sklearn.model_selection import train_test_split
X_train , X_test , y_train , y_test = train_test_split(X , y, stratify=y,
                                                       test_size =0.4, 
                                                       random_state=42)

In [38]:
# Models
from sklearn.svm import LinearSVC, SVC
from sklearn.naive_bayes import MultinomialNB, ComplementNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression, SGDClassifier, RidgeClassifier
from sklearn.model_selection import cross_val_score, GridSearchCV, RepeatedStratifiedKFold
from sklearn.ensemble import RandomForestClassifier

In [39]:
alg = [LogisticRegression(random_state =42 , max_iter=5000) , 
       MultinomialNB(), LinearSVC(random_state=42), 
       SGDClassifier(random_state=42), RidgeClassifier(random_state=42)]

In [40]:
from sklearn import metrics
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer , TfidfVectorizer

In [41]:
def performance_assessment(algorithms, X_train, y_train, X_test, y_test):
    model_stats = {}

    for clf in algorithms:
        model = Pipeline([
            ('tfidf', TfidfVectorizer(stop_words='english', max_df=0.9, ngram_range=(1, 5), analyzer='char')),
            ('clf', clf)
        ])

        model.fit(X_train, y_train)  # Training
        model_pred = model.predict(X_test)  # Testing

        model_name = clf.__class__.__name__
        f1_macro = metrics.f1_score(y_test, model_pred, average='macro')
        f1_micro = metrics.f1_score(y_test, model_pred, average='micro')
        f1_weighted = metrics.f1_score(y_test, model_pred, average='weighted')

        model_stats[model_name] = {
            'F1-Macro': f1_macro,
            'F1-Accuracy': f1_micro,
            'F1-Weighted': f1_weighted
        }

    return pd.DataFrame.from_dict(model_stats, orient='index')

In [42]:
performance = _performace_assesment(alg , X_train , X_test , y_train , y_test)
performance.to_csv('performance.csv')
dataframe = pd.read_csv('performance.csv', index_col = 0)
dataframe.sort_values('F1-Weighted', ascending=False)



Unnamed: 0,F1-Macro,F1-Accuracy,F1-Weighted
MultinomialNB,0.999318,0.999318,0.999318
RidgeClassifier,0.999167,0.999167,0.999167
LinearSVC,0.999091,0.999091,0.999091
SGDClassifier,0.998939,0.998939,0.998939
LogisticRegression,0.998182,0.998182,0.998182


In [43]:
def param_tuning(algorithms, X_train, y_train):
    best_params = {}

    for clf in algorithms:
        model = Pipeline([
            ('tfidf', TfidfVectorizer(stop_words='english', max_df=0.9, ngram_range=(1, 5), analyzer='char')),
            ('clf', clf)
        ])

        model.fit(X_train, y_train)  # Training

        params = model.get_params()
        model_name = clf.__class__.__name__
        model_params = {}

        for key in params:
            if key.startswith("clf__"):
                param_name = key.split('__', 1)[1]  # Extract parameter name
                model_params[param_name] = params[key]

        best_params[model_name] = model_params

    return best_params

In [44]:
best_params = _param_tuning(alg, X_train, y_train)



In [45]:
#Best parameters
best_params

{'LogisticRegression': {'model': LogisticRegression(max_iter=5000, random_state=42),
  'C': 1.0,
  'class_weight': None,
  'dual': False,
  'fit_intercept': True,
  'intercept_scaling': 1,
  'l1_ratio': None,
  'max_iter': 5000,
  'multi_class': 'auto',
  'n_jobs': None,
  'penalty': 'l2',
  'random_state': 42,
  'solver': 'lbfgs',
  'tol': 0.0001,
  'verbose': 0,
  'warm_start': False},
 'MultinomialNB': {'model': MultinomialNB(),
  'alpha': 1.0,
  'class_prior': None,
  'fit_prior': True,
  'force_alpha': 'warn'},
 'LinearSVC': {'model': LinearSVC(random_state=42),
  'C': 1.0,
  'class_weight': None,
  'dual': True,
  'fit_intercept': True,
  'intercept_scaling': 1,
  'loss': 'squared_hinge',
  'max_iter': 1000,
  'multi_class': 'ovr',
  'penalty': 'l2',
  'random_state': 42,
  'tol': 0.0001,
  'verbose': 0},
 'SGDClassifier': {'model': SGDClassifier(random_state=42),
  'alpha': 0.0001,
  'average': False,
  'class_weight': None,
  'early_stopping': False,
  'epsilon': 0.1,
  'eta0':

In [46]:
#model
model1 = MultinomialNB()

In [47]:
Vectorize = TfidfVectorizer(stop_words = 'english', max_df=0.9, ngram_range=(1, 5), analyzer= 'char')
X_train = Vectorize.fit_transform(X_train)
X_test = Vectorize.transform(X_test)



In [48]:
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True,
                                   random_state=42)

In [49]:
best_params[alg[1].__class__.__name__]

{'model': MultinomialNB(),
 'alpha': 1.0,
 'class_prior': None,
 'fit_prior': True,
 'force_alpha': 'warn'}

In [50]:
alpha = list(np.linspace(0.1,0.02,4))
param_grid = dict(alpha=alpha)
grid_search = GridSearchCV(estimator= model1,
                           param_grid=param_grid,
                           scoring='f1_weighted',
                           cv=stratified_kfold,
                           error_score=0,
                           n_jobs=-1)

In [51]:
grid_search.fit(X_train, y_train)
prediction = grid_search.predict(X_test)
cv_score = grid_search.best_score_
test_score = grid_search.score(X_test, y_test)

In [52]:
print(f'Cross-validation score: {cv_score}')
print(f'Test score: {test_score}')
grid_search.best_params_    
grid_search.best_estimator_

Cross-validation score: 0.9996464641593393
Test score: 0.9995455174533855


In [53]:
#model
model2 = RidgeClassifier()
best_params[alg[4].__class__.__name__]

{'model': RidgeClassifier(random_state=42),
 'alpha': 1.0,
 'class_weight': None,
 'copy_X': True,
 'fit_intercept': True,
 'max_iter': None,
 'positive': False,
 'random_state': 42,
 'solver': 'auto',
 'tol': 0.0001}

In [54]:
alpha = list(np.linspace(0.15,0.4, 5))
param_grid = dict(alpha=alpha)
grid_search = GridSearchCV(estimator= model2,
                           param_grid=param_grid,
                           scoring='f1_weighted',
                           cv=stratified_kfold,
                           error_score=0,
                           n_jobs=-1)

In [55]:
grid_search.fit(X_train, y_train)
prediction = grid_search.predict(X_test)
cv_score = grid_search.best_score_
test_score = grid_search.score(X_test, y_test)

In [56]:
print(f'Cross-validation score: {cv_score}')
print(f'Test score: {test_score}')
grid_search.best_params_    
grid_search.best_estimator_

Cross-validation score: 0.999545452790985
Test score: 0.9993181815945916


In [57]:
from sklearn.ensemble import StackingClassifier
from sklearn.pipeline import make_pipeline

In [58]:
X_train , X_test , y_train , y_test = train_test_split(X, y,  stratify=y, test_size=0.4, random_state =1)

In [59]:
vect = TfidfVectorizer(stop_words = 'english', max_df=0.9, ngram_range=(2, 6), analyzer= 'char')
X_train = vect.fit_transform(X_train)
X_test = vect.transform(X_test)



In [60]:
multiNB1 = MultinomialNB(alpha=0.1)
multiNB2 = MultinomialNB(alpha=0.1)

estimators = [('multiNB1', multiNB1), ('multiNB2', multiNB2)]
final_est = RidgeClassifier(alpha=0.2125)

In [61]:
stacking_NB2 = StackingClassifier(estimators = estimators,
                           final_estimator = final_est,
                           passthrough = True)

In [62]:
stacking_NB2.fit(X_train , y_train)

In [63]:
pred = stacking_NB2.predict(X_test)

In [64]:
model_stats = {}
model_stats[stacking_NB2.__class__.__name__] = {
        'F1-Macro':metrics.f1_score(y_test, pred, average='macro'),
        'F1-Accuracy':metrics.f1_score(y_test, pred, average='micro'),
        'F1-Weighted':metrics.f1_score(y_test, pred, average='weighted')}
pd.DataFrame.from_dict(model_stats, orient='index')

Unnamed: 0,F1-Macro,F1-Accuracy,F1-Weighted
StackingClassifier,0.999773,0.999773,0.999773


In [65]:
count_vec = CountVectorizer(ngram_range=(3,7), analyzer= 'char')
X_train , X_test , y_train , y_test = train_test_split(X, y, stratify=y,test_size=0.05, random_state =1)
X_train = count_vec.fit_transform(X_train)
X_test = count_vec.transform(X_test)

In [66]:
multiNB1 = MultinomialNB(alpha=0.1)
multiNB2 = MultinomialNB(alpha=0.1)
multiNB3 = MultinomialNB(alpha=0.1)

estimators = [('multiNB1', multiNB1), ('multiNB2', multiNB2), ('multiNB3', multiNB3)]
final_est = RidgeClassifier(alpha=0.2125)

In [70]:
X = df_test['no_punc']
Vectorize = vect.transform(X)
df_test['lang_id'] = stacking_NB2.predict(Vectorize)
submission = df_test[['index', 'lang_id']]
submission.to_csv('Submission.csv',index=False)
submission

Unnamed: 0,index,lang_id
0,1,tsn
1,2,nbl
2,3,ven
3,4,ssw
4,5,afr
...,...,...
5677,5678,eng
5678,5679,nso
5679,5680,sot
5680,5681,sot


In [None]:
test.cleaned