# Hackathon

## 1. Imports:

In [132]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

# Libraries for data preparation and model building
from sklearn.metrics import classification_report, f1_score, precision_score, recall_score, confusion_matrix  # Classification metrics
from sklearn.model_selection import train_test_split, GridSearchCV  # Train-test split and grid search
from sklearn.linear_model import LogisticRegression  # Logistic Regression classifier for machine learning
from sklearn.tree import DecisionTreeClassifier  # Decision Tree classifier for machine learning
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier  # Random Forest classifier for machine learning
from sklearn.svm import LinearSVC, SVC  # Support Vector Machine classifiers
from sklearn.naive_bayes import GaussianNB, MultinomialNB  # Naive Bayes classifiers
from sklearn.ensemble import BaggingClassifier  # Bagging classifier
from sklearn.ensemble import ExtraTreesClassifier  # Extra Trees classifier
from sklearn.ensemble import VotingClassifier  # Voting classifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier  # Efficient and flexible gradient boosting library
from catboost import CatBoostClassifier  # High-performance gradient boosting on decision trees library
from scipy.sparse import hstack  # Used for stacking sparse matrices horizontally
import pickle  # Serialization library
from sklearn.utils import resample  # Resampling tool
from sklearn import feature_selection  # Feature selection module
from sklearn.feature_selection import f_classif  # Feature selection using F-statistic
from mlxtend.feature_selection import SequentialFeatureSelector  # Sequential feature selection
from sklearn import preprocessing  # Data preprocessing
import pickle  # Serialization library
import tensorflow as tf  # TensorFlow for neural net
from tensorflow.keras.layers import Dense  # Dense layer in Keras
from tensorflow.keras.models import Sequential  # Sequential model in Keras
from tensorflow.keras.utils import to_categorical  # Conversion to categorical data
from scipy.sparse import issparse
from sklearn import metrics

# Feature selection Libraries:
from sklearn.feature_selection import SelectKBest  # To reduce features
from sklearn.feature_selection import chi2  # Used to estimate which features are most impactful

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Flags for notebook Execution
VECTORIZER_TO_USE = "count"  # Chooses between TfIDF vectorizer or Count Vectorizer - accepted values are "tfidf" or "count"

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\T460\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\T460\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\T460\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## 2. Load DataFrames:

In [133]:
# Read training data from CSV file into a DataFrame
df_train = pd.read_csv('train_set.csv')

# Read test data without labels from CSV file into a DataFrame
df_test = pd.read_csv('test_set.csv')

# 3. Cleaning Data:

In [134]:
df_train.shape

(33000, 2)

In [135]:
df_test.shape

(5682, 2)

In [136]:

def nlp_preprocessing(texts):
    # Convert to lowercase
    texts = [text.lower() for text in texts]

    # Remove numbers
    texts = [re.sub(r'\d+', '', text) for text in texts]

    # Remove punctuation
    texts = [text.translate(str.maketrans("", "", string.punctuation)) for text in texts]

    # Tokenize the text
    words = [word_tokenize(text) for text in texts]

    # Remove stopwords
    stop_words = set(stopwords.words('english'))
    words = [[word for word in doc if word not in stop_words] for doc in words]

    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [[lemmatizer.lemmatize(word) for word in doc] for doc in words]

    # Join the cleaned words back into a single string
    cleaned_texts = [' '.join(doc) for doc in words]

    return cleaned_texts

In [137]:
df_train['text'] = nlp_preprocessing(df_train['text'])

In [138]:
df_train

Unnamed: 0,lang_id,text
0,xho,umgaqosiseko wenza amalungiselelo kumaziko axh...
1,xho,idha iya kuba nobulumko bokubeka umsebenzi nap...
2,eng,province kwazulunatal department transport inv...
3,nso,netefatša gore ba file dilo ka moka tše le dum...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...
...,...,...
32995,tsn,popo ya dipolateforomo tse ke go tlisa boetele...
32996,sot,modise mosadi na ntse sa utlwe hore thabang ra...
32997,eng,closing date submission completed tender augus...
32998,xho,nawuphina umntu ofunyenwe enetyala phantsi kwa...


In [139]:
df_test['text'] = nlp_preprocessing(df_test['text'])

In [140]:
df_test

Unnamed: 0,index,text
0,1,mmasepala fa maemo kgethegileng letlelela kgat...
1,2,uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,tshivhumbeo tshi fana na ngano dza vhathu
3,4,kube inja nelikati betingevakala kutsi titsini...
4,5,winste op buitelandse valuta
...,...,...
5677,5678,mark ballot private
5678,5679,ge ka kgetha ka bowena go se šomiše mofani ka ...
5679,5680,e ka kopo etsa kgetho ya hao ka hloko hobane h...
5680,5681,tb ke bokudi ba pmb mme morero tla lefella tlh...


In [141]:
# Assuming df_train is your DataFrame and 'lang_id' is the column you want to encode
le = LabelEncoder()
df_train['lang_id_encoded'] = le.fit_transform(df_train['lang_id'])

# Display the mapping between original categories and encoded values
label_mapping = dict(zip(le.classes_, le.transform(le.classes_)))
print("Label Mapping:", label_mapping)

Label Mapping: {'afr': 0, 'eng': 1, 'nbl': 2, 'nso': 3, 'sot': 4, 'ssw': 5, 'tsn': 6, 'tso': 7, 'ven': 8, 'xho': 9, 'zul': 10}


In [142]:
df_train

Unnamed: 0,lang_id,text,lang_id_encoded
0,xho,umgaqosiseko wenza amalungiselelo kumaziko axh...,9
1,xho,idha iya kuba nobulumko bokubeka umsebenzi nap...,9
2,eng,province kwazulunatal department transport inv...,1
3,nso,netefatša gore ba file dilo ka moka tše le dum...,3
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...,8
...,...,...,...
32995,tsn,popo ya dipolateforomo tse ke go tlisa boetele...,6
32996,sot,modise mosadi na ntse sa utlwe hore thabang ra...,4
32997,eng,closing date submission completed tender augus...,1
32998,xho,nawuphina umntu ofunyenwe enetyala phantsi kwa...,9


In [143]:
# Extract index that will be used when submitting predictions based on the evaluate dataset
df_index_submission = pd.DataFrame(df_train['lang_id'])
df_index_submission

Unnamed: 0,lang_id
0,xho
1,xho
2,eng
3,nso
4,ven
...,...
32995,tsn
32996,sot
32997,eng
32998,xho


In [144]:
# Dropping the message column since it has been vectorized
df_train = df_train.drop("lang_id", axis='columns')
df_train.head()

Unnamed: 0,text,lang_id_encoded
0,umgaqosiseko wenza amalungiselelo kumaziko axh...,9
1,idha iya kuba nobulumko bokubeka umsebenzi nap...,9
2,province kwazulunatal department transport inv...,1
3,netefatša gore ba file dilo ka moka tše le dum...,3
4,khomishini ya ndinganyiso ya mbeu yo ewa maana...,8


In [145]:
# Initialize and fit specified Vectorizer
if VECTORIZER_TO_USE == "tfidf":

    # Initialize Vectorizer
    tfid_train = TfidfVectorizer(ngram_range=(4,5), analyzer='char', min_df=2, max_df =0.45, max_features=30000) # Change max features to include more data

    # Fit vectoriser on text data:
    vec_text_train = tfid_train.fit_transform(df_train["text"])

    # Transform both test set for 'tfid'
    vec_text_test = tfid_train.transform(df_test["text"])
    
elif VECTORIZER_TO_USE == "count":

    # Initialize Vectorizer
    count_vec_train = CountVectorizer(ngram_range=(4,5), analyzer='char', min_df=2, max_df =0.45, max_features=30000) # Change max features to include more data

    # Fit vectoriser on text data:
    vec_text_train = count_vec_train.fit_transform(df_train["text"])

    # Transform both test set for 'count'
    vec_text_test = count_vec_train.transform(df_test["text"])

In [146]:
# Converting vectorized text into sparse dataframe
if VECTORIZER_TO_USE == "tfidf":
    sparse_vec_msg_train = pd.DataFrame.sparse.from_spmatrix(vec_text_train, columns = tfid_train.get_feature_names_out())
elif VECTORIZER_TO_USE == "count":
    sparse_vec_msg_train = pd.DataFrame.sparse.from_spmatrix(vec_text_train, columns = count_vec_train.get_feature_names_out())
sparse_vec_msg_train.head()

Unnamed: 0,aan,aan.1,aang,aans,aanv,aba,abab,abad,abaf,abah,...,ṅwe n,ṱang,ṱanga,ṱanz,ṱanzi,ṱha,ṱhan,ṱhanz,ṱhe,ṱhis
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [147]:
# Converting vectorized text into sparse dataframe
if VECTORIZER_TO_USE == "tfidf":
    sparse_vec_msg_test = pd.DataFrame.sparse.from_spmatrix(vec_text_test, columns = tfid_train.get_feature_names_out())
elif VECTORIZER_TO_USE == "count":
    sparse_vec_msg_test = pd.DataFrame.sparse.from_spmatrix(vec_text_test, columns = count_vec_train.get_feature_names_out())
sparse_vec_msg_test.head()

Unnamed: 0,aan,aan.1,aang,aans,aanv,aba,abab,abad,abaf,abah,...,ṅwe n,ṱang,ṱanga,ṱanz,ṱanzi,ṱha,ṱhan,ṱhanz,ṱhe,ṱhis
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [148]:
# Adding the dataframes together
df_vectorized_clean = pd.concat([df_train.reset_index(drop=True), sparse_vec_msg_train.reset_index(drop=True)], axis=1)
df_vectorized_clean.head()

Unnamed: 0,text,lang_id_encoded,aan,aan.1,aang,aans,aanv,aba,abab,abad,...,ṅwe n,ṱang,ṱanga,ṱanz,ṱanzi,ṱha,ṱhan,ṱhanz,ṱhe,ṱhis
0,umgaqosiseko wenza amalungiselelo kumaziko axh...,9,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,idha iya kuba nobulumko bokubeka umsebenzi nap...,9,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,province kwazulunatal department transport inv...,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,netefatša gore ba file dilo ka moka tše le dum...,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,khomishini ya ndinganyiso ya mbeu yo ewa maana...,8,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [149]:
# Adding the dataframes together
df_vectorized_test_clean = pd.concat([df_test.reset_index(drop=True), sparse_vec_msg_test.reset_index(drop=True)], axis=1)
df_vectorized_test_clean.head()

Unnamed: 0,index,text,aan,aan.1,aang,aans,aanv,aba,abab,abad,...,ṅwe n,ṱang,ṱanga,ṱanz,ṱanzi,ṱha,ṱhan,ṱhanz,ṱhe,ṱhis
0,1,mmasepala fa maemo kgethegileng letlelela kgat...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,uzakwaziswa ngokufaneleko nakungafuneka eminye...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,tshivhumbeo tshi fana na ngano dza vhathu,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,kube inja nelikati betingevakala kutsi titsini...,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,winste op buitelandse valuta,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [150]:
# Dropping the message column since it has been vectorized
df_vectorized_clean = df_vectorized_clean.drop("text", axis='columns')
df_vectorized_clean.head()

Unnamed: 0,lang_id_encoded,aan,aan.1,aang,aans,aanv,aba,abab,abad,abaf,...,ṅwe n,ṱang,ṱanga,ṱanz,ṱanzi,ṱha,ṱhan,ṱhanz,ṱhe,ṱhis
0,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,9,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,8,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [151]:
# Dropping the message column since it has been vectorized
df_vectorized_test_clean = df_vectorized_test_clean.drop("text", axis='columns')
df_vectorized_test_clean.head()

Unnamed: 0,index,aan,aan.1,aang,aans,aanv,aba,abab,abad,abaf,...,ṅwe n,ṱang,ṱanga,ṱanz,ṱanzi,ṱha,ṱhan,ṱhanz,ṱhe,ṱhis
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [152]:
# Extract index that will be used when submitting predictions based on the evaluate dataset
df_index_submission = pd.DataFrame(df_vectorized_test_clean['index'])
df_index_submission

Unnamed: 0,index
0,1
1,2
2,3
3,4
4,5
...,...
5677,5678
5678,5679
5679,5680
5680,5681


In [153]:
# Remove index form both training and test datasets
df_vectorized_test_clean = df_vectorized_test_clean.drop("index", axis=1)

df_vectorized_test_clean.head()

Unnamed: 0,aan,aan.1,aang,aans,aanv,aba,abab,abad,abaf,abah,...,ṅwe n,ṱang,ṱanga,ṱanz,ṱanzi,ṱha,ṱhan,ṱhanz,ṱhe,ṱhis
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


# 4. Pre-processing:

In [154]:
# Extracting the response variable from the training dataset
y_train_og = df_vectorized_clean[:len(df_vectorized_clean)][['lang_id_encoded']]

# Separating predictor variables from response variable
x_train = df_vectorized_clean[:len(df_vectorized_clean)].drop("lang_id_encoded", axis = "columns")

In [155]:
# Splitting into our training and validation data subsets
X_train, X_validate, y_train, y_validate = train_test_split(x_train, y_train_og, test_size=0.2, random_state=42)

# 5. Models:

### Logistic Regression

In [156]:
# Initialize Logistic Regression model
log_reg = LogisticRegression()

# Fitting the model to our training data subset
log_reg.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


LogisticRegression()

In [157]:
# Predict values of the testing subset
pred_log_reg = log_reg.predict(X_validate)

# Let's produce a classification report of the model as is:
print(classification_report(y_validate, pred_log_reg))

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       583
           1       1.00      1.00      1.00       615
           2       0.99      1.00      1.00       583
           3       1.00      1.00      1.00       625
           4       1.00      1.00      1.00       618
           5       1.00      1.00      1.00       584
           6       1.00      1.00      1.00       598
           7       1.00      1.00      1.00       561
           8       1.00      1.00      1.00       634
           9       1.00      1.00      1.00       609
          10       1.00      0.99      1.00       590

    accuracy                           1.00      6600
   macro avg       1.00      1.00      1.00      6600
weighted avg       1.00      1.00      1.00      6600



In [158]:
log_reg_cm = confusion_matrix(y_validate, pred_log_reg)
log_reg_f1 = f1_score(y_validate, pred_log_reg, average="weighted") 
log_reg_precision = precision_score(y_validate, pred_log_reg, average='weighted')
log_reg_recall = recall_score(y_validate, pred_log_reg, average="weighted")

In [159]:
# Initiate final model
final_log_reg = LogisticRegression() 

# Train on all available data
final_log_reg.fit(x_train, y_train_og)

# Generate predictions for the evaluation dataset
log_reg_predictions = final_log_reg.predict(df_vectorized_test_clean)

  y = column_or_1d(y, warn=True)
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [160]:
# Make submissions into dataframe
log_reg_predictions = pd.DataFrame(log_reg_predictions, columns=["lang_id_encoded"])

# Convert lang_id_encoded to integer
log_reg_predictions['lang_id_encoded'] = log_reg_predictions['lang_id_encoded'].astype('int')

# Assuming log_reg_predictions is your DataFrame and 'lang_id_encoded' is the encoded column
lang_id = le.inverse_transform(log_reg_predictions['lang_id_encoded'])

# Create a new column in the DataFrame with the original values
log_reg_predictions['lang_id'] = lang_id

# Dropping the message column since it has been vectorized
log_reg_predictions = log_reg_predictions.drop("lang_id_encoded", axis='columns')

# Match submissions to index
log_reg_submission = pd.concat([df_index_submission.reset_index(drop=True), log_reg_predictions.reset_index(drop=True)], axis=1,)


#Save submission as csv
log_reg_submission.to_csv('Logistic_Regression_Predictions.csv', index=False)

log_reg_submission

Unnamed: 0,index,lang_id
0,1,eng
1,2,nbl
2,3,eng
3,4,eng
4,5,eng
...,...,...
5677,5678,eng
5678,5679,nso
5679,5680,sot
5680,5681,sot


### Multinominal Naive Bayes

In [174]:
# Creating a Multinomial Naive Bayes Classifier object:
multinom_nb = MultinomialNB()

# Define hyperparameter grid for Multinomial Naive Bayes
param_grid_multinom_nb = {'alpha': [0.001, 0.01, 0.1, 1.0, 2.0, 10.0]}

# Fitting the model to our training data subset
grid_search_multinom_nb = GridSearchCV(multinom_nb, param_grid_multinom_nb, cv=5, scoring='f1_macro', verbose=1, n_jobs=-1)
grid_search_multinom_nb.fit(X_train, y_train)  # Use ravel() to convert y_train to a 1D array

# Get the best Multinomial Naive Bayes model from the grid search
best_multinom_nb = grid_search_multinom_nb.best_estimator_

# Print the best hyperparameters found
print('Best Hyperparameters for Multinomial Naive Bayes:', grid_search_multinom_nb.best_params_)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


  y = column_or_1d(y, warn=True)


Best Hyperparameters for Multinomial Naive Bayes: {'alpha': 0.1}


In [167]:
# Create a new instance of the Multinomial Naive Bayes model with the best hyperparameter
best_multinom_nb = MultinomialNB(alpha=0.1)

# Fitting the model to our training data subset
best_multinom_nb.fit(X_train, y_train)

  y = column_or_1d(y, warn=True)


MultinomialNB(alpha=0.1)

In [169]:
# Predict values of the testing subset
pred_multinom_nb = best_multinom_nb.predict(X_validate)

# Let's produce a classification report of the model as is:
print(classification_report(y_validate, pred_multinom_nb)) 

              precision    recall  f1-score   support

           0       1.00      1.00      1.00       583
           1       1.00      1.00      1.00       615
           2       1.00      1.00      1.00       583
           3       1.00      1.00      1.00       625
           4       1.00      1.00      1.00       618
           5       1.00      1.00      1.00       584
           6       1.00      1.00      1.00       598
           7       1.00      1.00      1.00       561
           8       1.00      1.00      1.00       634
           9       1.00      1.00      1.00       609
          10       1.00      1.00      1.00       590

    accuracy                           1.00      6600
   macro avg       1.00      1.00      1.00      6600
weighted avg       1.00      1.00      1.00      6600



In [170]:
multinom_nb_cm = confusion_matrix(y_validate, pred_multinom_nb)
multinom_nb_f1 = f1_score(y_validate, pred_multinom_nb, average="weighted") 
multinom_nb_precision = precision_score(y_validate, pred_multinom_nb, average='weighted')
multinom_nb_recall = recall_score(y_validate, pred_multinom_nb, average="weighted")

In [171]:
# Initiate final model
final_multinom_nb = MultinomialNB(alpha=0.1)

# Train on all available data
final_multinom_nb.fit(x_train, y_train_og)

# Generate predictions for the evaluation dataset
multinom_nb_predictions = final_multinom_nb.predict(df_vectorized_test_clean)

  y = column_or_1d(y, warn=True)


In [172]:
# Make submissions into dataframe
multi_nb_predictions = pd.DataFrame(multinom_nb_predictions, columns=["lang_id_encoded"])

# Convert lang_id_encoded to integer
multi_nb_predictions['lang_id_encoded'] = multi_nb_predictions['lang_id_encoded'].astype('int')

# Assuming multi_nb_predictions is your DataFrame and 'lang_id_encoded' is the encoded column
lang_id = le.inverse_transform(multi_nb_predictions['lang_id_encoded'])

# Create a new column in the DataFrame with the original values
multi_nb_predictions['lang_id'] = lang_id

# Dropping the message column since it has been vectorized
multi_nb_predictions = multi_nb_predictions.drop("lang_id_encoded", axis='columns')

# Match submissions to index
multi_nb_submission = pd.concat([df_index_submission.reset_index(drop=True), multi_nb_predictions.reset_index(drop=True)], axis=1,)


#Save submission as csv
multi_nb_submission.to_csv('Multinominal_Naive_Bayes_Predictions.csv', index=False)

multi_nb_submission

Unnamed: 0,index,lang_id
0,1,tsn
1,2,nbl
2,3,ven
3,4,ssw
4,5,afr
...,...,...
5677,5678,eng
5678,5679,nso
5679,5680,sot
5680,5681,sot
