# South African Language Identification Hack 2023

#### Overview


        South Africa is a multicultural society with rich linguistic diversity. 
        Its 11 official languages hold equal status and play crucial roles in enhancing democracy
        and enriching various aspects of social, cultural, economic, and political life. 
        The majority of South Africans are multilingual, proficient in speaking two or more official languages.

 #### Importing Libraries

In [67]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

#### Loading Data

In [60]:
train_df = pd.read_csv(r"C:\Users\nengo\Downloads\south-african-language-identification-hack-2023\train_set.csv")
test_df = pd.read_csv(r"C:\Users\nengo\Downloads\south-african-language-identification-hack-2023\test_set.csv")

#### Exploratory Data Analysis (EDA)

In [61]:
print("Train Dataset:")
print(train_df.head())

print("\nTest Dataset:")
print(test_df.head())

Train Dataset:
  lang_id                                               text
0     xho  umgaqo-siseko wenza amalungiselelo kumaziko ax...
1     xho  i-dha iya kuba nobulumko bokubeka umsebenzi na...
2     eng  the province of kwazulu-natal department of tr...
3     nso  o netefatša gore o ba file dilo ka moka tše le...
4     ven  khomishini ya ndinganyiso ya mbeu yo ewa maana...

Test Dataset:
   index                                               text
0      1  Mmasepala, fa maemo a a kgethegileng a letlele...
1      2  Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2      3         Tshivhumbeo tshi fana na ngano dza vhathu.
3      4  Kube inja nelikati betingevakala kutsi titsini...
4      5                      Winste op buitelandse valuta.


#### Data Preprocessing

In [62]:
def preprocess_data(train_df, test_df):
    # Initializing the TF-IDF vectorizer
    vectorizer = TfidfVectorizer()

    # Fitting the vectorizer on the training data
    vectorizer.fit(train_df['text'])

    # Transforming the training and test data using the fitted vectorizer
    train_features = vectorizer.transform(train_df['text'])
    test_features = vectorizer.transform(test_df['text'])

    return train_features, test_features, vectorizer

In [63]:
train_features, test_features, vectorizer = preprocess_data(train_df, test_df)

In [69]:
from langdetect import detect

def identify_language(text):
    try:
        language = detect(text)
        return language
    except Exception as e:
        print(f"Error: {e}")
        return None

# Example text in various languages
texts = [
    "Hello, how are you?",                    # English
    "¡Hola! ¿Cómo estás?",                    # Spanish
    "Bonjour, comment ça va?",                # French
    "Hallo, wie geht es dir?",                # German
    "Hoe gaan dit met jou?",                  # Afrikaans
    "Avuxeni, u njhani?",                     # Xitsonga
    "Yebo, kunjani?",                         # Zulu
    "Dumela, o kae?",                         # Setswana
    "Lefatshe la rona le kgalwa ke wena.",    # Sesotho
    "Molweni, unjani?",                       # isiXhosa
    "Salibonani, unjani?"                     # isiZulu
]

# Identifying languages for each text
for idx, text in enumerate(texts):
    language = identify_language(text)
    print(f"Text {idx + 1} is in {language} language.")


Text 1 is in en language.
Text 2 is in es language.
Text 3 is in fr language.
Text 4 is in de language.
Text 5 is in af language.
Text 6 is in hr language.
Text 7 is in sw language.
Text 8 is in hr language.
Text 9 is in sw language.
Text 10 is in sw language.
Text 11 is in sw language.


## Training and Evaluation

#### Logistics Regression

In [64]:
X_train, X_val, y_train, y_val = train_test_split(train_features, train_df['lang_id'], test_size=0.2, random_state=42)
lr_model = LogisticRegression()
lr_model.fit(X_train, y_train)
lr_preds = lr_model.predict(X_val)
lr_f1 = f1_score(y_val, lr_preds, average='weighted')

print("Logistic Regression F1 Score:", lr_f1)

Logistic Regression F1 Score: 0.994245605433102


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


#### K Nearest Neighbors (KNN)

In [65]:
knn_model = KNeighborsClassifier()
knn_model.fit(X_train, y_train)
knn_preds = knn_model.predict(X_val)
knn_f1 = f1_score(y_val, knn_preds, average='weighted')

print("KNN F1 Score:", knn_f1)

KNN F1 Score: 0.9593450685034197


#### Support Vector Machine


In [58]:
svm = SVC()
svm.fit(X_train, y_train)
svm_predictions = svm.predict(X_val)
svm_f1 = f1_score(y_val, svm_predictions, average='weighted')
print("SVM F1 Score:", svm_f1)


SVM F1 Score: 0.9942650475719715


#### Naive Bayes


In [None]:
nb = MultinomialNB()
nb.fit(X_train, y_train)
nb_predictions = nb.predict(X_val)
nb_f1 = f1_score(y_val, nb_predictions, average='weighted')
print("Naive Bayes F1 Score:", nb_f1)


#### Generate predictions on the test set

In [56]:
# Converting the test data into TF-IDF vectors
X_test = vectorizer.transform(test_df['text'])

# Generating predictions on the best performing model
test_predictions = nb.predict(X_test)

#### Creating a csv for submission


In [57]:
# Creating a submission dataframe with 'index' and 'lang_id' columns
submission_df = pd.DataFrame({'index': test_df['index'], 'lang_id': test_predictions})

submission_df.to_csv('FinalSub1.csv', index=False)