# South African Language Identification Hack 2023

#### Overview


        South Africa is a multicultural society with rich linguistic diversity. 
        Its 11 official languages hold equal status and play crucial roles in enhancing democracy
        and enriching various aspects of social, cultural, economic, and political life. 
        The majority of South Africans are multilingual, proficient in speaking two or more official languages.

 #### Importing Libraries

In [1]:
# Importing necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from sklearn.metrics import f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.naive_bayes import MultinomialNB

#### Loading Data

In [2]:
# Locate the competition CSV files. Setting the SA_LANG_ID_DATA_DIR environment
# variable points the notebook at another folder; when it is unset, the
# original download location is used, so existing runs behave as before.
import os
from pathlib import Path

DATA_DIR = Path(os.environ.get(
    "SA_LANG_ID_DATA_DIR",
    r"C:\Users\nengo\Downloads\south-african-language-identification-hack-2023",
))

# train_set.csv holds labelled text (lang_id, text); test_set.csv holds the
# unlabelled text to predict (index, text).
train_df = pd.read_csv(DATA_DIR / "train_set.csv")
test_df = pd.read_csv(DATA_DIR / "test_set.csv")

## Exploratory Data Analysis (EDA)

In [3]:
# Preview the first rows of the training data (lang_id label and raw text).
train_df.head()

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [4]:
# Show the leading rows of each dataset under its own heading.
print("Train Dataset:", train_df.head(), sep="\n")
print("\nTest Dataset:", test_df.head(), sep="\n")

Train Dataset:
  lang_id                                               text
0     xho  umgaqo-siseko wenza amalungiselelo kumaziko ax...
1     xho  i-dha iya kuba nobulumko bokubeka umsebenzi na...
2     eng  the province of kwazulu-natal department of tr...
3     nso  o netefatša gore o ba file dilo ka moka tše le...
4     ven  khomishini ya ndinganyiso ya mbeu yo ewa maana...

Test Dataset:
   index                                               text
0      1  Mmasepala, fa maemo a a kgethegileng a letlele...
1      2  Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2      3         Tshivhumbeo tshi fana na ngano dza vhathu.
3      4  Kube inja nelikati betingevakala kutsi titsini...
4      5                      Winste op buitelandse valuta.


#### Data Preprocessing

In [5]:
from sklearn.feature_extraction.text import CountVectorizer

def preprocess_data(train_df, test_df):
    """Turn the ``text`` column of both frames into token-count matrices.

    The vocabulary is learned from ``train_df['text']`` only, and
    ``test_df['text']`` is then encoded with that same vocabulary.

    Returns:
        A ``(train_features, test_features, vectorizer)`` tuple, where
        ``vectorizer`` is the fitted ``CountVectorizer``.
    """
    bow = CountVectorizer()

    # Learn the vocabulary and encode the training text in one pass.
    train_matrix = bow.fit_transform(train_df['text'])

    # Encode the test text with the vocabulary learned above.
    test_matrix = bow.transform(test_df['text'])

    return train_matrix, test_matrix, bow


In [6]:
# Vectorize the train and test text; the fitted vectorizer is kept so later
# cells can encode text with the same vocabulary.
train_features, test_features, vectorizer = preprocess_data(train_df, test_df)

In [7]:
from langdetect import detect

def identify_language(text):
    """Return the language code that ``langdetect.detect`` reports for ``text``.

    If detection raises any exception, the error is printed as
    ``Error: <message>`` and ``None`` is returned instead.
    """
    try:
        return detect(text)
    except Exception as err:
        print(f"Error: {err}")
    return None

# langdetect's detector is randomized, so short inputs can yield a different
# code on each run. Pinning the seed before the first detection makes this
# demo reproducible.
from langdetect import DetectorFactory

DetectorFactory.seed = 0

# Example texts; the trailing comment names the language each sample is meant
# to represent (not what the detector returns).
texts = [
    "Hello, how are you?",                    # English
    "¡Hola! ¿Cómo estás?",                    # Spanish
    "Bonjour, comment ça va?",                # French
    "Hallo, wie geht es dir?",                # German
    "Hoe gaan dit met jou?",                  # Afrikaans
    "Avuxeni, u njhani?",                     # Xitsonga
    "Yebo, kunjani?",                         # Zulu
    "Dumela, o kae?",                         # Setswana
    "Lefatshe la rona le kgalwa ke wena.",    # Sesotho
    "Molweni, unjani?",                       # isiXhosa
    "Salibonani, unjani?"                     # isiZulu
]

# Identifying languages for each text.
# NOTE: in the recorded run only texts 1-5 received the expected code; the
# remaining South African samples came back as "hr" or "sw". That is the
# motivation for training a dedicated classifier in the sections below.
for idx, text in enumerate(texts):
    language = identify_language(text)
    print(f"Text {idx + 1} is in {language} language.")


Text 1 is in en language.
Text 2 is in es language.
Text 3 is in fr language.
Text 4 is in de language.
Text 5 is in af language.
Text 6 is in hr language.
Text 7 is in sw language.
Text 8 is in hr language.
Text 9 is in sw language.
Text 10 is in sw language.
Text 11 is in sw language.


## Developing, Training, Evaluating and Validating Different Classification Models

#### Splitting the data

In [8]:
# Hold out 20% of the vectorized training rows as a validation split; the
# fixed random_state keeps the split reproducible.
X_train, X_val, y_train, y_val = train_test_split(
    train_features,
    train_df['lang_id'],
    test_size=0.2,
    random_state=42,
)


## Logistic Regression Classification Model

#### Using Cross Validation in order to validate the model

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, cross_val_predict
from sklearn.metrics import f1_score

# Initialize Logistic Regression model
linearR_model = LogisticRegression()

# Train the model using the training data
linearR_model.fit(X_train, y_train)

# Perform 5-fold cross-validation on the training split and collect the
# weighted F1 score of each fold.
f1_scores = cross_val_score(linearR_model, X_train, y_train, cv=5, scoring='f1_weighted')

# Calculate mean F1 score from cross-validation
mean_f1_score_cv = f1_scores.mean()
print("Mean F1 Score (Cross-validation):", mean_f1_score_cv)

# Predict on the validation set (X_val)
linearR_preds_val = linearR_model.predict(X_val)

# Calculate F1 score on the validation set
linearR_f1_val = f1_score(y_val, linearR_preds_val, average='weighted')
print("F1 Score on Validation Set:", linearR_f1_val)

# The competition test set (test_df) has no lang_id column, so an F1 score
# cannot be computed on it. The *_test names are kept as aliases of the
# validation results for anything that reads them, but no separate
# "Test Set" score is printed because it would only repeat the validation
# number under a misleading label.
linearR_preds_test = linearR_preds_val
linearR_f1_test = linearR_f1_val


## K Nearest Neighbors (KNN) Classification Model

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import f1_score, make_scorer
from sklearn.model_selection import cross_val_score, cross_val_predict

# Initialize KNN model
knn_model = KNeighborsClassifier()

# Define the scorer using the weighted F1 score
scorer = make_scorer(f1_score, average='weighted')

# Perform 5-fold cross-validation on the training split and collect the
# weighted F1 score of each fold.
knn_f1_scores = cross_val_score(knn_model, X_train, y_train, cv=5, scoring=scorer)

# Calculate mean F1 score from cross-validation
mean_f1_score_cv = knn_f1_scores.mean()
print("Mean F1 Score (Cross-validation):", mean_f1_score_cv)

# Out-of-fold predictions for the training split.
# NOTE(review): nothing in this cell reads knn_cv_predictions; drop this call
# if no other cell needs it, since it repeats the 5-fold fitting above.
knn_cv_predictions = cross_val_predict(knn_model, X_train, y_train, cv=5)

# Fit the model on the entire training split
knn_model.fit(X_train, y_train)

# Predict on the validation set (X_val)
knn_val_predictions = knn_model.predict(X_val)

# Calculate F1 score on the validation set
knn_f1_val = f1_score(y_val, knn_val_predictions, average='weighted')
print("F1 Score on Validation Set:", knn_f1_val)

# The competition test set (test_df) has no lang_id column, so an F1 score
# cannot be computed on it. The *_test names are kept as aliases of the
# validation results for anything that reads them, but no separate
# "Test Set" score is printed because it would only repeat the validation
# number under a misleading label.
knn_test_predictions = knn_val_predictions
knn_f1_test = knn_f1_val


## Support Vector Machine Classification Model


In [None]:
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.svm import SVC

# Build a default-parameter SVM and train it on the training split
# (fit returns the estimator itself).
svm = SVC().fit(X_train, y_train)

# Predict the validation split
svm_predictions = svm.predict(X_val)

# Weighted-average precision, recall and F1 on the validation split
precision, recall, svm_f1 = (
    score(y_val, svm_predictions, average='weighted')
    for score in (precision_score, recall_score, f1_score)
)

print("SVM Precision:", precision)
print("SVM Recall:", recall)
print("SVM F1 Score:", svm_f1)


In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt

# Initialize SVM classifier (you can adjust parameters as needed)
svm_classifier = SVC(kernel='rbf', C=1.0, gamma='scale')

svm_classifier.fit(X_train, y_train)

# Score this classifier's own validation predictions, so the metrics follow
# any change to the parameters above instead of reusing predictions left
# over from a different model.
svm_classifier_predictions = svm_classifier.predict(X_val)

# Evaluate the model
accuracy = accuracy_score(y_val, svm_classifier_predictions)
print(f"Accuracy: {accuracy}")

# Create a confusion matrix with rows and columns in the classifier's class order
class_labels = svm_classifier.classes_
conf_matrix = confusion_matrix(y_val, svm_classifier_predictions, labels=class_labels)

# Display the confusion matrix as a heatmap, labelling both axes with the
# language codes so each cell can be read without counting positions.
plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,
    cmap='Blues',
    fmt='g',
    cbar=False,
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')
plt.show()

##  Building the Multinomial Naive Bayes Model


In [None]:
# Fit Multinomial Naive Bayes on the training split (fit returns the
# estimator itself), then report its weighted F1 on the validation split.
nb = MultinomialNB().fit(X_train, y_train)
nb_predictions = nb.predict(X_val)
nb_f1 = f1_score(y_true=y_val, y_pred=nb_predictions, average='weighted')
print("Naive Bayes F1 Score:", nb_f1)


In [None]:
from sklearn.metrics import f1_score
from sklearn.model_selection import cross_val_score
from sklearn.naive_bayes import MultinomialNB

# Create the Multinomial Naive Bayes model and train it on the training split
nb = MultinomialNB()
nb.fit(X_train, y_train)

# Weighted F1 on the hold-out validation split
nb_predictions = nb.predict(X_val)
nb_f1 = f1_score(y_true=y_val, y_pred=nb_predictions, average='weighted')
print("Naive Bayes F1 Score:", nb_f1)

# Weighted F1 of each fold in a 5-fold cross-validation over the training split
f1_scores_cv = cross_val_score(
    estimator=nb,
    X=X_train,
    y=y_train,
    scoring='f1_weighted',
    cv=5,
)

# Average of the per-fold scores
mean_f1_score_cv = f1_scores_cv.mean()
print("Mean F1 Score (Cross-validation):", mean_f1_score_cv)

#### Generate predictions on the test set

In [None]:
# Using the nb (Multinomial Naive Bayes) model
# Converting the test text into token-count vectors with the fitted vectorizer
X_test = vectorizer.transform(test_df['text'])

# Generating test-set predictions with the Naive Bayes model
test_predictions = nb.predict(X_test)

In [None]:
# Using the svm model
# Converting the test text into token-count vectors with the fitted vectorizer
X_test = vectorizer.transform(test_df['text'])

# Generating test-set predictions with the SVM model
test_predictions = svm.predict(X_test) # svm model

In [None]:
# Using knn_model
# Converting the test text into token-count vectors with the fitted vectorizer
X_test = vectorizer.transform(test_df['text'])

# Generating test-set predictions with the KNN model
test_predictions = knn_model.predict(X_test)  # knn_model

#### Creating a csv for submission


In [None]:
# Pair each test row's 'index' with the predicted 'lang_id' and write the
# result to disk. This file is meant for the nb model, but it contains
# whatever test_predictions holds when the cell runs.
submission_df = test_df[['index']].assign(lang_id=test_predictions)

submission_df.to_csv('FinalSub1.csv', index=False)

In [None]:
# Pair each test row's 'index' with the predicted 'lang_id' and write the
# result to disk. This file is meant for the svm model, but it contains
# whatever test_predictions holds when the cell runs.
submission_df = test_df[['index']].assign(lang_id=test_predictions)

submission_df.to_csv('FinalSub2.csv', index=False)

In [None]:
# Pair each test row's 'index' with the predicted 'lang_id' and write the
# result to disk. This file is meant for knn_model, but it contains
# whatever test_predictions holds when the cell runs.
submission_df = test_df[['index']].assign(lang_id=test_predictions)

submission_df.to_csv('FinalSub3.csv', index=False)

In [None]:
# Creating a submission dataframe with 'index' and 'lang_id' columns
# NOTE(review): this cell is labelled "Decision tree" but sits before the
# Decision Tree section, so when the notebook runs top to bottom
# test_predictions still holds an earlier model's output here. The same
# FinalSub5.csv is written again after that section — confirm which write is
# intended.
submission_df = pd.DataFrame({'index': test_df['index'], 'lang_id': test_predictions})

submission_df.to_csv('FinalSub5.csv', index=False) #Decision tree

## Decision Tree Classification Model

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Split the vectorized training data into training and hold-out sets.
# The features come from train_features and the labels from the lang_id
# column, since no X or y variables are defined anywhere in this notebook.
X_train, X_test, y_train, y_test = train_test_split(
    train_features, train_df['lang_id'], test_size=0.2, random_state=42
)

# Initialize the Decision Tree Classifier (seeded for reproducible trees)
decision_tree = DecisionTreeClassifier(random_state=42)

# Train the Decision Tree Classifier
decision_tree.fit(X_train, y_train)

# Predict on the hold-out set
predictions = decision_tree.predict(X_test)

# Calculate accuracy
accuracy = accuracy_score(y_test, predictions)
print("Accuracy:", accuracy)

# Generate a classification report
print("Classification Report:")
print(classification_report(y_test, predictions))


In [None]:
# Using the Decision Tree model
# Converting the test text into token-count vectors with the fitted vectorizer
X_test = vectorizer.transform(test_df['text'])

# Generating test-set predictions with the decision tree. This cell and the
# submission it feeds are both labelled "Decision tree", so the tree is the
# model that must predict here rather than the logistic regression model.
test_predictions = decision_tree.predict(X_test)

In [None]:
# Pair each test row's 'index' with the predicted 'lang_id' and write the
# result to disk as the Decision tree submission.
submission_df = test_df[['index']].assign(lang_id=test_predictions)

submission_df.to_csv('FinalSub5.csv', index=False)

## AdaBoost Classification Model

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split

# Split the raw 'text' column and its 'lang_id' labels 80/20 (seeded).
X_train, X_test, y_train, y_test = train_test_split(
    train_df['text'],
    train_df['lang_id'],
    test_size=0.2,
    random_state=42,
)

# Learn the vocabulary from the training portion only, then encode both
# portions with that vocabulary.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_test_vectorized = vectorizer.transform(X_test)

# Fit a seeded AdaBoost classifier on the encoded training text.
adaboost_clf = AdaBoostClassifier(random_state=42)
adaboost_clf.fit(X_train_vectorized, y_train)

# Predict the held-out portion, then report accuracy and per-class metrics.
test_predictions = adaboost_clf.predict(X_test_vectorized)

accuracy = accuracy_score(y_test, test_predictions)
print("Accuracy:", accuracy)

print(
    "Classification Report:",
    classification_report(y_test, test_predictions),
    sep="\n",
)



In [None]:
# Using the AdaBoost classifier model
# Converting the test text into token-count vectors with the fitted vectorizer
X_test = vectorizer.transform(test_df['text'])

# Generating test-set predictions with the AdaBoost model
test_predictions = adaboost_clf.predict(X_test) 

In [None]:
# Pair each test row's 'index' with the predicted 'lang_id' and write the
# result to disk. This follows the AdaBoost prediction cell, so it is
# presumably the AdaBoost submission.
submission_df = test_df[['index']].assign(lang_id=test_predictions)

submission_df.to_csv('FinalSub6.csv', index=False)

## Neural Network Model Classifier

In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam

# Assuming train_df and test_df are your training and test datasets

# Preprocessing
X_train = train_df['text'].astype(str)

# sparse_categorical_crossentropy needs integer class indices, but lang_id
# holds string codes such as 'xho'. Encode them as 0..n_classes-1 and keep the
# encoder so predictions can be mapped back to language codes.
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(train_df['lang_id'])

# Splitting the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=42)

# Initializing CountVectorizer and fitting it on the training split only
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(X_train)
X_val_vectorized = vectorizer.transform(X_val)
# NOTE(review): these are SciPy sparse matrices; confirm the installed Keras
# version accepts them as model input, otherwise feed dense batches instead.

# Build the Neural Network model
model = Sequential()
model.add(Dense(128, activation='relu', input_shape=(X_train_vectorized.shape[1],)))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
# One softmax output unit per encoded language
model.add(Dense(len(label_encoder.classes_), activation='softmax'))

# Compile the model
optimizer = Adam(learning_rate=0.001)
model.compile(optimizer=optimizer, loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# Train the model
history = model.fit(X_train_vectorized, y_train, epochs=5, batch_size=32, validation_data=(X_val_vectorized, y_val))

# Evaluate the model on the validation set
loss, accuracy = model.evaluate(X_val_vectorized, y_val)
print(f'Validation Accuracy: {accuracy}')

# Predict the competition test set using the vectorizer fitted above
X_test = test_df['text'].astype(str)
X_test_vectorized = vectorizer.transform(X_test)
predictions = model.predict(X_test_vectorized)

# predictions holds one row of class probabilities per test text. Take the
# most likely class and decode it back to its lang_id code.
predicted_lang_ids = label_encoder.inverse_transform(np.argmax(predictions, axis=1))


In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Initialize and fit the CountVectorizer on the training data
# NOTE(review): this rebinds `vectorizer` to one fitted on every row of
# train_df, whereas the neural network above was sized from a vectorizer
# fitted on the 80% training split. The feature columns produced here
# presumably no longer match that network's input — confirm before passing
# this X_test_vectorized to `model`.
vectorizer = CountVectorizer()
X_train_vectorized = vectorizer.fit_transform(train_df['text'])

# Transform the test data
X_test_vectorized = vectorizer.transform(test_df['text'])


In [None]:
import numpy as np

# A submission needs one predicted lang_id per test row, not the vectorized
# feature matrix. `predictions` holds the network's class probabilities for
# each test text, so take the most likely class for every row.
# NOTE(review): assumes output unit i corresponds to the i-th lang_id in
# sorted order — verify against how the labels were encoded for training.
class_labels = np.sort(train_df['lang_id'].unique())
predicted_lang_ids = class_labels[np.argmax(predictions, axis=1)]

# Combining the 'index' column from test_df with the predicted language IDs
submission_df = pd.DataFrame({
    'index': test_df['index'].reset_index(drop=True),
    'lang_id': predicted_lang_ids,
})

# Saving the DataFrame to a CSV file without row indices
submission_df.to_csv('FinalSub7.csv', index=False)
