### Notes for the next time I resume the project
* Format the functions well
* Document the results
* Save the pickle files inside the functions
* Create scripts
* Don't start Streamlit until after payment

* Data name: WiLI-2018
* Source: [Kaggle – wili-2018](https://www.kaggle.com/datasets/sharansmenon/wili-2018?select=data.csv)
* The main language data. Contains about 200k instances for 235 languages.



In [95]:
#Load Libraries
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split 
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, f1_score, recall_score, precision_score
from sklearn.model_selection import GridSearchCV
import pickle


## Load Data

In [23]:
# Load the full corpus from the CSV file in the working directory.
# NOTE(review): the preview below shows an 'Unnamed: 0' column, which looks like a
# saved index that was read back as data — confirm whether it should be dropped.
main_data = pd.read_csv('data.csv')

In [24]:
# Preview the first rows to check the columns and the raw label format.
main_data.head()

Unnamed: 0,text,class
0,Klement Gottwaldi surnukeha palsameeriti ning ...,est\n
1,"Sebes, Joseph; Pereira Thomas (1961) (på eng)....",swe\n
2,भारतीय स्वातन्त्र्य आन्दोलन राष्ट्रीय एवम क्षे...,mai\n
3,"Après lo cort periòde d'establiment a Basilèa,...",oci\n
4,ถนนเจริญกรุง (อักษรโรมัน: Thanon Charoen Krung...,tha\n


## Data Cleaning
* Remove the trailing escape character (`\n`) from the end of each `class` label


In [34]:
def clean_data(dataframe: pd.DataFrame) -> pd.DataFrame:
    """
    Remove the trailing newline/whitespace from every label in the `class` column.

    Parameters: dataframe(pd.DataFrame): Main Downloaded Data. Its `class`
                column is overwritten in place.

    Return: pd.DataFrame (the same DataFrame object, with cleaned labels)

    The labels are cleaned with `str.rstrip()` instead of slicing off the last
    character, so the function is idempotent: running the cell twice no longer
    eats a real character of the language code, labels that never had a
    trailing newline are left untouched, and missing labels stay missing
    instead of raising.
    """
    dataframe['class'] = dataframe['class'].str.rstrip()

    print('Cleaning Completeed')
    return dataframe
    
    

## Data Preprocessing
* Keep only the rows labelled Afrikaans (`afr`), Spanish (`spa`), German (`deu`) and Alemannic German (`als`)
* Feature extraction (TF-IDF: Term Frequency–Inverse Document Frequency)

In [104]:
def extract_selected_language(dataframe: pd.DataFrame) -> pd.DataFrame:
    """
    Filter the corpus down to the four languages used in this project.

    Parameters: dataframe(pd.DataFrame) -> Main Data Frame; must have a `class`
                column holding the language codes

    Returns: pd.DataFrame -> the rows whose `class` is 'afr' (Afrikaans),
             'spa' (Spanish), 'als' (Alemannic German) or 'deu' (German),
             with their original index

    """
    wanted_codes = ['afr', 'spa', 'als', 'deu']
    keep_row = dataframe['class'].isin(wanted_codes)
    extracted_data = dataframe[keep_row]
    print('Succesfully Extracted Data')
    return extracted_data


def save_extracted_data_to_csv(dataframe: pd.DataFrame) -> None:
    """
    Write the extracted-language data to 'data_folder/extracted_data.csv'.

    Parameters: dataframe(pd.DataFrame) -> Dataframe of already extracted languages

    Return: None. The file is written without the DataFrame index, and the
    'data_folder' directory is created first when it does not exist yet.

    """
    from pathlib import Path  # local import so the cell works on its own

    output_path = Path('data_folder/extracted_data.csv')
    # to_csv does not create missing directories; without this a fresh checkout
    # (no data_folder/ yet) fails before anything is written.
    output_path.parent.mkdir(parents=True, exist_ok=True)
    dataframe.to_csv(output_path, index=False)
    print('Succesfully Saved To Csv')

In [105]:
# The raw labels still end with a newline ('est\n'), so they have to go through
# clean_data before they can match the plain language codes used by the filter;
# without this step a fresh "Restart & Run All" extracts zero rows.
# clean_data works on a copy so main_data stays raw and re-running this cell is safe.
save_extracted_data_to_csv(extract_selected_language(clean_data(main_data.copy())))

Succesfully Extracted Data
Succesfully Saved To Csv


In [106]:
# Read the extracted subset back from disk (the file written by
# save_extracted_data_to_csv) and preview its first rows.
extracted_data = pd.read_csv('data_folder/extracted_data.csv')
extracted_data.head()

Unnamed: 0,text,class
0,Uf di hitig Greßi isch s schließlig anne 1998 ...,als
1,"En Navidad de 1974, poco después de que interp...",spa
2,1499: D Schlacht im Schwaderloh im Thurgau goh...,als
3,Die geelblom (Cineraria saxifraga) is 'n klein...,afr
4,"Talle van mense, dikwels uit geïsoleerde gemee...",afr


In [101]:
def feature_extraction_et_label_encoding(dataframe: pd.DataFrame) -> list:
    """
    Turn the selected-language data into TF-IDF features and integer labels.

    Parameters:  dataframe: pd.DataFrame -> Data frame of selected language;
                 must have a `text` column and a `class` column

    Return: A list [text_vectors, language_label, language_label_mapping]
        * text_vectors -> dense array of TF-IDF features, one row per text
        * language_label -> integer-encoded labels from LabelEncoder
        * language_label_mapping -> dict mapping each original label to its code

    Receive a Dataframe and perform the following operations
    * Split to Features and Labels
    * Perform Label Encoding on the Label Section
    * Perform Feature Extraction using TF-IDF on the Feature Section
    * Extract Label Mapping
    * Save the label mapping and the fitted vectorizer under 'pickles/'
      (the directory is created when it does not exist yet)
    * Return A list Containing The Vectors, Labels and Label Mapping

    """
    from pathlib import Path  # local import so the cell works on its own

    # open(..., 'wb') does not create missing directories; make sure the
    # output folder exists before any fitting work is done.
    pickle_dir = Path('pickles')
    pickle_dir.mkdir(parents=True, exist_ok=True)

    # Split Data to Texts and Labels
    texts = dataframe['text']
    language_label = dataframe['class']

    # Encoding
    label_encoder = LabelEncoder()  # initialize Encoder
    language_label = label_encoder.fit_transform(language_label)
    language_label_mapping = dict(zip(label_encoder.classes_,
                                      label_encoder.transform(label_encoder.classes_)))

    # Save Encoding Dictionary
    with open(pickle_dir / 'language_label_mapping.pkl', 'wb') as mapping_file:
        pickle.dump(language_label_mapping, mapping_file)

    # Vectorization
    tfidf_vectorizer = TfidfVectorizer()  # Initialize the Vectorizer

    # NOTE(review): the vectorizer is fitted on every row passed in, i.e. before
    # any train/test split happens downstream — confirm this is intended.
    # NOTE(review): .toarray() densifies the sparse TF-IDF matrix; kept because
    # callers receive a dense array today, but it will not scale to many languages.
    tfidf_text_vectors = tfidf_vectorizer.fit_transform(texts).toarray()

    # Save Vectorizer
    with open(pickle_dir / 'tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
        pickle.dump(tfidf_vectorizer, vectorizer_file)

    print('Encoding And Preprocessing Completed')
    return [tfidf_text_vectors, language_label, language_label_mapping]

In [102]:
# Unpack the returned list: TF-IDF feature vectors, encoded labels and the
# label -> code mapping.
vectors, label, mapping = feature_extraction_et_label_encoding(extracted_data)

Encoding And Preprocessing Completed


### Splitting Data

In [82]:
def split_data(vectors: pd.Series, label: pd.Series) -> tuple:
    """
    Hold out 30 percent of the samples for testing and keep 70 percent for training.

    Parameters vectors(pd.Series) label(pd.Series)

    Return: A tuple (train_features, test_features, train_label, test_label)

    The split uses a fixed random_state of 3, so it is the same on every run.

    """
    split_parts = train_test_split(vectors, label, test_size=0.30, random_state=3)
    return tuple(split_parts)
    


In [83]:
def train_models(train_features: pd.Series,
                 test_features: pd.Series,
                 train_label: pd.Series,
                 test_label: pd.Series,
                 random_state: int = 3):

    """
    Train 3 Language Identification Models and predict on the test features.

    Parameters :
    train_features -> features to be used in training (pd.Series)
    test_features -> features the trained models predict on (pd.Series)
    train_label -> training labels (pd.Series)
    test_label -> testing labels (pd.Series); not used here, kept so existing
                  calls keep working
    random_state -> seed for the Random Forest so repeated runs give the same
                    predictions (int, default 3)

    Return A tuple with the test-set predictions of all models, in the order
    (naive bayes, logistic regression, random forest)

    """
    # Models Initialization (order matters: it is the order of the returned tuple)
    models = (
        MultinomialNB(),                                    # Naive Bayes
        LogisticRegression(),                               # Logistic Regression
        RandomForestClassifier(random_state=random_state),  # Random Forest Classifier
    )
    print('Models Initialization Completed')

    # Models Training
    for model in models:
        model.fit(train_features, train_label)
    print('Models Training Completed')

    # Models Predictions
    return tuple(model.predict(test_features) for model in models)


    
    

In [84]:

def evaluate_models(test_label: pd.Series,
                    naive_bayes_prediction: pd.Series,
                    logistic_regression_prediction: pd.Series,
                    random_forest_prediction: pd.Series):

    """
    Print weighted F1, recall and precision plus a classification report for each model.

    Parameters:
    - test_label: True labels for the test set (pd.Series)
    - naive_bayes_prediction: Predictions from the Naive Bayes model (pd.Series)
    - logistic_regression_prediction: Predictions from the Logistic Regression model (pd.Series)
    - random_forest_prediction: Predictions from the Random Forest model (pd.Series)

    Return:
    - None; the metrics are printed, with a dashed line between consecutive models.
    """

    predictions_by_model = (
        ('Naive Bayes', naive_bayes_prediction),
        ('Logistic Regression', logistic_regression_prediction),
        ('Random Forest', random_forest_prediction),
    )
    metrics = (
        ('F1-Score', f1_score),
        ('Recall', recall_score),
        ('Precision', precision_score),
    )

    for position, (model_name, prediction) in enumerate(predictions_by_model):
        # Separator goes between models only, never after the last one.
        if position > 0:
            print('-' * 50)

        print(f'{model_name} Model Evaluation:')
        for metric_name, metric_function in metrics:
            score = metric_function(test_label, prediction, average="weighted")
            print(f'{metric_name}: {score:.4f}')
        print('Classification Report:\n', classification_report(test_label, prediction))
    


In [86]:
# Split the TF-IDF vectors and encoded labels into train and test parts.
train_features, test_features, train_label, test_label = split_data(vectors,label)

# Fit the three baseline models and collect their predictions on the test features.
naive_bayes_pred, logistic_regression_pred, random_forest_pred = train_models(train_features, test_features, train_label, test_label)

# Call evaluate function
evaluate_models(test_label, naive_bayes_pred, logistic_regression_pred, random_forest_pred)


Models Initialization Completed
Models Training Completed
Naive Bayes Model Evaluation:
F1-Score: 0.9883
Recall: 0.9883
Precision: 0.9888
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       309
           1       1.00      0.96      0.98       297
           2       0.96      1.00      0.98       285
           3       1.00      0.99      1.00       309

    accuracy                           0.99      1200
   macro avg       0.99      0.99      0.99      1200
weighted avg       0.99      0.99      0.99      1200

--------------------------------------------------
Logistic Regression Model Evaluation:
F1-Score: 0.9842
Recall: 0.9842
Precision: 0.9843
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       309
           1       0.98      0.97      0.97       297
           2       0.96      0.99      0.97       285
           3       0.99

In [92]:
def grid_search_tuning(train_features: pd.Series,
                       test_features: pd.Series,
                       train_label: pd.Series,
                       test_label: pd.Series):
    """
    Tune the Naive Bayes smoothing parameter with GridSearchCV and report test metrics.

    Only the Multinomial Naive Bayes model is tuned here: `alpha` is searched
    over 0.1, 0.5 and 1.0 with 5-fold cross-validation scored by weighted F1.

    Parameters:
    - train_features: Training features (pd.Series)
    - test_features: Test features the tuned model predicts on (pd.Series)
    - train_label: Training labels (pd.Series)
    - test_label: True labels for the test set (pd.Series)

    Returns:
    - The fitted GridSearchCV object (the best parameters and test metrics are printed)
    """

    # Candidate values for the Naive Bayes smoothing parameter
    alpha_grid = {'alpha': [0.1, 0.5, 1.0]}

    print('Starting HyperParameter Tuning')
    naive_bayes_grid = GridSearchCV(estimator=MultinomialNB(),
                                    param_grid=alpha_grid,
                                    cv=5,
                                    scoring='f1_weighted')
    naive_bayes_grid.fit(train_features, train_label)
    tuned_predictions = naive_bayes_grid.predict(test_features)
    print(f'Best Params for Naive Bayes: {naive_bayes_grid.best_params_}')

    print('Naive Bayes Model Grid Search Evaluation:')
    for metric_name, metric_function in (('F1-Score', f1_score),
                                         ('Recall', recall_score),
                                         ('Precision', precision_score)):
        score = metric_function(test_label, tuned_predictions, average="weighted")
        print(f'{metric_name}: {score:.4f}')
    print('Classification Report:\n', classification_report(test_label, tuned_predictions))

    return naive_bayes_grid

In [93]:
# Tune Naive Bayes on the training split; `naivebayes` holds the fitted
# grid-search object that grid_search_tuning returns.
naivebayes = grid_search_tuning(train_features, test_features, train_label, test_label)

Starting HyperParameter Tuning
Best Params for Naive Bayes: {'alpha': 0.1}
Naive Bayes Model Grid Search Evaluation:
F1-Score: 0.9933
Recall: 0.9933
Precision: 0.9935
Classification Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       309
           1       1.00      0.98      0.99       297
           2       0.98      1.00      0.99       285
           3       1.00      1.00      1.00       309

    accuracy                           0.99      1200
   macro avg       0.99      0.99      0.99      1200
weighted avg       0.99      0.99      0.99      1200



In [99]:
# Persist the tuned model for later use.
# NOTE(review): what gets pickled is the object returned by grid_search_tuning
# (the fitted grid search), not a bare MultinomialNB — confirm the loader expects that.
with open('pickles/final_naivebaye_model.pkl','wb') as file:
    pickle.dump(naivebayes, file)