In [57]:
# python=3.8
# conda env name : gender_pred_env

# Packages

In [58]:
import mlflow
import mlflow.keras
from mlflow.models.signature import infer_signature
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential, Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, concatenate, Dropout
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.regularizers import l2
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
import matplotlib.pyplot as plt
import seaborn as sns

import warnings
warnings.filterwarnings('ignore')

In [59]:
# Load the preprocessed dataset produced by the earlier preprocessing step.
# NOTE: unpickling can execute arbitrary code - only load pickle files you trust.
dataset_path = 'data/dataset_after_preporcessing.pkl'
df = pd.read_pickle(dataset_path)

# Peek at the first rows
df.head()

Unnamed: 0,name,sex,last_letter,last_two_letters,first_letter,first_two_letters,name_length
0,ابتسام,0,م,ام,ا,اب,6
1,ابتهاج,0,ج,اج,ا,اب,6
2,ابتهال,0,ل,ال,ا,اب,6
3,اجتهاد,0,د,اد,ا,اج,6
4,ازدهار,0,ر,ار,ا,از,6


In [60]:
# Longest name, in characters; used later as the fixed sequence length
max_name_length = max(len(name) for name in df['name'])

# Every distinct character that occurs in any name
unique_chars = {char for name in df['name'] for char in name}

# One slot more than the number of distinct characters
# (presumably to keep index 0 free for padding - confirm against the tokenizer)
vocab_size = 1 + len(unique_chars)

max_name_length, vocab_size

(17, 41)

In [61]:
# Letter-based columns that are label-encoded to integers and then min-max scaled.
encoded_columns = ['last_letter', 'last_two_letters', 'first_letter', 'first_two_letters']

# Keep one fitted encoder/scaler per column. Refitting a single shared instance
# (as was done before) discards the mapping of every column except the last one,
# which makes it impossible to encode new names consistently at inference time.
label_encoders = {}
scalers = {}

# NOTE(review): the transformers are fit on the full dataset, i.e. before the
# train/validation/test split performed later in the notebook, so the held-out
# rows influence the encoding. Consider fitting on the training rows only.
for column in encoded_columns:
    label_encoder = LabelEncoder()
    scaler = MinMaxScaler()

    # String category -> integer code
    df[column] = label_encoder.fit_transform(df[column])
    # Integer code -> value scaled by the fitted MinMaxScaler
    df[column] = scaler.fit_transform(df[[column]])

    label_encoders[column] = label_encoder
    scalers[column] = scaler

# After the loop `label_encoder` and `scaler` still refer to the instances fitted
# on the last column, exactly as before this change.

df.head()

Unnamed: 0,name,sex,last_letter,last_two_letters,first_letter,first_two_letters,name_length
0,ابتسام,0,0.823529,0.102857,0.15625,0.080702,6
1,ابتهاج,0,0.264706,0.066667,0.15625,0.080702,6
2,ابتهال,0,0.794118,0.100952,0.15625,0.080702,6
3,اجتهاد,0,0.352941,0.072381,0.15625,0.085965,6
4,ازدهار,0,0.411765,0.07619,0.15625,0.096491,6


In [62]:
# Turn each name into a fixed-length sequence of character ids for the LSTM input
name_texts = df['name']

# char_level=True: every character (not every word) is a token
tokenizer = Tokenizer(
    num_words=vocab_size,
    char_level=True,
)
tokenizer.fit_on_texts(name_texts)

# Names -> lists of character ids, then pad every list to the longest name
sequences = tokenizer.texts_to_sequences(name_texts)
padded_sequences = pad_sequences(sequences, maxlen=max_name_length)

In [63]:
# Engineered (non-sequence) features that accompany the character sequence
additional_feature_columns = [
    'last_letter',
    'last_two_letters',
    'first_letter',
    'first_two_letters',
    'name_length',
]

# Plain float32 matrix, one row per name
additional_features = df[additional_feature_columns].values.astype('float32')

In [64]:
# Design matrix: padded character ids first, engineered features after them
feature_blocks = (padded_sequences, additional_features)
X = np.concatenate(feature_blocks, axis=1)

# Target column
y = df['sex']

X.shape, y.shape

((8665, 22), (8665,))

In [65]:
# Model hyperparameters, kept together so they can be tuned in one place.
# NOTE(review): the XGBoost cell does not read any of these; presumably they
# configure the Keras model defined elsewhere in the notebook - confirm.

# Network shape
embedding_dim = 512
lstm_units = 64

# Regularisation (an L2 factor of 0.0 means no L2 penalty)
dropout_rate = 0.2
l2_lambda = 0.0

# Training loop
batch_size = 32
epochs = 24

In [66]:
# Seed shared by both splits so the partition is reproducible
split_seed = 11

# Step 1: hold out 20% of the rows; the remaining 80% is the training set
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y,
    test_size=0.2,
    random_state=split_seed,
)

# Step 2: cut the held-out 20% in half -> validation and test sets
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp,
    test_size=0.5,
    random_state=split_seed,
)

# Shapes of every piece, in train / val / test order
tuple(part.shape for part in (X_train, y_train, X_val, y_val, X_test, y_test))

((6932, 22), (6932,), (866, 22), (866,), (867, 22), (867,))

In [67]:
def _split_sequence_and_features(matrix, seq_len=max_name_length):
    """Split `matrix` column-wise into (first `seq_len` columns, remaining columns)."""
    return matrix[:, :seq_len], matrix[:, seq_len:]


# The first max_name_length columns hold the padded character ids,
# the columns after them hold the additional features.
X_train_seq, X_train_feat = _split_sequence_and_features(X_train)
X_val_seq, X_val_feat = _split_sequence_and_features(X_val)
X_test_seq, X_test_feat = _split_sequence_and_features(X_test)

In [68]:
# Row counts: whole dataset and each split
total_size = len(X)
train_size, val_size, test_size = len(X_train), len(X_val), len(X_test)


def _percent_of_total(count):
    """Return `count` as a percentage of `total_size`, rounded to 3 decimal places."""
    return round((count / total_size) * 100, 3)


# Share of the data that landed in each split
train_percentage = _percent_of_total(train_size)
val_percentage = _percent_of_total(val_size)
test_percentage = _percent_of_total(test_size)

train_percentage, val_percentage, test_percentage

(80.0, 9.994, 10.006)

In [69]:
# Address of the MLflow tracking server that runs are reported to
tracking_uri = "http://127.0.0.1:5000"
mlflow.set_tracking_uri(tracking_uri)

In [70]:
# All runs in this notebook are grouped under this experiment
experiment_name = "Gender Prediction Models Tracking"
mlflow.set_experiment(experiment_name)

<Experiment: artifact_location='mlflow-artifacts:/237850698007442871', creation_time=1704716553191, experiment_id='237850698007442871', last_update_time=1704716553191, lifecycle_stage='active', name='Gender Prediction Models Tracking', tags={}>

In [71]:

# Initialize XGBoost classifier
xgb_model = xgb.XGBClassifier(
    objective='binary:logistic',
    eval_metric='logloss',
    use_label_encoder=False,
    early_stopping_rounds=10,  # stop when validation logloss has not improved for 10 rounds
)

# Start MLflow run (the run is closed automatically when the `with` block exits)
with mlflow.start_run():
    # Log every hyperparameter of the XGBoost estimator
    mlflow.log_params(xgb_model.get_params())

    # Train the model (note that XGBoost doesn't use epochs or batch_size);
    # the validation split is the evaluation set that drives early stopping
    xgb_model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],
    )

    # Predict once per split and reuse the predictions for every metric
    train_pred = xgb_model.predict(X_train)
    val_pred = xgb_model.predict(X_val)
    y_pred = xgb_model.predict(X_test)

    # Compute metrics
    train_accuracy = accuracy_score(y_train, train_pred)
    val_accuracy = accuracy_score(y_val, val_pred)
    test_accuracy = accuracy_score(y_test, y_pred)
    val_f1 = f1_score(y_val, val_pred)
    f1 = f1_score(y_test, y_pred)  # F1 on the held-out test split

    # Log metrics. Each key names the split it was computed on: the test-split F1
    # used to be logged as "val_f1_score", which mislabelled it.
    mlflow.log_metrics({
        "train_accuracy": train_accuracy,
        "val_accuracy": val_accuracy,
        "test_accuracy": test_accuracy,
        "val_f1_score": val_f1,
        "test_f1_score": f1,
    })

    # Log the model - note that this is different for XGBoost
    mlflow.xgboost.log_model(xgb_model, "model")

    # Log additional information; the target column `sex` is not a feature,
    # so it is left out of the "Features" tag
    mlflow.set_tags({
        "Description": "XGBoost binary classifier",
        "Features": ', '.join(col for col in df.columns.tolist() if col != 'sex'),
        "Encoding" : "Char Level For names | Label Encoding for other featuers",
        "Model Type": "XGBoost"
    })

# Output the results (the F1 score shown is the test-split F1)
print('-----------------------------------------------------------')
print(f"Train Accuracy: {round(train_accuracy, 3)}")
print(f"Validation Accuracy: {round(val_accuracy, 3)}")
print(f"Test Accuracy: {round(test_accuracy, 3)}")
print(f"F1 Score: {round(f1, 3)}")
print('-----------------------------------------------------------\n')

[0]	validation_0-logloss:0.49214
[1]	validation_0-logloss:0.45760
[2]	validation_0-logloss:0.42518
[3]	validation_0-logloss:0.40201
[4]	validation_0-logloss:0.39135
[5]	validation_0-logloss:0.38311
[6]	validation_0-logloss:0.37535
[7]	validation_0-logloss:0.36499
[8]	validation_0-logloss:0.36185
[9]	validation_0-logloss:0.35836
[10]	validation_0-logloss:0.35265
[11]	validation_0-logloss:0.34794
[12]	validation_0-logloss:0.34557
[13]	validation_0-logloss:0.34352
[14]	validation_0-logloss:0.34339
[15]	validation_0-logloss:0.34116
[16]	validation_0-logloss:0.33945
[17]	validation_0-logloss:0.33838
[18]	validation_0-logloss:0.33826
[19]	validation_0-logloss:0.33568
[20]	validation_0-logloss:0.33501
[21]	validation_0-logloss:0.33465
[22]	validation_0-logloss:0.33467
[23]	validation_0-logloss:0.33572
[24]	validation_0-logloss:0.33515
[25]	validation_0-logloss:0.33385
[26]	validation_0-logloss:0.33366
[27]	validation_0-logloss:0.33455
[28]	validation_0-logloss:0.33419
[29]	validation_0-loglos

-----------------------------------------------------------
Train Accuracy: 0.92
Validation Accuracy: 0.858
Test Accuracy: 0.849
F1 Score: 0.901
-----------------------------------------------------------



''