<a href="https://colab.research.google.com/github/Fsrkh/Persian-dates-NLP/blob/main/persian_dates.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dropout, Dense, BatchNormalization
from keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from sklearn.utils.class_weight import compute_class_weight # Import the necessary function
import tensorflow as tf
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.callbacks import EarlyStopping
import re

Import data

In [4]:
# Read the date table from dates.csv (path is relative to the working
# directory) and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='dates.csv')
data.head(n=5)

Unnamed: 0,Formal Date,Informal Date 1,Informal Date 2,Informal Date 3,Informal Date 4,Informal Date 5,Informal Date 6,Informal Date 7,Informal Date 8,Informal Date 9
0,1400/01/01,اول فروردین هزار و چهارصد,یک فروردین هزار و چهارصد,یک فروردین ۱۴۰۰,فروردین یک ۱۴۰۰,اول فروردین ۱۴۰۰,یکِ فروردین ۱۴۰۰,یکم فروردین ۱۴۰۰,اول فروردین ماه ۱۴۰۰,فروردینِ یک ۱۴۰۰
1,1400/02/10,دهم اردیبهشت هزار و چهارصد,ده اردیبهشت هزار و چهارصد,ده اردیبهشت ۱۴۰۰,اردیبهشت ده ۱۴۰۰,دهم اردیبهشت ۱۴۰۰,دهِ اردیبهشت ۱۴۰۰,دهم اردیبهشت ماه ۱۴۰۰,اردیبهشتِ ده ۱۴۰۰,ده اردیبهشت ۱۴۰۰
2,1401/03/05,پنجم خرداد هزار و چهارصد و یک,پنج خرداد هزار و چهارصد و یک,پنج خرداد ۱۴۰۱,خرداد پنج ۱۴۰۱,پنجم خرداد ۱۴۰۱,پنجِ خرداد ۱۴۰۱,پنجم خرداد ماه ۱۴۰۱,خردادِ پنج ۱۴۰۱,پنج خرداد ۱۴۰۱
3,1401/04/15,پانزدهم تیر هزار و چهارصد و یک,پانزده تیر هزار و چهارصد و یک,پانزده تیر ۱۴۰۱,تیر پانزده ۱۴۰۱,پانزدهم تیر ۱۴۰۱,پانزدهِ تیر ۱۴۰۱,پانزدهم تیر ماه ۱۴۰۱,تیرِ پانزده ۱۴۰۱,پانزده تیر ۱۴۰۱
4,1402/05/20,بیستم مرداد هزار و چهارصد و دو,بیست مرداد هزار و چهارصد و دو,بیست مرداد ۱۴۰۲,مرداد بیست ۱۴۰۲,بیستم مرداد ۱۴۰۲,بیستِ مرداد ۱۴۰۲,بیستم مرداد ماه ۱۴۰۲,مردادِ بیست ۱۴۰۲,بیست مرداد ۱۴۰۲


In [5]:
# Count the NaN cells in each column and print the per-column totals.
missing_values = data.isna().sum()
print("Missing values in each column:\n", missing_values)

Missing values in each column:
 Formal Date        0
Informal Date 1    0
Informal Date 2    0
Informal Date 3    0
Informal Date 4    0
Informal Date 5    0
Informal Date 6    0
Informal Date 7    0
Informal Date 8    0
Informal Date 9    0
dtype: int64


In [6]:
# Join the per-variant "Informal Date N" columns of each row into one
# space-separated string. The columns are selected by name (note the trailing
# space in the prefix) rather than by position, so that:
#   * re-running this cell does not fold the derived 'Informal Dates' column
#     back into itself, and
#   * the 'Formal Date' label can never end up inside the input text if the
#     CSV carries an extra leading index column.
variant_columns = [col for col in data.columns if col.startswith('Informal Date ')]
data['Informal Dates'] = data[variant_columns].apply(lambda row: ' '.join(row.dropna()), axis=1)

# Target: the formal date string of each row.
y = data['Formal Date']

# Input: the joined informal variants of each row.
X = data['Informal Dates']

In [7]:
def convert_persian_numbers(text):
    """Replace Persian number words in ``text`` with ASCII digits.

    Two passes are made, in this order:

    1. Compound numbers of the form ``<tens> و <unit>`` (e.g. "بیست و یک")
       are replaced by their sum ("21"). This pass must run first: once the
       individual words have been turned into digits the compound pattern can
       no longer be recognised.
    2. Every remaining whole word found in the lookup table is replaced by
       its digit string.

    Only whole words are replaced (``\\b`` boundaries), so longer words that
    merely contain a number word -- ordinals such as "دهم" or hundreds such
    as "چهارصد" -- are left unchanged.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        The text with the recognised number words replaced by digits.
    """
    persian_numbers = {
        "یک": "1", "اول": "1", "دو": "2", "سه": "3", "چهار": "4", "پنج": "5",
        "شش": "6", "هفت": "7", "هشت": "8", "نه": "9", "ده": "10",
        "یازده": "11", "دوازده": "12", "سیزده": "13", "چهارده": "14", "پانزده": "15",
        "شانزده": "16", "هفده": "17", "هجده": "18", "نوزده": "19",
        "بیست": "20", "سی": "30", "چهل": "40", "پنجاه": "50", "شصت": "60",
        "هفتاد": "70", "هشتاد": "80", "نود": "90", "صد": "100", "هزار": "1000"
    }

    # Tens words that may be followed by "و <unit>".
    tens_words = ("بیست", "سی", "چهل", "پنجاه", "شصت", "هفتاد", "هشتاد", "نود")
    # Only the units 1-9 can complete a tens word; pairing a tens word with
    # 10, 20, 1000, ... would produce numbers that do not exist.
    unit_words = ("یک", "دو", "سه", "چهار", "پنج", "شش", "هفت", "هشت", "نه")

    def handle_compound_numbers(text):
        """Replace every "<tens> و <unit>" phrase with the summed number."""
        compound_pattern = (
            r'\b(' + '|'.join(tens_words) + r')\s+و\s+(' + '|'.join(unit_words) + r')\b'
        )

        def to_digits(match):
            tens_value = int(persian_numbers[match.group(1)])
            unit_value = int(persian_numbers[match.group(2)])
            return str(tens_value + unit_value)

        return re.sub(compound_pattern, to_digits, text)

    # Compounds first, while the number words are still present in the text.
    text = handle_compound_numbers(text)

    for word, digit in persian_numbers.items():
        text = re.sub(r'\b' + word + r'\b', digit, text)

    return text

In [8]:
# Convert number words in every column whose name contains "Informal Date".
# That substring also matches the joined "Informal Dates" column, so it is
# converted as well.
columns_to_convert = [col for col in data.columns if "Informal Date" in col]
for col in columns_to_convert:
    data[col] = data[col].apply(convert_persian_numbers)

In [9]:
def preprocess_text(text):
    """Normalise a text string before tokenisation.

    The text is lower-cased, every character that is neither a word character
    nor whitespace is deleted, runs of whitespace are collapsed to a single
    space, and leading/trailing whitespace is stripped.

    Punctuation is deleted *before* whitespace is collapsed; in the opposite
    order, deleting a free-standing punctuation mark ("a , b") would leave a
    double space behind.

    Parameters
    ----------
    text : str
        Input text.

    Returns
    -------
    str
        The normalised text.
    """
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [10]:
# Clean the joined text column with preprocess_text.
# NOTE(review): `X` was bound to data['Informal Dates'] in an earlier cell.
# This statement assigns a new column to `data`; with current pandas that does
# not update the Series object `X` refers to, so `X` (which is what the
# tokenizer is fitted on) likely still holds the text as it was before the
# number conversion and this cleaning -- verify, and re-bind `X` after this
# cell if the cleaned text is meant to be the model input.
data['Informal Dates'] = data['Informal Dates'].apply(preprocess_text)

In [11]:
# Number of target rows.
y.shape

(138,)

In [12]:
# Number of input rows; should equal the number of targets.
X.shape

(138,)

# Tokenization and Encoding

In [13]:
# Build the word-to-index vocabulary from the input texts in X.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(X)

In [38]:
# Encode each text in X as a list of vocabulary indices.
X_sequences = tokenizer.texts_to_sequences(X)

In [15]:
# Pad every encoded sequence with trailing zeros up to the length of the
# longest one, giving a rectangular integer array.
max_length = max(map(len, X_sequences))
X_padded = pad_sequences(X_sequences, maxlen=max_length, padding='post')
X_padded

array([[24, 10,  3, ...,  0,  0,  0],
       [26, 12,  3, ...,  0,  0,  0],
       [27, 13,  3, ...,  0,  0,  0],
       ...,
       [26, 18,  3, ...,  0,  0,  0],
       [30, 19,  3, ...,  0,  0,  0],
       [25, 20,  3, ...,  0,  0,  0]], dtype=int32)

In [16]:
# Map each distinct formal-date string to an integer class id.
# NOTE(review): the displayed ids run over 0..137 for 138 rows, i.e. every row
# is its own class, so each class has exactly one example.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)
y_encoded

array([ 90,  91,  92,  93,  94,  95,  96,  97,  98,  99, 100, 101, 102,
       103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115,
       116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128,
       129, 130, 131, 132, 133, 134, 135, 136, 137,  10,  11,  12,  13,
        14,  15,  16,  17,  18,  19,  20,  21,  22,  23,  24,  25,  26,
        27,  28,  29,  30,  31,  32,  33,  34,  35,  36,  37,  38,  39,
        40,  41,  42,  43,  44,  45,  46,  47,  48,  49,  50,  51,  52,
        53,  54,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,  65,
        66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
        79,  80,  81,  82,  83,  84,  85,  86,  87,  88,  89,   0,   1,
         2,   3,   4,   5,   6,   7,   8,   9])

In [17]:
# Hold out 30% of the rows, with a fixed seed for a repeatable split.
# NOTE(review): if every row has a unique label (as the label-encoder output
# suggests), no label in the held-out split occurs in the training split, so
# held-out accuracy cannot rise above zero -- confirm the task formulation.
X_train, X_test, y_train, y_test = train_test_split(
    X_padded,
    y_encoded,
    test_size=0.3,
    random_state=42,
)

In [18]:
# Inspect the padded training inputs.
X_train

array([[ 27,  10,   3, ...,   0,   0,   0],
       [ 26,  15,   3, ...,   0,   0,   0],
       [  2,   1,  27, ...,   8,  21, 138],
       ...,
       [ 25,  13,   3, ...,   0,   0,   0],
       [ 24,  19,   3, ...,   0,   0,   0],
       [ 31,  17,   3, ...,   0,   0,   0]], dtype=int32)

In [19]:
# Inspect the encoded training labels.
y_train

array([ 22,  86,  56, 134, 105,  81, 129, 112,  17,   1,  60, 137, 120,
        43,   5,  99, 123,  72, 115,  46,  24, 118,  28,  31,  15,  95,
        40,  45,  11, 125, 124,  57,  97, 133,  32,  44,  77,   2,  98,
       103,  62,  93, 107, 128,  34,  74,  96,  55,  85,  76,  16,  12,
        58, 136,  59,  23,   7,  41,  75,  53, 131,  20,  52,  10,  50,
        80,  84,  19,  37, 122,   4,  21,  25,  82, 127, 119,  69,   0,
        91,  14, 111,  92, 113,  65,  61,  78,  49,  36,  83,   9, 110,
        33,  68, 104,  54,  64])

In [20]:
# Report the shape of each array produced by the split.
for split_name, split_array in (
    ("X_train", X_train),
    ("y_train", y_train),
    ("X_test", X_test),
    ("y_test", y_test),
):
    print(f"Shape of {split_name}:", split_array.shape)

Shape of X_train: (96, 58)
Shape of y_train: (96,)
Shape of X_test: (42, 58)
Shape of y_test: (42,)


# Building and Training the model

In [21]:
# Multiply the learning rate by 0.2 whenever val_loss has not improved for
# 3 epochs, never going below 1e-5.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=1e-5)

In [22]:
# Stop training after 5 epochs without val_loss improvement and roll the
# model back to the weights of the best epoch.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

In [23]:
model = Sequential()
# Declare the input shape with an explicit Input layer instead of the
# Embedding `input_length` argument, which is deprecated (and ignored) in
# Keras 3. With the shape known up front the model is built immediately, so
# `model.summary()` can report layer output shapes and parameter counts.
model.add(tf.keras.Input(shape=(max_length,)))
# One 128-dimensional vector per token id; +1 because index 0 is the padding id
# and the tokenizer's word indices start at 1.
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=128))

# First recurrent layer returns the full sequence so it can feed the second.
model.add(Bidirectional(LSTM(64, return_sequences=True, kernel_regularizer=l2(0.001))))
model.add(Dropout(0.4))

# Second recurrent layer reduces the sequence to a single vector.
model.add(Bidirectional(LSTM(32, kernel_regularizer=l2(0.001))))
model.add(Dropout(0.4))

# One softmax output per encoded formal-date class.
model.add(Dense(len(label_encoder.classes_), activation='softmax'))



In [24]:
# Adam optimizer with default settings.
# NOTE(review): this instance is not referenced by the compile cell, which
# constructs its own Adam(learning_rate=1e-3); it appears to be unused.
adam_optimizer = Adam()

In [25]:
# Integer class ids are used as targets, hence the sparse cross-entropy loss.
model.compile(
    optimizer=Adam(learning_rate=1e-3),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy'],
)

In [26]:
# Print the layer-by-layer overview of the model.
model.summary()

In [27]:
# NOTE(review): these two statements repeat, with identical arguments, the
# callback definitions made in two earlier cells; one set is redundant.
# Stop after 5 epochs without val_loss improvement, restoring the best weights.
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
# Scale the learning rate by 0.2 after 3 stagnant epochs, with a floor of 1e-5.
reduce_lr = ReduceLROnPlateau(monitor='val_loss', factor=0.2, patience=3, min_lr=1e-5)

In [34]:
# Balanced weights for the classes that actually occur in the training split.
train_classes = np.unique(y_train)
class_weights = compute_class_weight('balanced', classes=train_classes, y=y_train)

# Key each weight by its real encoded label. Enumerating the weight array
# would key them 0..len(train_classes)-1, which is the position inside
# `train_classes`, not the label itself, so weights would be attached to the
# wrong classes. Classes absent from the training split get a neutral weight
# of 1.0 so that the mapping covers every output class.
class_weights_dict = {class_id: 1.0 for class_id in range(len(label_encoder.classes_))}
class_weights_dict.update(
    {int(class_id): float(weight) for class_id, weight in zip(train_classes, class_weights)}
)

# Train with the held-out split as validation data; the callbacks lower the
# learning rate and stop early based on val_loss.
history = model.fit(X_train, y_train, validation_data=(X_test, y_test), epochs=100, batch_size=16,
                    class_weight=class_weights_dict, callbacks=[early_stopping, reduce_lr])

Epoch 1/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 106ms/step - accuracy: 0.0318 - loss: 4.6505 - val_accuracy: 0.0000e+00 - val_loss: 6.0725 - learning_rate: 1.0000e-05
Epoch 2/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 94ms/step - accuracy: 0.0586 - loss: 4.7447 - val_accuracy: 0.0000e+00 - val_loss: 6.0707 - learning_rate: 1.0000e-05
Epoch 3/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 94ms/step - accuracy: 0.0234 - loss: 4.6617 - val_accuracy: 0.0000e+00 - val_loss: 6.0645 - learning_rate: 1.0000e-05
Epoch 4/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 109ms/step - accuracy: 0.0070 - loss: 4.7064 - val_accuracy: 0.0000e+00 - val_loss: 6.0553 - learning_rate: 1.0000e-05
Epoch 5/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 351ms/step - accuracy: 0.0107 - loss: 4.6675 - val_accuracy: 0.0000e+00 - val_loss: 6.0530 - learning_rate: 1.0000e-05
Epoch 6/100
[1m6/6[0m [32m━━━━━━━━

# Prediction

In [35]:
def predict_formal_date(informal_date):
    """Predict the formal date for a single informal date string.

    The text is cleaned with ``preprocess_text``, encoded with the fitted
    ``tokenizer``, zero-padded at the end to ``max_length`` and passed to
    ``model``. The index of the highest score is mapped back to its label
    with ``label_encoder``.

    Parameters
    ----------
    informal_date : str
        Informal date text.

    Returns
    -------
    The output of ``label_encoder.inverse_transform`` for the single
    predicted class index; callers read the label with ``[0]``.

    NOTE(review): only ``preprocess_text`` is applied here;
    ``convert_persian_numbers`` is not. Confirm this matches how the text the
    tokenizer was fitted on had been prepared.
    """
    cleaned = preprocess_text(informal_date)
    encoded = tokenizer.texts_to_sequences([cleaned])
    model_input = pad_sequences(encoded, maxlen=max_length, padding='post')
    scores = model.predict(model_input)
    best_class = np.argmax(scores)
    return label_encoder.inverse_transform([best_class])

In [37]:
# Run the model on one example informal date and print the predicted label.
new_informal_date = "دهم اردیبهشت هزار و چهارصد"
predicted_date = predict_formal_date(new_informal_date)
# predict_formal_date returns a one-element sequence; take its only item.
print(f"Predicted formal date: {predicted_date[0]}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
Predicted formal date: 1300/01/01
