In [4]:
import pandas as pd
import csv
import os
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

In [2]:
def preprocess(data):
  data.insert(0, 'sentence', np.nan)
  indexes = data.index[data['word']=='।'].tolist()

  val = 0
  for id in indexes:
    data.loc[data.index[id], 'sentence'] = val
    val = val+1

  data =data.fillna(method ="bfill")
  data["sentence"] = LabelEncoder().fit_transform(data["sentence"])

  data.rename(columns={"sentence":"sentence_id","word":"words","O_BI":"labels"}, inplace =True)
  data["labels"] = data["labels"].str.upper()

  return data

In [3]:
def formatting(filename):
    with open(filename, "r", encoding="utf8") as in_file:
        buf = in_file.readlines()

    with open(filename, "w", encoding="utf8") as out_file:
        for line in buf:
            string = r"।"
            if string in line:
                line = line + "\n"
            out_file.write(line)


def csv_to_tsv(up_file_name):
    new_file_name = os.path.splitext(up_file_name)[0] + ".tsv"

    with open(up_file_name, 'r', encoding='utf-8') as csvin, open(new_file_name, 'w', newline='',
                                                                  encoding='utf-8') as tsvout:
        csvin = csv.reader(csvin)
        next(csvin, None)
        tsvout = csv.writer(tsvout, delimiter='\t')

        for row in csvin:
            tsvout.writerow(row)

    formatting(new_file_name)


def process_label(df):
    if df['O_BI'] == "B-PER":
        return "B-PER"
    elif df['O_BI'] == "I-PER":
        return "I-PER"
    else:
        return 'O'

In [None]:
def dataset_preparation(file_path, file_name):
  df_train = pd.read_csv(file_path)
  df_train['O_BI'] = df_train.apply(process_label, axis=1)
  df_updated = df_train.copy()
  df_updated.to_csv(file_name, index=False)
  csv_to_tsv(file_name)

In [None]:
dataset_preparation("/content/train_data_ner.csv", "train_up_ner.csv")

In [None]:
dataset_preparation("/content/test_data_ner.csv", "test_up_ner.csv")

In [None]:
data = pd.read_csv("/content/test_up_ner.csv")

label = data["O_BI"].unique().tolist()
label

['O', 'B-PER', 'I-PER']

In [None]:
def process_labels(x):
  if x['O_BI'] == "B-PER" : return  "B-PER"
  elif x['O_BI'] == "I-PER": return "I-PER"
  else:
    return 'O'


In [None]:
data['O_BI'] = data.apply(process_labels, axis=1)

In [None]:
label = data["O_BI"].unique().tolist()
label

['O', 'B-PER', 'I-PER']

In [None]:
stoi = dict()

with open('vocab.txt') as v:
    lines = v.readlines()

for line in lines:
  word_with_id = line.rstrip().split(" : ")
  # print(word_with_id[1])
  try:
    stoi[word_with_id[0]] = int(word_with_id[1])
  except:
    continue

stoi

{'<unk>': 0,
 '<pad>': 1,
 '।': 2,
 ',': 3,
 'ও': 4,
 'করে': 5,
 'হয়': 6,
 'থেকে': 7,
 'এ': 8,
 'না': 9,
 '(': 10,
 ')': 11,
 'করা': 12,
 'হয়েছে': 13,
 'এই': 14,
 'এক': 15,
 'বলেন': 16,
 'সঙ্গে': 17,
 'নিয়ে': 18,
 'এবং': 19,
 'জন্য': 20,
 'তিনি': 21,
 'করেন': 22,
 'হবে': 23,
 'গত': 24,
 'ছিল': 25,
 'কোনো': 26,
 'তাঁর': 27,
 'গতকাল': 28,
 'আর': 29,
 'করতে': 30,
 'একটি': 31,
 'মধ্যে': 32,
 'হয়ে': 33,
 'তবে': 34,
 'পর': 35,
 'আওয়ামী': 36,
 'বলে': 37,
 'কিন্তু': 38,
 'প্রথম': 39,
 'পুলিশ': 40,
 'ওই': 41,
 'দুই': 42,
 'সময়': 43,
 'এর': 44,
 'দেওয়া': 45,
 'নতুন': 46,
 'দেশের': 47,
 'বছর': 48,
 'রাজনৈতিক': 49,
 'লীগের': 50,
 'দলের': 51,
 'শুরু': 52,
 'করেছে': 53,
 'গেছে': 54,
 'হাজার': 55,
 'দিয়ে': 56,
 'জানান': 57,
 'কথা': 58,
 'বছরের': 59,
 ':': 60,
 'আরও': 61,
 'তাঁদের': 62,
 'নির্বাচন': 63,
 'করার': 64,
 'তার': 65,
 'অনেক': 66,
 'করেছেন': 67,
 'দিন': 68,
 'যে': 69,
 'পারে': 70,
 'শেষ': 71,
 'আমার': 72,
 'তা': 73,
 'হচ্ছে': 74,
 'কোটি': 75,
 'জাতীয়': 76,
 'যায়': 77,
 'জন': 78,
 'নেই': 79,


In [None]:
# !pip install bnlp_toolkit

In [None]:
!pip install bangla-stemmer

Collecting bangla-stemmer
  Downloading bangla_stemmer-1.0-py3-none-any.whl (9.1 kB)
Installing collected packages: bangla-stemmer
Successfully installed bangla-stemmer-1.0


In [None]:
from bangla_stemmer.stemmer import stemmer
sen = "সাইদুলকে কাস্টমার একশ টাকা বাকি দিলেন।"
## stemmer function
def stem_text(x):
  stmr = stemmer.BanglaStemmer()
  words=x.split(' ')
  print(words)
  stm = stmr.stem(words)
  words=(' ').join(stm)
  return words

words = stem_text(sen)
words

['সাইদুলকে', 'কাস্টমার', 'একশ', 'টাকা', 'বাকি', 'দিলেন।']
applied first rules..
applied fourth rules..
applied fourth rules..
applied fourth rules..


'সাইদুলকে কাস্টম একশ টাকা বাকি দিলেন।'

In [9]:
import nltk
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [17]:
lemmatizer = WordNetLemmatizer()
text = "সাইদুলকে কাস্টমার একশ টাকা বাকি দিলেন।"
sentences = ["মুনীর চৌধুরী এবং তানভীর হোসেন ছোটবেলার বন্ধু।", "সাইদুল সাহেব কাস্টমারকে একশ টাকা বাকি দিলেন।",
                 "আজ রাতে কোন রূপকথা নয়!"]
x=[]
for line in sentences:
  input_str = word_tokenize(line)
  print(input_str)
  for word in input_str:
      lemmatizer.lemmatize(word)
  x.insert(0, input_str)

x[::-1]


['মুনীর', 'চৌধুরী', 'এবং', 'তানভীর', 'হোসেন', 'ছোটবেলার', 'বন্ধু।']
['সাইদুল', 'সাহেব', 'কাস্টমারকে', 'একশ', 'টাকা', 'বাকি', 'দিলেন।']
['আজ', 'রাতে', 'কোন', 'রূপকথা', 'নয়', '!']


[['মুনীর', 'চৌধুরী', 'এবং', 'তানভীর', 'হোসেন', 'ছোটবেলার', 'বন্ধু।'],
 ['সাইদুল', 'সাহেব', 'কাস্টমারকে', 'একশ', 'টাকা', 'বাকি', 'দিলেন।'],
 ['আজ', 'রাতে', 'কোন', 'রূপকথা', 'নয়', '!']]

In [20]:
len("")

0