<a href="https://colab.research.google.com/github/EdnahM/NLP_POS_Luhya/blob/main/POS_Luhya.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction

### Natural Language Processing
### Edna Wairimu Mugoh
### C241-01-2293/2022

This project is a part-of-speech (POS) tagging task on a Luhya dataset, focusing on the Bukusu dialect.

Project Parts
1. Data preprocessing
2. Training and Modelling
3. Model Evaluation
4. Developing a usable endpoint
5. Deployed POS app for Bukusu

# Data Preprocessing


### Import required libraries


In [6]:
import pandas as pd
import numpy as np
import csv

import warnings

# Silence every warning category for the rest of the session.
warnings.filterwarnings('ignore')

# Lift pandas' display limits (row count, column count, line width) by
# setting each of the three options to None.
for display_option in ('display.max_rows', 'display.max_columns', 'display.width'):
    pd.set_option(display_option, None)

### Load Dataset


In [7]:
import os
# List the current working directory to see which files the runtime starts with.
os.listdir()

['.config', 'sample_data']

In [8]:
from google.colab import drive
# Mount Google Drive under /content/drive so the CSV files stored there can be read.
# force_remount=True asks for a fresh mount even when one already exists.
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [9]:
# Folder on the mounted Drive that holds the Bukusu CSV files. Change this one
# value to run the notebook against a different location.
DATA_DIR = "/content/drive/MyDrive/MSC-DS-2023"

# header=0: the first line of each file supplies the column names.
train_df = pd.read_csv(os.path.join(DATA_DIR, "combined_bukusu_train_data.csv"), header=0)
test_df = pd.read_csv(os.path.join(DATA_DIR, "combined_bukusu_test_data.csv"), header=0)

In [10]:
# Preview the first rows to confirm the WORD / SPEECH TAG columns loaded as expected.
train_df.head()

Unnamed: 0,WORD,SPEECH TAG
0,BAABIYA,NN
1,BAALA,NN
2,BAABANGURA,ADJ
3,BAALA,NN
4,BALUBIRI,ADJ


In [11]:
# Column dtypes of the training frame.
train_df.dtypes

WORD          object
SPEECH TAG    object
dtype: object

In [12]:
# (rows, columns) of the training frame.
train_df.shape

(54542, 2)

### Clean Dataset

In [13]:
# Count how often each raw tag spelling occurs; this exposes the inconsistent
# annotations (case variants, typos) that the cleaning step has to merge.
pos_counts = train_df['SPEECH TAG'].value_counts()
print(pos_counts)

SPEECH TAG
NN            10439
V              7561
ADP            5593
nn             3253
CONJ           3113
v              2498
PRON           2416
DET            2398
PUNCT          2096
conj           1894
ADJ            1657
ADV            1072
punct          1071
PUNC            998
adp             824
pron            767
NUM             642
adv             445
adj             420
PREP            262
det             186
num             143
X                80
POS              23
PRO              18
N                16
ADJE             14
inter            12
x                11
AP               10
n                10
pro               9
con               8
Pron              7
D                 7
pr                6
AV                4
AD                4
PART              4
DP                4
cv                4
XX                3
dp                3
VV                3
ART               3
A                 3
AD[               2
SPEECH TAG        2
C                 2
C0NJ     

#### Function to standardize the spelling variations of POS tags


In [14]:
# Canonical tag -> raw spellings observed in the annotations.
# Defined once at module level so the table is not rebuilt for every row.
_POS_MAPPING = {
    'N': ['N', 'NN', 'NNN', 'NNNN', 'NOUN','noun', 'nn','NNC', 'NNS','nn4','NO','NU','ñn'], # Noun
    'V': ['V', 'VB', 'verb', 'vb','v','VV', 'vv'], # Verb
    'PRON':['PR','NNP','PROM','PRO','PRON', 'P','pro', 'PROUN','PR0N'], # Pronoun
    'PUNCT':['punct','punc','PUNT'], # Punctuation
    'ADJ': ['AD', 'ADJ','adj', 'AD [','ADO',' adje','AADJ','DJ','ADJE','ADDJ','AJ','AD['], # Adjective
    'ADV': ['ADV','AV','adv','DV','AV', 'AADV','DADV'], # Adverb
    'PREP':['PREP', 'prep','pre'], # Preposition
    'CONJ': ['conju', 'conj','CON','C','c', 'COJ', 'C0NJ'], # Conjunction
    'INT': ['NUM', 'num',], # Numeral (emitted under the label 'INT')
    'DT': ['DET','DT','D'], # Determiner
    # NOTE(review): the empty string maps a whitespace-only tag to INTJ, while a
    # missing (NaN) tag becomes PUNCT below -- confirm that this is intended.
    'INTJ': ['inter',''], # Interjection
    'XX' : ['XX', 'X'], # Unknown
    'ADP' :['AP', 'ADP','adp','ADDP','addp','APD', 'adadp','dp'], # Adposition
}

# Reverse index: normalised spelling (stripped, upper-cased) -> canonical tag.
# setdefault keeps the first canonical tag should a spelling ever be listed
# twice, which matches a first-match-wins scan over the mapping in order.
_POS_LOOKUP = {}
for _standard_pos, _variations in _POS_MAPPING.items():
    for _variation in _variations:
        _POS_LOOKUP.setdefault(_variation.strip().upper(), _standard_pos)


def standardize_pos(pos):
    """
    Standardize a raw POS tag to its canonical label.

    Parameters
    ----------
    pos : object
        Raw tag as read from the dataset. Usually a string, but may be a
        missing value (NaN / None) or another scalar.

    Returns
    -------
    object
        * ``'PUNCT'`` when ``pos`` is missing.
        * The canonical tag when the stripped, case-insensitive spelling is
          listed in the mapping table.
        * The whitespace-stripped input when the spelling is unknown, so that
          ``' POS '`` and ``'POS'`` are counted as the same tag.
        * ``pos`` unchanged when it is a non-string, non-missing value
          (previously this raised ``AttributeError``).
    """
    # Missing tags are labelled as punctuation.
    if pd.isna(pos):
        return 'PUNCT'

    # Only strings can be normalised; pass anything else through untouched.
    if not isinstance(pos, str):
        return pos

    stripped = pos.strip()
    return _POS_LOOKUP.get(stripped.upper(), stripped)


In [15]:
def standardize_pos_column(df, column_name):
    """Standardize the tags of one column in place and hand the frame back.

    Every value of ``df[column_name]`` is passed through ``standardize_pos``
    and the result overwrites that column on ``df`` itself, so the caller's
    frame is modified. The same frame object is returned.
    """
    standardized_tags = df[column_name].apply(standardize_pos)
    df[column_name] = standardized_tags
    return df

In [16]:
# standardize_pos_column overwrites the column on the frame it receives and
# returns that same frame, so after this call cleaned_df and train_df are the
# same object and train_df itself now holds the standardized tags.
cleaned_df = standardize_pos_column(train_df, 'SPEECH TAG')

In [17]:
# Show the tag distribution after standardization (no rows have been filtered yet).
pos_counts = cleaned_df['SPEECH TAG'].value_counts()
print("Cleaned  POS Dataframe")
print(pos_counts)

Cleaned  POS Dataframe
SPEECH TAG
N             13731
V             10064
PUNCT          8607
ADP            6439
CONJ           5023
PRON           3230
DT             2591
ADJ            2103
ADV            1525
INT             785
PREP            263
XX               94
POS              23
INTJ             14
PART              4
cv                4
ART               3
A                 3
ne                2
mm                2
SPEECH TAG        2
ON                1
TEMA              1
ABAAELESIA        1
MASA              1
HATATI            1
YEMA              1
PU                1
NM                1
YETURI            1
OMUKHAANA         1
HH                1
NE                1
OU                1
O                 1
po                1
MBOOLELE          1
BULI              1
NGA               1
KHUKHWAMA         1
CHIRUPIA          1
asp               1
b                 1
asinyikhwa        1
um                1
bakaambisi        1
chambukha         1
mala              1
pun   

#### Eliminate all rows whose POS tag occurs only once

In [18]:
# Snapshot of the standardized frame, taken before any rows are dropped.
train_df_v1 = train_df.copy()

# Keep every tag whose count is not 1. The singleton tags in the counts above
# are presumably annotation noise (several look like words that ended up in
# the tag column).
pos_to_keep = pos_counts[pos_counts != 1].index

# .copy() makes cleaned_df an independent frame rather than a slice of
# train_df, so the columns added to it in later cells are written to a real
# DataFrame and do not trigger SettingWithCopyWarning.
cleaned_df = train_df[train_df['SPEECH TAG'].isin(pos_to_keep)].copy()

# Display the tag distribution of the filtered DataFrame
pos_counts = cleaned_df['SPEECH TAG'].value_counts()
print("Cleaned  POS Dataframe")
print(pos_counts)


Cleaned  POS Dataframe
SPEECH TAG
N             13731
V             10064
PUNCT          8607
ADP            6439
CONJ           5023
PRON           3230
DT             2591
ADJ            2103
ADV            1525
INT             785
PREP            263
XX               94
POS              23
INTJ             14
cv                4
PART              4
ART               3
A                 3
SPEECH TAG        2
ne                2
mm                2
Name: count, dtype: int64


### Tokenization and Lemmatization on the given Words

In [19]:
import nltk
from nltk.tokenize import word_tokenize
import spacy

In [20]:
# Fetch the Punkt tokenizer data that nltk's word_tokenize relies on.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [21]:
# assign() returns a new frame instead of writing columns into cleaned_df,
# which at this point is a filtered selection of train_df. This avoids
# chained-assignment ambiguity and makes the cell safe to re-run.
# NOTE(review): astype(str) also turns a missing WORD into the literal string
# 'nan' -- confirm that no words are missing, or drop those rows first.
cleaned_df = cleaned_df.assign(WORD=cleaned_df['WORD'].astype(str))
cleaned_df = cleaned_df.assign(TOKENS=cleaned_df['WORD'].apply(word_tokenize))


In [22]:
# NOTE(review): 'en_core_web_sm' is spaCy's small English pipeline, so the
# lemmatizer applies English rules to Bukusu words -- confirm this is intended.
# The tail(10) preview printed later shows LEMMAS identical to TOKENS.
nlp = spacy.load('en_core_web_sm') ## Lemmatization

In [23]:
def lemmatize_text(text):
    """Return the lemma of every token the spaCy pipeline finds in ``text``.

    ``text`` is an iterable of strings. The strings are joined with single
    spaces, run through the module-level ``nlp`` pipeline, and the ``lemma_``
    of each resulting token is collected in order.
    """
    sentence = " ".join(text)
    lemmas = []
    for token in nlp(sentence):
        lemmas.append(token.lemma_)
    return lemmas

In [24]:
# assign() builds a new frame with the LEMMAS column instead of writing into
# cleaned_df, which may still be a filtered selection of another frame.
cleaned_df = cleaned_df.assign(LEMMAS=cleaned_df['TOKENS'].apply(lemmatize_text))

In [25]:
# Spot-check the last ten rows together with their tokens and lemmas.
print(cleaned_df.tail(10))

                WORD SPEECH TAG           TOKENS           LEMMAS
54532           niio        ADV           [niio]           [niio]
54533       kumubili          N       [kumubili]       [kumubili]
54534  kwanyooleekha          V  [kwanyooleekha]  [kwanyooleekha]
54535             Se      PUNCT             [Se]             [Se]
54536      okhoyeele          V      [okhoyeele]      [okhoyeele]
54537       omukusie          V       [omukusie]       [omukusie]
54538          namwe        ADV          [namwe]          [namwe]
54539       omukhole          V       [omukhole]       [omukhole]
54540     balarobora          V     [balarobora]     [balarobora]
54541        emasoti          N        [emasoti]        [emasoti]


# Model Training

### Using the NLTK Library for training

#### Install and Import the required Libraries

In [26]:
! pip install nltk spacy textblob stanfordnlp pattern gensim


Collecting stanfordnlp
  Downloading stanfordnlp-0.2.0-py3-none-any.whl (158 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.8/158.8 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pattern
  Downloading Pattern-3.6.0.tar.gz (22.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m22.2/22.2 MB[0m [31m38.0 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting backports.csv (from pattern)
  Downloading backports.csv-1.0.7-py2.py3-none-any.whl (12 kB)
Collecting mysqlclient (from pattern)
  Downloading mysqlclient-2.2.4.tar.gz (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.4/90.4 kB[0m [31m10.5 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collec

In [27]:
import nltk
import spacy
from textblob import TextBlob
import stanfordnlp
import pattern
import gensim
from nltk.corpus import treebank

#### Function to build the feature set for a word:

In [28]:
def features(word):
    """Build the feature dictionary for one word.

    The only feature is the word itself, stored under the key ``'word'``.
    """
    featureset = dict(word=word)
    return featureset

In [29]:
# Build the (featureset, tag) pairs that the classifier is trained on.
# Zipping the two columns yields the same pairs in the same order as a
# row-by-row iterrows() loop, without constructing a Series for every row.
data = [
    (features(word), tag)
    for word, tag in zip(cleaned_df['WORD'], cleaned_df['SPEECH TAG'])
]

In [30]:
# Hold out the last 20% of the pairs for testing. The split is positional:
# the first 80% (rounded down) trains the model and the rest evaluates it.
train_size = int(len(data) * 0.8)
train_set = data[:train_size]
test_set = data[train_size:]

In [None]:
# Fit NLTK's maximum-entropy classifier on the (featureset, tag) pairs in train_set.
# No options are passed, so NLTK's defaults apply; the log printed by this cell
# reports a 100-iteration run.
classifier = nltk.MaxentClassifier.train(train_set)

  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -3.04452        0.000
             2          -0.82474        0.867
             3          -0.57755        0.867
             4          -0.47056        0.867
             5          -0.41047        0.867
             6          -0.37192        0.867
             7          -0.34506        0.867
             8          -0.32526        0.867
             9          -0.31007        0.867
            10          -0.29804        0.867
            11          -0.28827        0.867
            12          -0.28019        0.867
            13          -0.27338        0.867
            14          -0.26758        0.867
            15          -0.26257        0.867
            16          -0.25820        0.867
            17          -0.25436        0.867
            18          -0.25095        0.867
            19          -0.24791        0.867
 

In [None]:
# Score the classifier on the held-out (featureset, tag) pairs in test_set.
accuracy = nltk.classify.accuracy(classifier, test_set)
print("Accuracy:", accuracy)

#### Save the trained classifier to a pickle file.

In [None]:
import pickle

# Serialize the trained classifier object (not its predictions) to disk.
# Only load this file back from a trusted source: unpickling can execute
# arbitrary code.
with open('pos_tagger.pickle', 'wb') as model_file:
    pickle.dump(classifier, model_file)

### Using the Spacy Library for training

# Model Evaluation

#