<a href="https://colab.research.google.com/github/EdnahM/NLP_POS_Luhya/blob/main/pos_luhya_maxent_random_classifier.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction

### Natural Language Processing
### Edna Wairimu Mugoh
### C241-01-2293/2022

This project is a part-of-speech (POS) tagging task on a Luhya dataset, focusing on the Bukusu dialect.

Project Parts
1. Data preprocessing
2. Training and Modelling
3. Model Evaluation
4. Developing a usable endpoint
5. Deployed POS app for Bukusu

# Data Preprocessing


### Import required libraries


In [None]:
import pandas as pd
import numpy as np
import csv

import warnings

# Silence every Python warning for the rest of the session (this also hides
# pandas/scikit-learn warnings raised by later cells).
warnings.filterwarnings('ignore')
# Remove pandas' display limits so frames and Series are printed in full:
# no row cap, no column cap, no fixed output width.
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

In [None]:
# NOTE(review): the upgrade below is unpinned, so the installed scikit-learn
# version depends on when the cell is run (the recorded run printed 1.4.2).
# Models are pickled later in this notebook — consider pinning the version so
# the pickles can be loaded with the same release.
! pip install --upgrade scikit-learn
import sklearn
# Check scikit-learn version
print("scikit-learn version:", sklearn.__version__)

scikit-learn version: 1.4.2


### Load Dataset


In [None]:
import os
os.listdir()

['.config', 'sample_data']

In [None]:
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [None]:
train_df =  pd.read_csv("/content/drive/MyDrive/MSC-DS-2023/combined_bukusu_data.csv", header=0)
test_df =  pd.read_csv("/content/drive/MyDrive/MSC-DS-2023/combined_bukusu_test_data.csv", header=0)

In [None]:
train_df.head()

Unnamed: 0,WORD,SPEECH TAG
0,chererere,X
1,khubuna,V
2,naba,V
3,siloleela,V
4,charebwao,V


In [None]:
train_df.dtypes

WORD          object
SPEECH TAG    object
dtype: object

In [None]:
train_df.shape

(327747, 2)

### Clean Dataset

In [None]:
pos_counts = train_df['SPEECH TAG'].value_counts()
print(pos_counts)

SPEECH TAG
NN            62585
V             45300
ADP           34083
nn            19518
CONJ          18650
v             14988
PRON          14471
DET           14377
PUNCT         12782
conj          11364
ADJ            9928
ADV            6427
punct          6426
PUNC           5988
adp            4944
pron           4602
NUM            3852
adv            2670
adj            2520
PREP           1572
det            1116
num             858
X               478
POS             138
PRO             108
N                96
ADJE             84
inter            72
x                66
n                60
AP               60
pro              54
con              48
D                42
Pron             42
pr               36
DP               24
cv               24
PART             24
AD               24
AV               24
ART              18
VV               18
dp               18
XX               18
A                18
AD[              12
mm               12
NU               12
vv       

#### Function to standardize the various variations of pos


In [None]:
# Canonical POS tag -> raw spellings of that tag found in the annotated data.
_POS_VARIATIONS = {
    'N': ['N', 'NN', 'NNN', 'NNNN', 'NOUN','noun', 'nn','NNC', 'NNS','nn4','NO','NU','ñn'], # Noun
    'V': ['V', 'VB', 'verb', 'vb','v','VV', 'vv'], # Verb
    'PRON':['PR','NNP','PROM','PRO','PRON', 'P','pro', 'PROUN','PR0N'], # Pronoun
    'PUNCT':['punct','punc','PUNT'], # Punctuation
    'ADJ': ['AD', 'ADJ','adj', 'AD [','ADO',' adje','AADJ','DJ','ADJE','ADDJ','AJ','AD['], # Adjective
    'ADV': ['ADV','AV','adv','DV','AV', 'AADV','DADV'], # Adverb
    'PREP':['PREP', 'prep','pre'], # Preposition
    'CONJ': ['conju', 'conj','CON','C','c', 'COJ', 'C0NJ'], # Conjunction
    'NUM': ['NUM', 'num',], # Numeral
    'DT': ['DET','DT','D'], # Determiner
    'INTJ': ['inter',''], # Interjection (an empty/whitespace-only tag also lands here)
    'XX' : ['XX', 'X'], # Unknown
    'ADP' :['AP', 'ADP','adp','ADDP','addp','APD', 'adadp','dp'], # Adposition
}

# Reverse index built once at definition time: normalised spelling -> canonical
# tag. setdefault keeps the first canonical tag in dict order, which is the tag
# a front-to-back scan of the mapping would have returned.
_POS_LOOKUP = {}
for _standard_pos, _variations in _POS_VARIATIONS.items():
    for _variation in _variations:
        _POS_LOOKUP.setdefault(_variation.upper().strip(), _standard_pos)


def standardize_pos(pos):
    """
    Standardize a raw POS tag to its canonical form.

    The tag is compared case-insensitively, ignoring surrounding whitespace,
    against the known spellings in ``_POS_VARIATIONS``.

    Parameters
    ----------
    pos : object
        Raw tag value from the dataset. Usually a string; missing values
        (NaN/None) and non-string values are accepted.

    Returns
    -------
    object
        The canonical tag when the spelling is known, ``'PUNCT'`` when the
        value is missing, otherwise ``pos`` itself, unchanged.
    """
    # Missing tags are labelled as punctuation.
    # NOTE(review): confirm that rows without a tag really are punctuation.
    if pd.isna(pos):
        return 'PUNCT'

    # str() keeps non-string cells (e.g. numbers) from raising on .upper().
    key = str(pos).upper().strip()
    return _POS_LOOKUP.get(key, pos)


In [None]:
def standardize_pos_column(df, column_name):
    """
    Standardize every POS tag in one column of ``df``.

    The column is overwritten on ``df`` itself (the frame is modified in
    place) and the same frame object is returned for convenience.
    """
    standardized_tags = df[column_name].apply(standardize_pos)
    df[column_name] = standardized_tags
    return df

In [None]:
cleaned_df = standardize_pos_column(train_df, 'SPEECH TAG')

In [None]:
# Display the filtered DataFrame
pos_counts = cleaned_df['SPEECH TAG'].value_counts()
print("Cleaned  POS Dataframe")
print(pos_counts)

Cleaned  POS Dataframe
SPEECH TAG
N             82337
V             60318
PUNCT         51812
ADP           39159
CONJ          30110
PRON          19355
DT            15535
ADJ           12604
ADV            9145
NUM            4710
PREP           1578
XX              562
POS             138
INTJ             84
cv               24
PART             24
ART              18
A                18
ne               12
mm               12
SPEECH TAG       12
adk               6
pun               6
chambukha         6
mala              6
um                6
bakaambisi        6
b                 6
po                6
asinyikhwa        6
asp               6
CHIRUPIA          6
KHUKHWAMA         6
NGA               6
BULI              6
MBOOLELE          6
ON                6
O                 6
ABAAELESIA        6
MASA              6
HATATI            6
TEMA              6
YEMA              6
YETURI            6
OMUKHAANA         6
NE                6
PU                6
OU                6
HH    

#### Eliminate all rows whose POS tag occurs exactly 6 times

In [None]:
# Snapshot of train_df as it is at this point. standardize_pos_column wrote the
# standardized tags back onto train_df itself, so this copy already holds
# standardized tags, not the raw ones.
train_df_v1 = train_df.copy()
# Keep every tag whose frequency is anything other than exactly 6; in the
# counts printed above, 6 is the frequency of the stray one-off tag values.
# NOTE(review): 6 is a magic number tied to this particular dataset.
pos_to_keep = pos_counts[pos_counts != 6].index

# train_df and the earlier cleaned_df are the same (in-place standardized)
# frame, so filtering train_df here filters the standardized tags.
cleaned_df = train_df[train_df['SPEECH TAG'].isin(pos_to_keep)]

In [None]:
def filter_dataframe_by_words(df, column_name, words, exact=True):
    """
    Return a copy of ``df`` without the rows whose ``column_name`` value is
    one of ``words``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to filter; it is not modified.
    column_name : str
        Name of the string column to test.
    words : iterable of str
        Values to exclude. An empty iterable excludes nothing.
    exact : bool, default True
        When True, a row is dropped only if its value equals one of ``words``
        (case-insensitive, surrounding whitespace ignored). When False, the
        words are joined into a case-insensitive regular expression and a row
        is dropped if the pattern matches anywhere in the value.

    Returns
    -------
    pandas.DataFrame
        A new frame containing only the rows that were not excluded.
    """
    words = [str(word) for word in words]
    if not words:
        # An empty alternation would match every row under regex matching.
        return df.copy()

    values = df[column_name]
    if exact:
        # Whole-value comparison: excluding 'A' must not also remove tags
        # that merely contain the letter, such as 'ADJ', 'ADV' or 'ADP'.
        excluded = {word.strip().casefold() for word in words}
        mask = values.str.strip().str.casefold().isin(excluded)
    else:
        # Substring/regex matching; missing values never match.
        mask = values.str.contains('|'.join(words), case=False, na=False)

    # .copy() gives callers an independent frame they can add columns to.
    return df[~mask].copy()

# Leftover tag values to drop: the repeated header text 'SPEECH TAG' plus a
# handful of other tags that remain in the counts after standardization.
# NOTE(review): filter_dataframe_by_words decides how these are matched; under
# case-insensitive substring matching a short entry such as 'A' also hits
# longer tags like 'ADJ', 'ADV' and 'ADP' — verify the resulting tag counts.
words_to_exclude = ['SPEECH TAG', 'POS', 'PART','cv','A', 'ART','mm','ne']

cleaned_df = filter_dataframe_by_words(cleaned_df, 'SPEECH TAG', words_to_exclude)


In [None]:
cleaned_df['SPEECH TAG'].value_counts()

SPEECH TAG
N        82337
V        60318
PUNCT    51812
CONJ     30110
PRON     19355
DT       15535
NUM       4710
PREP      1578
XX         562
INTJ        84
Name: count, dtype: int64

#### Small visualization on the POS count

In [None]:
pos_counts = cleaned_df['SPEECH TAG'].value_counts()
print("Cleaned  POS Dataframe")
print(pos_counts)


Cleaned  POS Dataframe
SPEECH TAG
N        82337
V        60318
PUNCT    51812
CONJ     30110
PRON     19355
DT       15535
NUM       4710
PREP      1578
XX         562
INTJ        84
Name: count, dtype: int64


In [None]:
import matplotlib.pyplot as plt

# One colour per bar: tags with at least 30000 occurrences are sky blue,
# all others salmon.
colors = []
for count in pos_counts:
    colors.append('skyblue' if count >= 30000 else 'salmon')

# Draw the bar chart on an explicitly created 10x6 figure/axes pair.
fig, ax = plt.subplots(figsize=(10, 6))
pos_counts.plot(kind='bar', color=colors, ax=ax)

ax.set_title('Frequency of Each Part of Speech', fontsize=16, fontweight='bold')
ax.set_xlabel('Part of Speech', fontsize=14)
ax.set_ylabel('Frequency', fontsize=14)
plt.xticks(fontsize=12, rotation=45, ha='right')
plt.yticks(fontsize=12)

# Light horizontal guide lines, and no frame on the top/right sides.
ax.grid(axis='y', linestyle='--', alpha=0.5)
for side in ('top', 'right'):
    ax.spines[side].set_visible(False)
fig.tight_layout()
plt.show()

In [None]:
cleaned_df.to_csv("/content/drive/MyDrive/MSC-DS-2023/cleaned_data.csv")

### Tokenization and Lemmatization on the given Words

In [None]:
import nltk
from nltk.tokenize import word_tokenize
import spacy

In [None]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
# cleaned_df was produced by filtering another frame, so assigning columns
# onto it directly triggers pandas' SettingWithCopyWarning (silenced by the
# global warning filter). Build a new frame with assign() instead.
word_strings = cleaned_df['WORD'].astype(str)  # non-string cells become text
cleaned_df = cleaned_df.assign(
    WORD=word_strings,
    TOKENS=word_strings.apply(word_tokenize),  # list of tokens per word
)


In [None]:
# NOTE(review): 'en_core_web_sm' is spaCy's small English pipeline, while the
# WORD column holds Bukusu text — confirm that English lemmatization is
# intended here.
nlp = spacy.load('en_core_web_sm') ## Lemmatization

In [None]:
def lemmatize_text(text):
    """
    Lemmatize a sequence of tokens with the module-level ``nlp`` pipeline.

    The tokens are joined with single spaces, run through ``nlp``, and the
    lemma of every token in the resulting document is returned as a list.
    """
    joined = " ".join(text)
    lemmas = []
    for tok in nlp(joined):
        lemmas.append(tok.lemma_)
    return lemmas

In [None]:
cleaned_df['LEMMAS'] = cleaned_df['TOKENS'].apply(lemmatize_text)

In [None]:
print(cleaned_df.tail(10))

            WORD SPEECH TAG      TOKENS      LEMMAS
327734    Bioosi         DT    [Bioosi]    [Bioosi]
327735     beela      PUNCT     [beela]     [beela]
327736   mukhola          V   [mukhola]   [mukhola]
327737   bubwoni          N   [bubwoni]   [bubwoni]
327739      Wele          N      [Wele]      [Wele]
327741  Isiraeli          N  [Isiraeli]  [isiraeli]
327742  babaandu          N  [babaandu]  [babaandu]
327743      bowo       PRON      [bowo]      [bowo]
327744      nibo       PRON      [nibo]      [nibo]
327745  wareesia          V  [wareesia]  [wareesia]


# Model Training

### Using the NLTK Library for training

#### Install and Import the required Libraries

In [None]:
! pip install nltk spacy textblob stanfordnlp pattern gensim


Collecting stanfordnlp
  Downloading stanfordnlp-0.2.0-py3-none-any.whl (158 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m158.8/158.8 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pattern
  Downloading Pattern-3.6.0.tar.gz (22.2 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m22.2/22.2 MB[0m [31m46.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting backports.csv (from pattern)
  Downloading backports.csv-1.0.7-py2.py3-none-any.whl (12 kB)
Collecting mysqlclient (from pattern)
  Downloading mysqlclient-2.2.4.tar.gz (90 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m90.4/90.4 kB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collec

In [None]:
import nltk
import spacy
from textblob import TextBlob
import stanfordnlp
import pattern
import gensim
from nltk.corpus import treebank

#### Function to extract the features used for POS tagging:

In [None]:
def features(word):
    """Return the feature dict for one word: the word itself under 'word'."""
    featureset = dict(word=word)
    return featureset

In [None]:
# Extract features from your dataset
data = []
for index, row in cleaned_df.iterrows():
    word = row['WORD']
    tag = row['SPEECH TAG']
    featureset = features(word)
    data.append((featureset, tag))

In [None]:
# Split the data into training and testing sets
train_size = int(0.8 * len(data))
train_set, test_set = data[:train_size], data[train_size:]

#### Maxentropy Classification

In [None]:
classifier = nltk.MaxentClassifier.train(train_set)

  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -2.30259        0.002
             2          -0.67039        0.937
             3          -0.45442        0.937
             4          -0.35673        0.937
             5          -0.30069        0.937
             6          -0.26427        0.937
             7          -0.23867        0.937
             8          -0.21969        0.937
             9          -0.20505        0.937
            10          -0.19341        0.937
            11          -0.18394        0.937
            12          -0.17607        0.937
            13          -0.16944        0.937
            14          -0.16377        0.937
            15          -0.15887        0.937
            16          -0.15459        0.937
            17          -0.15082        0.937
            18          -0.14747        0.937
            19          -0.14448        0.937
 

In [None]:
accuracy = nltk.classify.accuracy(classifier, test_set)
print("Accuracy:", accuracy)

#### Save the trained MaxEnt classifier to a pickle file.

In [None]:
import pickle
with open('maxentpos_tagger.pickle', 'wb') as f:
    pickle.dump(classifier, f)

#### Random Forest Classifier

In [None]:
import nltk
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.feature_extraction import DictVectorizer


In [None]:
def features(sentence, index):
    """
    Build the feature dict for the token at ``sentence[index]``.

    Features: the token itself, its length, whether it contains any digit,
    whether it contains any non-alphanumeric character, and its position
    (``index``) in the sentence.
    """
    token = sentence[index]

    contains_digit = False
    contains_symbol = False
    for ch in token:
        if ch.isdigit():
            contains_digit = True
        if not ch.isalnum():
            contains_symbol = True

    return dict(
        word=token,
        word_length=len(token),
        has_numbers=contains_digit,
        has_special_chars=contains_symbol,
        position_in_sentence=index,
    )

In [None]:
# One (featureset, tag) pair per row: each word is tokenized and the features
# of its first token (index 0) are extracted.
data = [
    (features(word_tokenize(word), 0), tag)
    for word, tag in zip(cleaned_df['WORD'], cleaned_df['SPEECH TAG'])
]


In [None]:
# Separate the feature dicts (X) from their tags (y), then hold out a random
# 20% of the samples for testing (fixed seed for reproducibility).
X, y = [], []
for featureset, tag in data:
    X.append(featureset)
    y.append(tag)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)


#### Using the Random Forest classifier

In [None]:
vectorizer = DictVectorizer()
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)


In [None]:

# NOTE: this rebinds `classifier`, which until now referred to the trained
# MaxEnt model; from here on `classifier` is the random forest.
# 100 trees, fixed seed so repeated runs build the same forest.
classifier = RandomForestClassifier(n_estimators=100, random_state=42)
classifier.fit(X_train_vec, y_train)

#### Prediction

In [None]:
X_test_vec = vectorizer.transform(X_test)

y_pred = classifier.predict(X_test_vec)


In [None]:
print(y_pred)

#### Performance Evaluation

In [None]:
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)


#### Hyperparameter Tuning
Below, 10-fold cross-validation (`cv=10`) is used to estimate how well the model generalizes.


In [None]:
# Vectorize the full dataset with a dedicated DictVectorizer. Re-fitting the
# shared `vectorizer` on all of X would change its feature columns after
# `classifier` was trained on the columns learned from X_train, so the
# vectorizer pickled further down would no longer match the saved classifier.
cv_vectorizer = DictVectorizer()
X_vec = cv_vectorizer.fit_transform(X)
# 10-fold cross-validation; report the mean accuracy over the folds.
cv_scores = cross_val_score(classifier, X_vec, y, cv=10)
print("Cross-Validation Mean Accuracy:", cv_scores.mean())

In [None]:
print(X_vec)

#### Saving the Trained Model and Vectorizer

In [None]:
import pickle
with open('/content/drive/MyDrive/MSC-DS-2023/rf_pos_tagger.pickle', 'wb') as f:
    pickle.dump(classifier, f,protocol=4)

In [None]:
import pickle
import sklearn

# Check scikit-learn version
print("scikit-learn version:", sklearn.__version__)

In [None]:
import pickle
with open('/content/drive/MyDrive/MSC-DS-2023/rfvect_pos_tagger.pickle', 'wb') as f:
    pickle.dump(vectorizer, f,protocol=4)

### Using Neural Networks for training

# Model Evaluation

#