<a href="https://colab.research.google.com/github/ConradKash/sunbird_tasks/blob/main/LanguageID_NLP_Model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Data Extraction

In [1]:
import pandas as pd
import numpy as np
import json as js
import re
import nltk; nltk.download('wordnet')
import nltk; nltk.download('punkt')
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, precision_score, recall_score
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
import warnings
warnings.simplefilter("ignore")

!wget https://raw.githubusercontent.com/SunbirdAI/salt/main/v1.2/salt-test-v1.2.jsonl

data = pd.read_json("salt-test-v1.2.jsonl", lines = True)
data

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


--2023-08-01 21:15:45--  https://raw.githubusercontent.com/SunbirdAI/salt/main/v1.2/salt-test-v1.2.jsonl
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.109.133, 185.199.108.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.109.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 217596 (212K) [text/plain]
Saving to: ‘salt-test-v1.2.jsonl’


2023-08-01 21:15:45 (51.0 MB/s) - ‘salt-test-v1.2.jsonl’ saved [217596/217596]



Unnamed: 0,text,tts-speech
0,{'eng': 'The fashion industry is starting to t...,
1,{'eng': 'An epidemic is a disease that affects...,
2,{'eng': 'Good quality honey will come from thi...,
3,{'eng': 'She was riding the bicycle while I wa...,
4,{'eng': 'The authorities are looking for two y...,
...,...,...
495,{'eng': 'Developed countries have good medical...,
496,{'eng': 'New Zealand has won the rugby world c...,
497,{'eng': 'Most primary school teachers are not ...,
498,{'eng': 'There is continued heavy gunfire in t...,


# Data Preparation

Dropping non required fields

In [2]:
data = data.drop('tts-speech', axis =1)
print(data)

                                                  text
0    {'eng': 'The fashion industry is starting to t...
1    {'eng': 'An epidemic is a disease that affects...
2    {'eng': 'Good quality honey will come from thi...
3    {'eng': 'She was riding the bicycle while I wa...
4    {'eng': 'The authorities are looking for two y...
..                                                 ...
495  {'eng': 'Developed countries have good medical...
496  {'eng': 'New Zealand has won the rugby world c...
497  {'eng': 'Most primary school teachers are not ...
498  {'eng': 'There is continued heavy gunfire in t...
499  {'eng': 'The little children waved at their pa...

[500 rows x 1 columns]


In [3]:
dataset_dict = {
    "language": [],
    "text": []
}
for row in data['text']:
    for key, value in row.items():
        dataset_dict["language"].append(key)
        dataset_dict["text"].append(value)

In [4]:
df = pd.DataFrame(dataset_dict, columns=['language', 'text'])
df

Unnamed: 0,language,text
0,eng,The fashion industry is starting to thrive again.
1,lug,Ekisaawe ky'emisono kitandise okusituka nate.
2,ach,Yub me cital ruk mapatpat manyen tye ka dongo ...
3,teo,Ageutu ikampunin luka enape apolo bobo.
4,lgg,Okalamvu suta o'diru 'diyini 'diyi 'ye e'do tutu
...,...,...
2995,lug,Abaana abato baawuubira bazadde baabwe emikono...
2996,ach,Lutino matino giwito cingi me moto lunyodogi k...
2997,teo,Apotu idwe iyogata auriakake nautatar alosit.
2998,lgg,Anzinyiri nde 'diyi ya dri yima tipika yini dr...



# Text Processing
Cleaning the data by removing noise




In [5]:
text = list(dataset_dict['text'])
lemmatizer = WordNetLemmatizer()
corpus = []

for i in range(len(text)):
    r = text[i]
    r = r.lower()
    r = r.split()
    r = [lemmatizer.lemmatize(word) for word in r]
    r = word_tokenize(str(r))
    r = ' '.join(r)
    corpus.append(r)

df['text'] = corpus


df = pd.DataFrame(df)
df.head(20)


Unnamed: 0,language,text
0,eng,"[ 'the ' , 'fashion ' , 'industry ' , 'is ' , ..."
1,lug,"[ 'ekisaawe ' , `` ky'emisono '' , 'kitandise ..."
2,ach,"[ 'yub ' , 'me ' , 'cital ' , 'ruk ' , 'mapatp..."
3,teo,"[ 'ageutu ' , 'ikampunin ' , 'luka ' , 'enape ..."
4,lgg,"[ 'okalamvu ' , 'suta ' , `` o'diru '' , `` 'd..."
5,nyn,"[ `` eby'eby'afaashoni '' , 'biriyo ' , 'nibig..."
6,eng,"[ 'an ' , 'epidemic ' , 'is ' , ' a ' , 'disea..."
7,lug,"[ 'endwadde ' , 'ebalukawo ' , `` y'endwadde '..."
8,ach,"[ 'can ' , 'mogo ' , 'obedo ' , 'two ' , 'ma '..."
9,teo,"[ 'erai ' , 'adeka ' , 'na ' , 'ikamuni ' , 'i..."


In [6]:
x = np.array(df["text"])
y = np.array(df["language"])

from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

cv = CountVectorizer(analyzer = 'char',ngram_range=(2,3))
X = cv.fit_transform(x)
print(X.toarray())
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=48)

[[ 0  0 16 ...  0  0  0]
 [ 0  0  9 ...  0  0  0]
 [ 0  0 22 ...  0  0  0]
 ...
 [ 0  0 12 ...  0  0  0]
 [ 0  0 20 ...  0  0  0]
 [ 0  0 16 ...  0  0  0]]


In [7]:

logistic_regression = LogisticRegression()

#fit the model using the training data
logistic_regression.fit(X_train,y_train)

#use model to make predictions on test data
y_pred = logistic_regression.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         ach       0.99      0.98      0.99       196
         eng       1.00      1.00      1.00       142
         lgg       0.99      1.00      1.00       162
         lug       0.97      0.98      0.98       170
         nyn       0.99      0.98      0.98       164
         teo       1.00      1.00      1.00       156

    accuracy                           0.99       990
   macro avg       0.99      0.99      0.99       990
weighted avg       0.99      0.99      0.99       990



In [8]:
print(pd.crosstab(y_test, y_pred))

col_0  ach  eng  lgg  lug  nyn  teo
row_0                              
ach    193    0    1    2    0    0
eng      0  142    0    0    0    0
lgg      0    0  162    0    0    0
lug      1    0    0  167    2    0
nyn      1    0    0    3  160    0
teo      0    0    0    0    0  156


In [9]:
model = MultinomialNB()
model.fit(X_train,y_train)
model.score(X_test,y_test)

0.998989898989899

In [None]:
def predict(ptext):
    x = cv.transform([ptext]).toarray()
    lang = model.predict(x)
    lang = le.inverse_transform(lang)
    print("The langauge is in",lang[0])

In [10]:
#print(model.predict(cv.transform(["Oliwa"]).toarray()))

In [11]:
import pickle

In [12]:
# saving both cv and model
pickle.dump(cv, open("transform.pkl", "wb"))
pickle.dump(model, open("model.pkl", "wb"))