In [262]:
import numpy as np 
import pandas as pd
from textblob import TextBlob
import nltk
import string
import collections 
from itertools import permutations 
import math 
import matplotlib.pyplot as plt 
import seaborn as sns

In [263]:
train_df = pd.read_csv("/content/train_set.csv")
test_df = pd.read_csv("/content/test_set.csv")

In [264]:
train_df.head()

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...


In [265]:
test_df.head()

Unnamed: 0,index,text
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele..."
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.
3,4,Kube inja nelikati betingevakala kutsi titsini...
4,5,Winste op buitelandse valuta.


Preprocessing

In [266]:
def pre_process(df, column):
  df[column] = df[column].str.lower()
  
  df = df.dropna()
  df[column] = df[column].astype(str)
  #df['lang_id'] = df['lang_id'].astype(str)

  return df

In [267]:
def features(df, column):
  punc = ('!', "," ,"\'" ,";" ,"\"", ".", "-" ,"?")
  vowels=['a','e','i','o','u']
  zulu_combos = ['uk']
  tswana_combos = ['og']

  same_consecutive_vowels = ['aa','ee', 'ii', 'oo', 'uu'] 
  consecutive_vowels = [''.join(p) for p in permutations(vowels,2)]
  df['word_count'] = df[column].apply(lambda x : len(x.split()))
  df['character_count'] = df[column].apply(lambda x : len(x.replace(" ","")))
  df['word_density'] = df['word_count'] / (df['character_count'] + 1)
  df['punc_count'] = df[column].apply(lambda x : len([a for a in x if a in punc]))
  df['w_char_count'] = df[column].apply(lambda x : len([a for a in x if a.casefold() == 'w']))
  df['x_char_count'] = df[column].apply(lambda x : len([a for a in x if a.casefold() == 'x']))
  df['q_char_count'] = df[column].apply(lambda x : len([a for a in x if a.casefold() == 'q']))
  df['o_char_count'] = df[column].apply(lambda x : len([a for a in x if a.casefold() == 'o']))
  df['uk_char_count'] = df['text'].apply(lambda x : sum([any(d_c in a for d_c in zulu_combos) for a in x.split()]))
  df['og_char_count'] = df['text'].apply(lambda x : sum([any(d_c in a for d_c in tswana_combos) for a in x.split()]))
  df['num_double_consec_vowels'] = df[column].apply(lambda x : sum([any(c_v in a for c_v in same_consecutive_vowels) for a in x.split()]))
  df['num_consec_vowels'] = df[column].apply(lambda x : sum([any(c_v in a for c_v in consecutive_vowels) for a in x.split()]))
  df['num_vowels'] = df[column].apply(lambda x : sum([any(v in a for v in vowels) for a in x.split()]))
  df['vowel_density'] = df['num_vowels']/df['word_count']
  df['num_punctuation'] = df[column].apply(lambda x: sum(x.count(w) for w in punc))
  df['num_unique_words'] = df[column].apply(lambda x: len(set(w for w in x.split())))
  df['num_repeated_words'] = df[column].apply(lambda x: len([w for w in collections.Counter(x.split()).values() if w > 1]))
  df['words_vs_unique'] = df['num_unique_words'] / df['word_count']
  df['encode_ascii'] = np.nan
  
  for i in range(len(df)):
      try:
          df[column].iloc[i].encode(encoding='utf-8').decode('ascii')
      except UnicodeDecodeError:
          df['encode_ascii'].iloc[i] = 0
      else:
          df['encode_ascii'].iloc[i] = 1

  return df


In [268]:
pre_process(train_df, 'text')

Unnamed: 0,lang_id,text
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...
2,eng,the province of kwazulu-natal department of tr...
3,nso,o netefatša gore o ba file dilo ka moka tše le...
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...
...,...,...
32995,tsn,popo ya dipolateforomo tse ke go tlisa boetele...
32996,sot,modise mosadi na o ntse o sa utlwe hore thaban...
32997,eng,closing date for the submission of completed t...
32998,xho,nawuphina umntu ofunyenwe enetyala phantsi kwa...


In [269]:
features(train_df, 'text')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Unnamed: 0,lang_id,text,word_count,character_count,word_density,punc_count,w_char_count,x_char_count,q_char_count,o_char_count,uk_char_count,og_char_count,num_double_consec_vowels,num_consec_vowels,num_vowels,vowel_density,num_punctuation,num_unique_words,num_repeated_words,words_vs_unique,encode_ascii
0,xho,umgaqo-siseko wenza amalungiselelo kumaziko ax...,24,197,0.121212,1,6,3,2,16,1,0,0,0,24,1.000000,1,23,1,0.958333,1.0
1,xho,i-dha iya kuba nobulumko bokubeka umsebenzi na...,31,222,0.139013,2,4,0,0,20,1,0,1,0,31,1.000000,2,30,1,0.967742,1.0
2,eng,the province of kwazulu-natal department of tr...,37,228,0.161572,1,4,1,0,19,0,0,0,6,37,1.000000,1,27,5,0.729730,1.0
3,nso,o netefatša gore o ba file dilo ka moka tše le...,40,178,0.223464,0,1,0,0,23,0,0,0,1,39,0.975000,0,31,5,0.775000,0.0
4,ven,khomishini ya ndinganyiso ya mbeu yo ewa maana...,43,197,0.217172,0,3,0,0,12,0,0,1,6,43,1.000000,0,24,8,0.558140,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32995,tsn,popo ya dipolateforomo tse ke go tlisa boetele...,49,224,0.217778,0,3,0,0,26,0,0,0,7,49,1.000000,0,35,7,0.714286,0.0
32996,sot,modise mosadi na o ntse o sa utlwe hore thaban...,63,236,0.265823,0,9,0,0,27,0,0,2,3,63,1.000000,0,47,10,0.746032,1.0
32997,eng,closing date for the submission of completed t...,37,186,0.197861,0,1,1,0,14,0,0,0,6,36,0.972973,0,32,4,0.864865,1.0
32998,xho,nawuphina umntu ofunyenwe enetyala phantsi kwa...,26,203,0.127451,0,10,1,0,16,2,0,1,0,26,1.000000,0,21,4,0.807692,1.0


In [270]:
train_df.groupby('lang_id').mean().T

lang_id,afr,eng,nbl,nso,sot,ssw,tsn,tso,ven,xho,zul
word_count,39.285333,40.233333,25.746,45.252667,45.734,26.319,44.930333,42.775667,44.261333,27.293333,26.595667
character_count,207.641,206.600667,219.967333,197.764333,201.896333,219.477333,201.834333,202.955333,202.246333,218.889,217.284333
word_density,0.189002,0.194288,0.116616,0.228022,0.225859,0.119526,0.222014,0.210104,0.218126,0.124324,0.121945
punc_count,0.693,0.341667,0.722333,0.137667,0.113,0.565333,0.238,0.264,0.177333,0.813667,0.546
w_char_count,4.108667,2.193,6.548,4.958667,3.879667,6.242333,5.336333,8.542667,6.96,6.948,6.274
x_char_count,0.013667,0.437667,0.011667,0.013,0.012333,0.006333,0.020333,3.004667,0.016667,1.117333,0.292333
q_char_count,0.004,0.263,0.783333,0.004667,0.265,0.006333,0.010333,0.045333,0.004333,1.771667,1.403667
o_char_count,12.765333,16.127333,14.426,24.641667,21.085667,10.080333,22.905,10.023667,11.143667,15.368,13.942333
uk_char_count,0.143333,0.004333,1.723667,0.054667,0.024,1.008,0.020667,0.465667,0.330667,2.038,2.191667
og_char_count,0.207333,0.128333,0.069333,0.536667,0.004333,0.020667,0.715333,0.008667,0.034333,0.037,0.051667


In [271]:
train_df.corr(method ='pearson')

Unnamed: 0,word_count,character_count,word_density,punc_count,w_char_count,x_char_count,q_char_count,o_char_count,uk_char_count,og_char_count,num_double_consec_vowels,num_consec_vowels,num_vowels,vowel_density,num_punctuation,num_unique_words,num_repeated_words,words_vs_unique,encode_ascii
word_count,1.0,0.095753,0.896367,-0.192857,-0.047636,0.041809,-0.342336,0.393537,-0.468091,0.171658,0.10085,0.360628,0.993938,-0.097495,-0.192857,0.834345,0.840256,-0.78332,-0.343848
character_count,0.095753,1.0,-0.338976,0.144686,0.239061,0.016994,0.175694,0.141122,0.241166,-0.036224,0.036813,-0.028202,0.099426,0.048299,0.144686,0.253512,-0.072808,0.168183,0.142325
word_density,0.896367,-0.338976,1.0,-0.24358,-0.143764,0.029138,-0.392277,0.3118,-0.536995,0.177363,0.0729,0.339534,0.889433,-0.109276,-0.24358,0.677454,0.816429,-0.810241,-0.388976
punc_count,-0.192857,0.144686,-0.24358,1.0,0.032683,0.007132,0.098445,-0.08868,0.118543,-0.035353,0.090413,0.010825,-0.211556,-0.169136,1.0,-0.12726,-0.186341,0.187954,0.092827
w_char_count,-0.047636,0.239061,-0.143764,0.032683,1.0,0.156345,0.078036,-0.119328,0.171096,-0.084243,-0.138312,-0.321986,-0.03361,0.134521,0.032683,0.004454,-0.046777,0.114671,0.043889
x_char_count,0.041809,0.016994,0.029138,0.007132,0.156345,1.0,0.034762,-0.159678,0.039055,-0.08849,-0.107874,-0.165619,0.044309,0.022224,0.007132,0.071338,0.036537,-0.005107,0.164965
q_char_count,-0.342336,0.175694,-0.392277,0.098445,0.078036,0.034762,1.0,0.002669,0.345804,-0.095387,-0.083802,-0.222739,-0.336094,0.079029,0.098445,-0.236065,-0.349484,0.357811,0.208092
o_char_count,0.393537,0.141122,0.3118,-0.08868,-0.119328,-0.159678,0.002669,1.0,-0.18594,0.302364,-0.025914,0.122166,0.404319,0.076875,-0.08868,0.294125,0.322237,-0.320703,-0.208135
uk_char_count,-0.468091,0.241166,-0.536995,0.118543,0.171096,0.039055,0.345804,-0.18594,1.0,-0.130079,-0.125508,-0.347879,-0.458826,0.116626,0.118543,-0.319775,-0.467753,0.484608,0.234773
og_char_count,0.171658,-0.036224,0.177363,-0.035353,-0.084243,-0.08849,-0.095387,0.302364,-0.130079,1.0,0.044248,0.093598,0.172216,-0.005266,-0.035353,0.113554,0.154295,-0.169932,-0.173666


# Set up inputs for ML

In [272]:
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier 
from sklearn import svm
from sklearn.model_selection import train_test_split 
from sklearn.decomposition import PCA 
from sklearn.preprocessing import StandardScaler 
from sklearn import metrics 
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import accuracy_score
from sklearn.metrics import log_loss

In [287]:
target = train_df['lang_id']

feature = train_df.drop(['lang_id', 'text'], axis=1)

X_train, X_test, y_train, y_test = train_test_split(feature, target, test_size=0.2) # 80% train and 20% test

In [288]:
dt_clf = DecisionTreeClassifier() 
dt_clf = dt_clf.fit(X_train,y_train)

In [289]:
y_pred = dt_clf.predict(X_test) #Predict the response for test dataset

In [290]:
accuracy_score_dt = accuracy_score(y_test, y_pred)
print(accuracy_score_dt)

0.7415151515151515


In [291]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         afr       0.96      0.96      0.96       565
         eng       0.79      0.80      0.80       572
         nbl       0.62      0.67      0.65       606
         nso       0.80      0.79      0.79       594
         sot       0.66      0.66      0.66       580
         ssw       0.80      0.80      0.80       645
         tsn       0.59      0.59      0.59       645
         tso       0.92      0.92      0.92       607
         ven       0.76      0.79      0.78       584
         xho       0.68      0.67      0.67       589
         zul       0.58      0.53      0.55       613

    accuracy                           0.74      6600
   macro avg       0.74      0.74      0.74      6600
weighted avg       0.74      0.74      0.74      6600



Random Forest Classifier

In [292]:
rf_clf = RandomForestClassifier(random_state = 42)
rf_clf = rf_clf.fit(X_train, y_train)

y_pred = rf_clf.predict(X_test)

accuracy_score_rf = accuracy_score(y_test, y_pred)
print(accuracy_score_rf)

0.8140909090909091


In [293]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         afr       0.97      0.99      0.98       565
         eng       0.88      0.88      0.88       572
         nbl       0.73      0.72      0.72       606
         nso       0.82      0.94      0.87       594
         sot       0.70      0.83      0.76       580
         ssw       0.80      0.91      0.85       645
         tsn       0.81      0.54      0.65       645
         tso       0.96      0.95      0.95       607
         ven       0.86      0.88      0.87       584
         xho       0.85      0.68      0.75       589
         zul       0.64      0.66      0.65       613

    accuracy                           0.81      6600
   macro avg       0.82      0.82      0.81      6600
weighted avg       0.82      0.81      0.81      6600



Gradient Booster Classifier

In [281]:
gb_clf = GradientBoostingClassifier()
gb_clf = gb_clf.fit(X_train, y_train)

y_pred = gb_clf.predict(X_test)

accuracy_score_gb = accuracy_score(y_test, y_pred)
print(accuracy_score_gb)

0.7571212121212121


In [282]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

         afr       0.97      0.98      0.97       587
         eng       0.81      0.84      0.82       622
         nbl       0.61      0.55      0.58       627
         nso       0.79      0.91      0.85       615
         sot       0.71      0.78      0.74       578
         ssw       0.70      0.90      0.79       597
         tsn       0.73      0.50      0.59       612
         tso       0.93      0.93      0.93       593
         ven       0.83      0.84      0.84       592
         xho       0.68      0.61      0.64       581
         zul       0.55      0.52      0.53       596

    accuracy                           0.76      6600
   macro avg       0.75      0.76      0.75      6600
weighted avg       0.75      0.76      0.75      6600



Make predictions

In [283]:
features(test_df, 'text')

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  iloc._setitem_with_indexer(indexer, value)


Unnamed: 0,index,text,word_count,character_count,word_density,punc_count,w_char_count,x_char_count,q_char_count,o_char_count,uk_char_count,og_char_count,num_double_consec_vowels,num_consec_vowels,num_vowels,vowel_density,num_punctuation,num_unique_words,num_repeated_words,words_vs_unique,encode_ascii
0,1,"Mmasepala, fa maemo a a kgethegileng a letlele...",10,49,0.200000,2,0,0,0,3,0,0,0,2,10,1.000000,2,8,1,0.800000,1.0
1,2,Uzakwaziswa ngokufaneleko nakungafuneka eminye...,10,100,0.099010,1,5,0,1,8,2,0,0,0,10,1.000000,1,10,0,1.000000,1.0
2,3,Tshivhumbeo tshi fana na ngano dza vhathu.,7,36,0.189189,1,0,0,0,2,0,0,0,1,7,1.000000,1,7,0,1.000000,1.0
3,4,Kube inja nelikati betingevakala kutsi titsini...,7,54,0.127273,1,0,0,0,1,0,0,0,0,7,1.000000,1,7,0,1.000000,1.0
4,5,Winste op buitelandse valuta.,4,26,0.148148,1,1,0,0,1,0,0,0,1,4,1.000000,1,4,0,1.000000,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5677,5678,You mark your ballot in private.,6,27,0.214286,1,0,0,0,3,0,0,0,2,6,1.000000,1,6,0,1.000000,1.0
5678,5679,Ge o ka kgetha ka bowena go se šomiše Mofani k...,66,256,0.256809,3,6,0,0,30,0,3,0,3,60,0.909091,3,43,10,0.651515,0.0
5679,5680,"E Ka kopo etsa kgetho ya hao ka hloko, hobane ...",28,108,0.256881,2,1,0,0,17,0,0,0,1,27,0.964286,2,24,3,0.857143,1.0
5680,5681,"TB ke bokudi ba PMB, mme Morero o tla lefella ...",32,124,0.256000,4,1,0,0,14,0,0,0,1,29,0.906250,4,26,4,0.812500,1.0


In [294]:
i = test_df['index']
f = test_df.drop(['index', 'text'], axis =1)

In [295]:
pred = rf_clf.predict(f)

In [296]:
sub_dict = {'index' : i,
          'lang_id': pred}
sub_df = pd.DataFrame(sub_dict)
sub_df.head()

Unnamed: 0,index,lang_id
0,1,eng
1,2,zul
2,3,ssw
3,4,ssw
4,5,ssw


Save as csv

In [297]:
sub_df.to_csv("SUB5.csv", index=False )