In [1]:
import gc
import pandas as pd

from scipy.sparse import csr_matrix, hstack

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel

from sklearn.linear_model import LogisticRegression
import lightgbm as lgb

In [2]:
# Label columns to model; a separate binary classifier is trained for each.
class_names = [
    'hindi',
    'tamil',
    'english',
]

In [3]:
# Read the train and test CSVs, then replace every missing cell (in any
# column) with a single space.
train = pd.read_csv('lang_detect_train.csv')
train = train.fillna(' ')
test = pd.read_csv('lang_detect_test.csv')
test = test.fillna(' ')
#test = pd.read_csv('lang_detect_bank.csv', encoding = "ISO-8859-1").fillna(' ')

In [4]:
# Preview the first five training rows.
train.head(n=5)

Unnamed: 0.1,Unnamed: 0,id,text,hindi,tamil,english
0,216406,id56852,in recent weeks as a result of a sweeping defe...,0,0,1
1,143370,fec42bc28d73c346,pal pen adhikaries angalotu pengalukum cham ur...,0,1,0
2,52271,8be632e20ed9bec6,rajasthan police kii consteble bhartee pariksh...,1,0,0
3,10572,1be80c468c2fe3ce,aaj kai maheene baad mushtak kaa phone aayaa h...,1,0,0
4,87462,ea0125a3a189f1a6,tamaam tarakkee ke baavjood duniya se garibi o...,1,0,0


In [5]:
# Raw text of each split, plus the concatenation of both that the TF-IDF
# vocabulary is fitted on.
train_text, test_text = train['text'], test['text']
all_text = pd.concat([train_text, test_text])

In [6]:
# Word-level TF-IDF over unigrams and bigrams, limited to 50,000 features.
# The token pattern accepts tokens of one or more word characters.
# NOTE(review): the vocabulary/IDF weights are fitted on train + test text
# together — confirm that using test text here is intended.
word_vectorizer = TfidfVectorizer(
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    max_features=50000,
    strip_accents='unicode',
    sublinear_tf=True,
)
word_vectorizer.fit(all_text)
print('Word TFIDF 1/3')

# Project each split onto the fitted vocabulary.
train_word_features = word_vectorizer.transform(train_text)
print('Word TFIDF 2/3')

test_word_features = word_vectorizer.transform(test_text)
print('Word TFIDF 3/3')

Word TFIDF 1/3
Word TFIDF 2/3
Word TFIDF 3/3


In [7]:
import pickle
# Persist the fitted word vectorizer to disk. The context manager guarantees
# the file handle is flushed and closed even if pickling fails.
with open("feature.pkl", "wb") as vectorizer_file:
    pickle.dump(word_vectorizer, vectorizer_file)

In [8]:
# Disabled: character-level TF-IDF (2- to 6-character n-grams). Nothing in this
# cell runs; only the word-level features are used further down.
# char_vectorizer = TfidfVectorizer(
#     sublinear_tf=True,
#     strip_accents='unicode',
#     analyzer='char',
#     ngram_range=(2, 6),
#     max_features=50000)
# char_vectorizer.fit(all_text)
# print('Char TFIDF 1/3')
# train_char_features = char_vectorizer.transform(train_text)
# print('Char TFIDF 2/3')
# test_char_features = char_vectorizer.transform(test_text)
# print('Char TFIDF 3/3')

In [9]:
# Horizontally stack the feature blocks for each split; only the word-level
# TF-IDF block is included.
train_features = hstack(
    [train_word_features]
)
print('HStack 1/2')

test_features = hstack(
    [test_word_features]
)
print('HStack 2/2')

HStack 1/2
HStack 2/2


In [10]:
# Drop the raw text column from the training frame; its contents were already
# captured in `train_text`. `errors='ignore'` makes the cell safe to re-run:
# a second execution no longer raises KeyError once the column is gone.
train = train.drop(columns='text', errors='ignore')
# del test
# del train_text
# del test_text
# del all_text
# del train_char_features
# del test_char_features
# del train_word_features
# del test_word_features
# gc.collect()

In [11]:
# Start the results frame with the test text; prediction columns are added later.
submission = pd.DataFrame({'text': test['text']})

In [12]:
import joblib

In [13]:
# Booster hyper-parameters shared by every language; built once because none
# of them depend on the class being trained.
# NOTE(review): per the LightGBM parameter docs, `bagging_fraction` only takes
# effect when `bagging_freq` is non-zero; it is not set here, so row bagging
# appears to be inactive — confirm whether that is intended.
params = {'learning_rate': 0.2,
          'application': 'binary',
          'num_leaves': 31,
          'verbosity': -1,
          'metric': 'auc',
          'data_random_seed': 2,
          'bagging_fraction': 0.8,
          'feature_fraction': 0.6,
          'nthread': 4,
          'lambda_l1': 1,
          'lambda_l2': 1}
# Number of boosting rounds to run for each language's model.
rounds_lookup = {'hindi': 80,
                 'tamil': 50,
                 'english': 30}

for class_name in class_names:
    print(class_name)
    train_target = train[class_name]

    # Keep only the TF-IDF columns that a logistic regression for this
    # language weights at or above the 0.2 importance threshold.
    # NOTE(review): the selector is fitted on all training rows before the
    # train/validation split below, so the validation AUC is measured on rows
    # the selector has already seen — confirm this is acceptable.
    selector_estimator = LogisticRegression(solver='sag')
    sfm = SelectFromModel(selector_estimator, threshold=0.2)
    print(train_features.shape)
    train_sparse_matrix = sfm.fit_transform(train_features, train_target)
    print(train_sparse_matrix.shape)

    # Persist the fitted selector; the context manager closes the file handle
    # instead of leaking one per iteration.
    with open("sfm" + class_name + ".pkl", "wb") as selector_file:
        pickle.dump(sfm, selector_file)

    # Hold out 5% of the selected training rows for monitoring.
    train_sparse_matrix, valid_sparse_matrix, y_train, y_valid = train_test_split(
        train_sparse_matrix, train_target, test_size=0.05, random_state=144)
    test_sparse_matrix = sfm.transform(test_features)

    d_train = lgb.Dataset(train_sparse_matrix, label=y_train)
    d_valid = lgb.Dataset(valid_sparse_matrix, label=y_valid)
    watchlist = [d_train, d_valid]

    # NOTE(review): newer LightGBM releases replace `verbose_eval` with the
    # `lgb.log_evaluation` callback — confirm against the installed version.
    model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=rounds_lookup[class_name],
                      valid_sets=watchlist,
                      verbose_eval=10)

    # Score the test split and store the predictions under the language's name.
    submission[class_name] = model.predict(test_sparse_matrix)

    # Save the trained booster to disk in LightGBM's text format.
    filename = 'lang_detect_lgbmodel_'+class_name+'.txt'
    model.save_model(filename)

hindi
(177934, 50000)
(177934, 8747)
[10]	training's auc: 0.997339	valid_1's auc: 0.996231
[20]	training's auc: 0.999093	valid_1's auc: 0.99853
[30]	training's auc: 0.999486	valid_1's auc: 0.998954
[40]	training's auc: 0.999671	valid_1's auc: 0.999069
[50]	training's auc: 0.999769	valid_1's auc: 0.999138
[60]	training's auc: 0.999828	valid_1's auc: 0.999246
[70]	training's auc: 0.999862	valid_1's auc: 0.999309
[80]	training's auc: 0.999882	valid_1's auc: 0.999337
tamil
(177934, 50000)
(177934, 12127)
[10]	training's auc: 0.994828	valid_1's auc: 0.994773
[20]	training's auc: 0.997486	valid_1's auc: 0.997313
[30]	training's auc: 0.998448	valid_1's auc: 0.998282
[40]	training's auc: 0.998894	valid_1's auc: 0.998766
[50]	training's auc: 0.999195	valid_1's auc: 0.999037
english
(177934, 50000)
(177934, 10263)
[10]	training's auc: 0.994002	valid_1's auc: 0.994008
[20]	training's auc: 0.996961	valid_1's auc: 0.996676
[30]	training's auc: 0.99802	valid_1's auc: 0.997679


In [14]:
# Copy the ground-truth indicator columns from the test frame next to the
# predictions, prefixing each with 'truth_'.
for label in ('tamil', 'hindi', 'english'):
    submission['truth_' + label] = test[label]

In [15]:
# Preview the first ten rows of predictions alongside the truth columns.
submission.head(n=10)

Unnamed: 0,text,hindi,tamil,english,truth_tamil,truth_hindi,truth_english
0,pakistanin laguril watakits mansur sarmad aakh...,3.9e-05,0.999826,0.001473,1,0,0
1,irai ulagil irutalana chalokatiskum iraivanukq...,0.000292,0.999796,0.002762,1,0,0
2,jo log kidney kii samasya hriday rog manovaigy...,0.999975,0.000186,0.00073,0,1,0
3,name,0.764252,0.227768,0.126948,0,1,0
4,malamo chiruniro attak kalikk virumbhum andak ...,1.1e-05,0.999907,0.000522,1,0,0
5,home india koi chalati car par latak kar l...,0.997554,0.02906,0.001546,0,1,0
6,but what about our landing was lethal to the c...,0.000157,0.002393,0.993915,0,0,1
7,naan namee nallaa irukkuwants en vidu en kudum...,0.000933,0.998448,0.003,1,0,0
8,yasuki chaan jaise shaareerik chunautiyon se g...,0.999928,0.000425,0.000541,0,1,0
9,is saptaah aap kisi bijnes trip main ja sakate...,0.99998,9.4e-05,0.000587,0,1,0


In [16]:
# Label each row with the language whose model produced the highest score.
score_columns = ['hindi', 'tamil', 'english']
submission['lang_detected'] = submission[score_columns].idxmax(axis='columns')

In [17]:
# Name of the truth column holding the largest value in each row.
truth_columns = ['truth_hindi', 'truth_tamil', 'truth_english']
submission['actual_lang'] = submission[truth_columns].idxmax(axis='columns')

In [18]:
# Translate the 'truth_*' column names into plain language names so they can
# be compared with `lang_detected`.
categories = {
    'truth_english': 'english',
    'truth_hindi': 'hindi',
    'truth_tamil': 'tamil',
}
actual_column_names = submission['actual_lang']
submission['actual_lang'] = actual_column_names.map(categories)

In [19]:
# Integer code assigned to each language for the confusion matrix.
lang = dict(hindi=1, tamil=2, english=3)

In [20]:
# Encode the predicted and actual language names as integer codes via `lang`.
for code_column, name_column in (('y_pred', 'lang_detected'),
                                 ('actual', 'actual_lang')):
    submission[code_column] = submission[name_column].map(lang)

In [21]:
# Preview the first five rows, including the derived label columns.
submission.head(n=5)

Unnamed: 0,text,hindi,tamil,english,truth_tamil,truth_hindi,truth_english,lang_detected,actual_lang,y_pred,actual
0,pakistanin laguril watakits mansur sarmad aakh...,3.9e-05,0.999826,0.001473,1,0,0,tamil,tamil,2,2
1,irai ulagil irutalana chalokatiskum iraivanukq...,0.000292,0.999796,0.002762,1,0,0,tamil,tamil,2,2
2,jo log kidney kii samasya hriday rog manovaigy...,0.999975,0.000186,0.00073,0,1,0,hindi,hindi,1,1
3,name,0.764252,0.227768,0.126948,0,1,0,hindi,hindi,1,1
4,malamo chiruniro attak kalikk virumbhum andak ...,1.1e-05,0.999907,0.000522,1,0,0,tamil,tamil,2,2


In [22]:
from pycm import *
import numpy as np

In [23]:
# Extract the encoded actual and predicted labels as NumPy arrays.
y_actu, y_pred = (
    np.array(submission[column]) for column in ('actual', 'y_pred')
)

In [25]:
# Build the confusion matrix from the actual and predicted label codes.
cm = ConfusionMatrix(actual_vector=y_actu, predict_vector=y_pred)

In [26]:
# Print the confusion matrix together with pycm's statistics report.
print(cm)

Predict     1           2           3           
Actual
1           29498       184         57          

2           66          29001       333         

3           18          759         16342       





Overall Statistics : 

95% CI                                                            (0.98046,0.98238)
ACC Macro                                                         0.98761
ARI                                                               0.94889
AUNP                                                              0.98574
AUNU                                                              0.98407
Bangdiwala B                                                      0.96664
Bennett S                                                         0.97213
CBA                                                               0.97167
CSI                                                               0.95843
Chi-Squared                                                       143228.08292
Chi-Squared 

In [27]:
# Write the predictions, truth columns and derived labels to CSV, omitting
# the index.
submission.to_csv(path_or_buf='lang_detect_result.csv', index=False)