In [49]:
import gc
import pandas as pd

from scipy.sparse import csr_matrix, hstack

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel

from sklearn.linear_model import LogisticRegression
import lightgbm as lgb

In [50]:
class_names = ['hindi', 'tamil','english']

In [62]:
train = pd.read_csv('lang_detect_train.csv').fillna(' ')
test = pd.read_csv('lang_detect_test.csv').fillna(' ')
#test = pd.read_csv('lang_detect_bank.csv', encoding = "ISO-8859-1").fillna(' ')

In [52]:
train.head()

Unnamed: 0.1,Unnamed: 0,id,text,hindi,tamil,english
0,216406,id56852,in recent weeks as a result of a sweeping defe...,0,0,1
1,143370,fec42bc28d73c346,pal pen adhikaries angalotu pengalukum cham ur...,0,1,0
2,52271,8be632e20ed9bec6,rajasthan police kii consteble bhartee pariksh...,1,0,0
3,10572,1be80c468c2fe3ce,aaj kai maheene baad mushtak kaa phone aayaa h...,1,0,0
4,87462,ea0125a3a189f1a6,tamaam tarakkee ke baavjood duniya se garibi o...,1,0,0


In [53]:
train_text = train['text']
test_text = test['text']
all_text = pd.concat([train_text, test_text])

In [54]:
word_vectorizer = TfidfVectorizer(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    max_features=50000)
word_vectorizer.fit(all_text)
print('Word TFIDF 1/3')
train_word_features = word_vectorizer.transform(train_text)
print('Word TFIDF 2/3')
test_word_features = word_vectorizer.transform(test_text)
print('Word TFIDF 3/3')

Word TFIDF 1/3
Word TFIDF 2/3
Word TFIDF 3/3


In [55]:
# char_vectorizer = TfidfVectorizer(
#     sublinear_tf=True,
#     strip_accents='unicode',
#     analyzer='char',
#     ngram_range=(2, 6),
#     max_features=50000)
# char_vectorizer.fit(all_text)
# print('Char TFIDF 1/3')
# train_char_features = char_vectorizer.transform(train_text)
# print('Char TFIDF 2/3')
# test_char_features = char_vectorizer.transform(test_text)
# print('Char TFIDF 3/3')

In [56]:
train_features = hstack([train_word_features])
print('HStack 1/2')
test_features = hstack([test_word_features])
print('HStack 2/2')

HStack 1/2
HStack 2/2


In [57]:
train.drop('text', axis=1, inplace=True)
# del test
# del train_text
# del test_text
# del all_text
# del train_char_features
# del test_char_features
# del train_word_features
# del test_word_features
# gc.collect()

In [58]:
submission = pd.DataFrame.from_dict({'text': test['text']})

In [59]:
import joblib

In [60]:
for class_name in class_names:
    print(class_name)
    train_target = train[class_name]
    model = LogisticRegression(solver='sag')
    sfm = SelectFromModel(model, threshold=0.2)
    print(train_features.shape)
    train_sparse_matrix = sfm.fit_transform(train_features, train_target)
    print(train_sparse_matrix.shape)
    train_sparse_matrix, valid_sparse_matrix, y_train, y_valid = train_test_split(train_sparse_matrix, train_target, test_size=0.05, random_state=144)
    test_sparse_matrix = sfm.transform(test_features)
    d_train = lgb.Dataset(train_sparse_matrix, label=y_train)
    d_valid = lgb.Dataset(valid_sparse_matrix, label=y_valid)
    watchlist = [d_train, d_valid]
    params = {'learning_rate': 0.2,
              'application': 'binary',
              'num_leaves': 31,
              'verbosity': -1,
              'metric': 'auc',
              'data_random_seed': 2,
              'bagging_fraction': 0.8,
              'feature_fraction': 0.6,
              'nthread': 4,
              'lambda_l1': 1,
              'lambda_l2': 1}
    rounds_lookup = {'hindi': 80,
                 'tamil': 50,
                 'english':30}
    model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=rounds_lookup[class_name],
                      valid_sets=watchlist,
                      verbose_eval=10)
    submission[class_name] = model.predict(test_sparse_matrix)
    # save the model to disk
    filename = 'lang_detect_lgbmodel_'+class_name+'.txt'
    #joblib.dump(model, filename)
    model.save_model(filename)

hindi
(177934, 50000)
(177934, 8758)
[10]	training's auc: 0.997357	valid_1's auc: 0.996783
[20]	training's auc: 0.99902	valid_1's auc: 0.998348
[30]	training's auc: 0.999512	valid_1's auc: 0.998899
[40]	training's auc: 0.999682	valid_1's auc: 0.999136
[50]	training's auc: 0.999772	valid_1's auc: 0.999253
[60]	training's auc: 0.999835	valid_1's auc: 0.999276
[70]	training's auc: 0.999868	valid_1's auc: 0.999307
[80]	training's auc: 0.999884	valid_1's auc: 0.999317
tamil
(177934, 50000)
(177934, 12174)
[10]	training's auc: 0.994647	valid_1's auc: 0.994466
[20]	training's auc: 0.997343	valid_1's auc: 0.997218
[30]	training's auc: 0.998391	valid_1's auc: 0.998226
[40]	training's auc: 0.998883	valid_1's auc: 0.998749
[50]	training's auc: 0.999189	valid_1's auc: 0.999042
english
(177934, 50000)
(177934, 10277)
[10]	training's auc: 0.994053	valid_1's auc: 0.994133
[20]	training's auc: 0.996651	valid_1's auc: 0.996398
[30]	training's auc: 0.997942	valid_1's auc: 0.997726


In [63]:
submission['truth_tamil'] = test['tamil']
submission['truth_hindi'] = test['hindi']
submission['truth_english'] = test['english']

In [65]:
submission.head(10)

Unnamed: 0,text,hindi,tamil,english,truth_tamil,truth_hindi,truth_english
0,How to apply for home loan,0.016086,0.017944,0.975755,0,0,1
1,what are the Documents that are required for h...,0.0016,0.005283,0.982159,0,0,1
2,What Is the Interest Rate and Annual Percentag...,0.002231,0.004397,0.986083,0,0,1
3,Is There a Prepayment Penalty,0.000994,0.01965,0.982938,0,0,1
4,How Much Time Do You Need to Fund,0.005385,0.025521,0.908566,0,0,1
5,Home loan eligibility,0.05213,0.739912,0.355284,0,0,1
6,How to check my monthly emi,0.006831,0.106045,0.89569,0,0,1
7,How does tenure affect cost of loan,0.008564,0.09215,0.909427,0,0,1
8,What security will i have to provide,0.001396,0.022088,0.975999,0,0,1
9,maximum tenure for home loan,0.033517,0.211217,0.888633,0,0,1


In [73]:
submission['lang_detected'] = submission[['hindi','tamil','english']].idxmax(axis=1)

In [76]:
submission['actual_lang'] = submission[['truth_hindi','truth_tamil','truth_english']].idxmax(axis=1)

In [78]:
categories = {'truth_english': 'english', 'truth_hindi': 'hindi', 'truth_tamil': 'tamil'}
submission['actual_lang'] = submission['actual_lang'].map(categories)

In [81]:
lang = {'hindi':1,'tamil':2,'english':3}

In [82]:
submission['y_pred'] = submission['lang_detected'].map(lang)
submission['actual'] = submission['actual_lang'].map(lang)

In [83]:
submission.head()

Unnamed: 0,text,hindi,tamil,english,truth_tamil,truth_hindi,truth_english,lang_detected,actual_lang,y_pred,actual
0,How to apply for home loan,0.016086,0.017944,0.975755,0,0,1,english,english,3,3
1,what are the Documents that are required for h...,0.0016,0.005283,0.982159,0,0,1,english,english,3,3
2,What Is the Interest Rate and Annual Percentag...,0.002231,0.004397,0.986083,0,0,1,english,english,3,3
3,Is There a Prepayment Penalty,0.000994,0.01965,0.982938,0,0,1,english,english,3,3
4,How Much Time Do You Need to Fund,0.005385,0.025521,0.908566,0,0,1,english,english,3,3


In [91]:
from pycm import *
import numpy as np

In [92]:
y_actu = np.array(submission['actual'])
y_pred = np.array(submission['y_pred'])

In [93]:
cm = pycm.ConfusionMatrix(y_actu, y_pred)

In [94]:
print(cm)

Predict  1        2        3        
Actual
1        41       10       1        

2        0        52       0        

3        0        2        50       





Overall Statistics : 

95% CI                                                            (0.87329,0.96004)
ACC Macro                                                         0.94444
ARI                                                               0.76689
AUNP                                                              0.9375
AUNU                                                              0.9375
Bangdiwala B                                                      0.84874
Bennett S                                                         0.875
CBA                                                               0.85417
CSI                                                               0.84763
Chi-Squared                                                       245.74265
Chi-Squared DF                                                    4

In [96]:
submission.to_csv('lang_detect_result.csv',index=False)