In [16]:
import gc
import pandas as pd

from scipy.sparse import csr_matrix, hstack

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectFromModel

from sklearn.linear_model import LogisticRegression
import lightgbm as lgb

In [17]:
# Target label columns; one binary classifier is trained for each entry.
class_names = [
    'hindi',
    'tamil',
]

In [18]:
# Load the train and test splits. Every missing cell (in any column) is
# replaced with a single space before the frames are used.
train, test = (
    pd.read_csv(csv_path).fillna(' ')
    for csv_path in ('lang_detect_train.csv', 'lang_detect_test.csv')
)
#test = pd.read_csv('lang_detect_bank.csv', encoding = "ISO-8859-1").fillna(' ')

In [19]:
# Preview the first five training rows.
train.head(n=5)

Unnamed: 0.1,Unnamed: 0,id,text,hindi,tamil
0,188489,id28907,math welipaduch sudndratirgamaiya ethirfu paya...,0.0,1.0
1,81597,da3aa03d2f5c38c4,mesh raashi bhagyonnati ke achche avasar pra...,1.0,0.0
2,29813,4f2741560410793a,rajasthan lok seva garanti adhinium ke tahat...,1.0,0.0
3,67938,b5bdfe3c2a12ad56,jonpur yuva kisi bhi raashtra kii saamaajik or...,1.0,0.0
4,90047,f0e8f46330543518,awaas ke anuroop shesh raashi jamaa karen haj ...,1.0,0.0


In [20]:
# Raw text of each split, plus the combined corpus (train rows first, then
# test rows) that the vectorizer is fitted on.
train_text, test_text = train['text'], test['text']
all_text = pd.concat([train_text, test_text], axis=0)

In [21]:
# Word-level TF-IDF over unigrams and bigrams, with sublinear tf scaling,
# unicode accent stripping and a vocabulary capped at 50,000 entries.
word_tfidf_options = dict(
    sublinear_tf=True,
    strip_accents='unicode',
    analyzer='word',
    token_pattern=r'\w{1,}',
    ngram_range=(1, 2),
    max_features=50000,
)
word_vectorizer = TfidfVectorizer(**word_tfidf_options)

# The vocabulary and IDF weights are learned from train and test text together.
word_vectorizer.fit(all_text)
print('Word TFIDF 1/3')

# Project each split onto the fitted vocabulary.
train_word_features = word_vectorizer.transform(train_text)
print('Word TFIDF 2/3')

test_word_features = word_vectorizer.transform(test_text)
print('Word TFIDF 3/3')

Word TFIDF 1/3
Word TFIDF 2/3
Word TFIDF 3/3


In [22]:
# Character n-gram TF-IDF features -- currently DISABLED.
# Nothing in this cell executes, so train_char_features / test_char_features
# are never created and only the word-level features are stacked afterwards.
# Re-enabling it also requires adding both matrices to the hstack() calls.
# char_vectorizer = TfidfVectorizer(
#     sublinear_tf=True,
#     strip_accents='unicode',
#     analyzer='char',
#     ngram_range=(2, 6),
#     max_features=50000)
# char_vectorizer.fit(all_text)
# print('Char TFIDF 1/3')
# train_char_features = char_vectorizer.transform(train_text)
# print('Char TFIDF 2/3')
# test_char_features = char_vectorizer.transform(test_text)
# print('Char TFIDF 3/3')

In [23]:
# Assemble the final feature matrices (only the word TF-IDF block is stacked
# at present).
# CSR is requested explicitly: depending on the SciPy version, hstack() with
# no format may hand back a COO matrix, which the consumers of these matrices
# would then have to convert on every use. Asking for CSR once up front keeps
# the values identical and avoids those repeated conversions.
train_features = hstack([train_word_features], format='csr')
print('HStack 1/2')
test_features = hstack([test_word_features], format='csr')
print('HStack 2/2')

HStack 1/2
HStack 2/2


In [24]:
# Remove the raw text column from the training frame; the text itself was
# already captured in train_text before this point.
# The drop is non-mutating and uses errors='ignore' so the cell can be run
# again safely: an in-place drop raises KeyError on the second execution
# because the column is already gone.
train = train.drop(columns='text', errors='ignore')

# Optional memory clean-up, intentionally left disabled: `test` is still read
# by later cells, and the *_char_features names are never created while the
# char vectorizer cell is commented out.
# del test
# del train_text
# del test_text
# del all_text
# del train_char_features
# del test_char_features
# del train_word_features
# del test_word_features
# gc.collect()

In [25]:
# Start the results frame with the test text; prediction columns are added later.
submission = pd.DataFrame({'text': test['text']})

In [26]:
import joblib

In [27]:
# Train one binary LightGBM model per language label.
#
# For each label:
#   1. fit a logistic regression on the TF-IDF features and keep only the
#      features SelectFromModel retains at threshold 0.2,
#   2. hold out 5% of the training rows for validation,
#   3. boost for a fixed, per-label number of rounds while logging AUC,
#   4. write test-set predictions into `submission` and save the model.

# Booster settings shared by every label; built once because nothing in them
# depends on the class being trained.
# NOTE(review): per the LightGBM parameter docs, bagging_fraction only takes
# effect when bagging_freq is non-zero, and bagging_freq is not set here --
# confirm whether row bagging was intended.
params = {'learning_rate': 0.2,
          'application': 'binary',
          'num_leaves': 31,
          'verbosity': -1,
          'metric': 'auc',
          'data_random_seed': 2,
          'bagging_fraction': 0.8,
          'feature_fraction': 0.6,
          'nthread': 4,
          'lambda_l1': 1,
          'lambda_l2': 1}

# Number of boosting rounds to run for each label.
rounds_lookup = {'hindi': 80,
                 'tamil': 50}

# lgb.train() no longer accepts verbose_eval as of LightGBM 4.0; the
# log_evaluation callback is its replacement. Fall back to the old keyword on
# releases that predate the callback so either version prints every 10 rounds.
if hasattr(lgb, 'log_evaluation'):
    eval_logging = {'callbacks': [lgb.log_evaluation(period=10)]}
else:
    eval_logging = {'verbose_eval': 10}

for class_name in class_names:
    print(class_name)
    train_target = train[class_name]

    # The 'sag' solver shuffles the data, so without a fixed random_state the
    # learned coefficients -- and therefore the selected features -- can
    # differ from run to run.
    selector_estimator = LogisticRegression(solver='sag', random_state=144)
    sfm = SelectFromModel(selector_estimator, threshold=0.2)
    print(train_features.shape)
    # NOTE(review): the selector is fit on all training rows before the
    # hold-out split below, so the validation rows influence which features
    # are kept.
    train_sparse_matrix = sfm.fit_transform(train_features, train_target)
    print(train_sparse_matrix.shape)

    train_sparse_matrix, valid_sparse_matrix, y_train, y_valid = train_test_split(
        train_sparse_matrix, train_target, test_size=0.05, random_state=144)
    # Apply the same feature mask to the test matrix.
    test_sparse_matrix = sfm.transform(test_features)

    d_train = lgb.Dataset(train_sparse_matrix, label=y_train)
    d_valid = lgb.Dataset(valid_sparse_matrix, label=y_valid)
    # Evaluate on both the training and the held-out rows.
    watchlist = [d_train, d_valid]

    model = lgb.train(params,
                      train_set=d_train,
                      num_boost_round=rounds_lookup[class_name],
                      valid_sets=watchlist,
                      **eval_logging)
    submission[class_name] = model.predict(test_sparse_matrix)

    # save the model to disk
    filename = 'lang_detect_lgbmodel_' + class_name + '.txt'
    #joblib.dump(model, filename)
    model.save_model(filename)

hindi
(138159, 50000)
(138159, 8436)
[10]	training's auc: 0.997789	valid_1's auc: 0.998314
[20]	training's auc: 0.999494	valid_1's auc: 0.999579
[30]	training's auc: 0.999757	valid_1's auc: 0.999765
[40]	training's auc: 0.999846	valid_1's auc: 0.999811
[50]	training's auc: 0.999891	valid_1's auc: 0.99982
[60]	training's auc: 0.999916	valid_1's auc: 0.999841
[70]	training's auc: 0.99993	valid_1's auc: 0.999848
[80]	training's auc: 0.999939	valid_1's auc: 0.999852
tamil
(138159, 50000)
(138159, 8460)
[10]	training's auc: 0.997978	valid_1's auc: 0.99839
[20]	training's auc: 0.999512	valid_1's auc: 0.999538
[30]	training's auc: 0.999767	valid_1's auc: 0.99974
[40]	training's auc: 0.999849	valid_1's auc: 0.999785
[50]	training's auc: 0.999891	valid_1's auc: 0.999814


In [28]:
# Append the ground-truth labels from the test file next to the predictions
# (tamil first, then hindi) so they can be compared row by row.
submission = submission.assign(
    truth_tamil=test['tamil'],
    truth_hindi=test['hindi'],
)

In [32]:
# Inspect the last ten rows: predictions alongside the true labels.
submission.tail(n=10)

Unnamed: 0,text,hindi,tamil,truth_tamil,truth_hindi
146,processing fee iruka,0.026888,0.973802,1,0
147,enaku interest rate kami pana mudiyuma,0.009561,0.986066,1,0
148,endha credit score neenga use panreenga,0.074277,0.960613,1,0
149,promotion rates eydhachum offer iruka,0.026888,0.973802,1,0
150,unique benefits iruka,0.026888,0.973802,1,0
151,address epadi mathuradhu,0.026888,0.973802,1,0
152,fraud transaction eypadi tadukuradhu,0.026888,0.973802,1,0
153,credit card fraud policy ena,0.004737,0.981434,1,0
154,phone number epadi mathuradhu,0.026888,0.973802,1,0
155,credit card withdrawal limit ena,0.004737,0.981434,1,0


In [30]:
# Write the predictions and truth columns to disk, omitting the row index.
submission.to_csv(
    'lang_detect_result.csv',
    index=False,
)