In [1]:
import numpy as np
import pandas as pd
import gensim
import nltk
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
import matplotlib.pyplot as plt
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
import re
from bs4 import BeautifulSoup
import xgboost
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
from time import time
from nltk.tag import pos_tag
from nltk.tokenize import word_tokenize
from sklearn.svm import SVC, LinearSVC
import lightgbm as lgbm
from lightgbm import LGBMClassifier

In [2]:
data=pd.read_csv("./Desktop/nlp_a1/train.csv")

In [3]:
data1=pd.read_csv("./Desktop/nlp_a1/train.csv")

In [None]:
data['profile'].apply(lambda x: len(x.split(' '))).sum()

In [9]:
labels = data['profession'].unique()
y2id,id2y={},{}
for i,j in enumerate(labels):
    y2id[j]=i
    id2y[i]=j
    
data2=data.replace({'profession':y2id})

In [4]:
punctuation = re.compile('[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
Stopwords = set(stopwords.words('english'))

In [5]:
def preprocess1(txt):
    txt = txt.lower() 
    txt = punctuation.sub(' ', txt)
    text = BAD_SYMBOLS_RE.sub('', txt) 
    txt = ' '.join(word for word in txt.split() if word not in Stopwords) # delete stopwors from text
    return txt

def preprocess2(txt):
    txt = txt.lower() 
    txt = punctuation.sub(' ', txt)
    text = BAD_SYMBOLS_RE.sub('', txt) 
    return txt

In [10]:
data['profile']=data['profile'].apply(preprocess1)
data2['profile']=data2['profile'].apply(preprocess1)
data1['profile']=data1['profile'].apply(preprocess2)

In [None]:
def change_tag(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:          
        return None

def lematize(sent):
    lemmatizer=WordNetLemmatizer()
    tag_sent = pos_tag(word_tokenize(sent))
    update_tag = map(lambda x:(x[0], change_tag(x[1])),tag_sent)
    ret=[]
    for word,tag in update_tag:
        if tag==None:
            ret.append(word)
        else:
            ret.append(lemmatizer.lemmatize(word, tag))
    return ' '.join(ret)

In [None]:
data['profile'].apply(lambda x: len(x.split(' '))).sum()

In [None]:
data['profession'].values

In [None]:
for i in data.index:
    data['profile'][i]=lematize(data['profile'][i])

In [11]:
X = data2.profile
y = data2.profession
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state = 1667)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.1, random_state = 1667)

In [None]:
# Naive Bayes Classifier for Multinomial Models

from sklearn.naive_bayes import MultinomialNB

nb = Pipeline([('vect', CountVectorizer()),
               ('tfidf', TfidfTransformer()),
               ('clf', MultinomialNB()),
              ])
nb.fit(X_train, y_train)
my_tags=data['profession'].unique().tolist()
# %%time
from sklearn.metrics import classification_report
y_pred = nb.predict(X_test)

print('accuracy %s' % accuracy_score(y_pred, y_test))
print(classification_report(y_test, y_pred,target_names=my_tags))

In [None]:
# Stocastic Gradient Classifier

from sklearn.linear_model import SGDClassifier

sgd = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2))),
                #('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(loss='hinge', penalty='l2',alpha=1e-3, max_iter=50, tol=None)),
               ])

a=time()
sgd.fit(X_train, y_train)

y_pred_train = sgd.predict(X_train)
print('loss =', i)
print('train accuracy %s' % accuracy_score(y_pred_train, y_train))
y_pred = sgd.predict(X_test)
print('test accuracy %s' % accuracy_score(y_pred, y_test))
    
b=time()
print(b-a)
my_tags=data['profession'].unique()

print(classification_report(y_test, y_pred,target_names=my_tags))

In [None]:
# Random Forest Classifier

rfc = Pipeline([('vect', CountVectorizer(ngram_range=(1, 2))),
                #('tfidf', TfidfTransformer()),
                ('clf', RandomForestClassifier(max_depth=200, verbose=2)),
               ])

a=time()
rfc.fit(X_train, y_train)
y_pred_train = rfc.predict(X_train)
print('train accuracy %s' % accuracy_score(y_pred_train, y_train))
y_pred = rfc.predict(X_test)
print('test accuracy %s' % accuracy_score(y_pred, y_test))
b=time()
print(b-a)

my_tags=data['profession'].unique()
print(classification_report(y_test, y_pred,target_names=my_tags))

In [None]:
# Support Vector Classifier

svc = Pipeline([('vect', CountVectorizer(ngram_range=(1, 1))),
                #('tfidf', TfidfTransformer()),
                ('clf', SVC(verbose=True, class_weight='balanced', max_iter=100)),
               ])

a=time()
svc.fit(X_train, y_train)
y_pred_train = svc.predict(X_train)
print('train accuracy %s' % accuracy_score(y_pred_train, y_train))
y_pred = svc.predict(X_test)
print('test accuracy %s' % accuracy_score(y_pred, y_test))
b=time()
print(b-a)

my_tags=data['profession'].unique()
print(classification_report(y_test, y_pred,target_names=my_tags))

In [None]:
param_test = {'num_leaves': sp_randint(6, 50),
                                'min_child_samples': sp_randint(10, 400),
                                'min_child_weight': [1e-1,5e-1,5e-2, 1, 1e1, 1e2],
                                'subsample': sp_uniform(loc=0.3, scale=0.7),
                                'colsample_bytree': sp_uniform(loc=0.4, scale=0.6),
                                'reg_alpha': [100],
                                'reg_lambda': [0, 1e-1,1e-2, 5e-2,5e-1, 1, 3, 5, 10, 25, 50],
                                'learning_rate' :[1e-3, 5e-3, 1e-2, 3e-2, 5e-2, 1e-1, 2e-1],
                                'max_depth' : [3,5,10,15,20,25,30]}
fit_params = {"early_stopping_rounds":15,
                                "eval_metric" : 'mae',
                                "eval_set" : [(X_val,Y_val)],
                                'eval_names': ['valid'],
                                'verbose': 100000,
                                # 'categorical_feature': ['Sector'],
                                'sample_weight' : X_train_weight
                                }

In [15]:
# LightGBM 

vect = CountVectorizer(ngram_range=(1,2))
# tfidf = TfidfTransformer(sublinear_tf=True)

X_train_counts = vect.fit_transform(X_train)
X_val_counts = vect.transform(X_val)
# X_train_tfidf = tfidf.fit_transform(X_train_counts)

X_train_counts = X_train_counts.astype('float32')
X_val_counts = X_val_counts.astype('float32')   
y_train = y_train.astype('float32')
y_val = y_val.astype('float32')

lgbm_train = lgbm.Dataset(X_train_counts, y_train)
lgbm_eval = lgbm.Dataset(X_val_counts, y_val, reference=lgbm_train)

params = {
    'boosting_type':'gbdt',
    'objective':'multiclass',
#     'learning_rate': 0.02,
    'num_class': 28,
    'early_stopping': 100,
#     'num_iteration': 2000, 
#     'num_leaves': 31,
    'is_enable_sparse': 'true',
#     'tree_learner': 'data',
    'max_depth': 5, 
    'n_estimators': 50  
    }

clf_gbm = lgbm.train(params, train_set=lgbm_train, valid_sets=lgbm_eval)
predicted_LGBM = clf_gbm.predict(X_val_counts)

# lgbm = lgbm.LGBMClassifier(objective='multiclass', verbose=2)#, learning_rate=0.5, max_depth=20, num_leaves=50, n_estimators=120, max_bin=2000,)
# lgbm.fit(X_train_counts, y_train, verbose=2)
# predicted_LGBM = clf_LGBM.predict(X_val_counts)

print(accuracy_score(y_val, np.argmax(predicted_LGBM,axis=1)))

You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 77625
[LightGBM] [Info] Number of data points in the train set: 103562, number of used features: 26246
[LightGBM] [Info] Start training from score -3.212015
[LightGBM] [Info] Start training from score -1.205957
[LightGBM] [Info] Start training from score -2.495527
[LightGBM] [Info] Start training from score -3.374068
[LightGBM] [Info] Start training from score -2.772222
[LightGBM] [Info] Start training from score -3.934601
[LightGBM] [Info] Start training from score -3.062843
[LightGBM] [Info] Start training from score -4.062434
[LightGBM] [Info] Start training from score -2.319844
[LightGBM] [Info] Start training from score -5.579218
[LightGBM] [Info] Start training from score -3.671287
[LightGBM] [Info] Start training from score -4.604803
[LightGBM] [Info] Start training from score -5.036180
[LightGBM] [Info] Start training from score 

[3]	valid_0's multi_logloss: 2.1963
[4]	valid_0's multi_logloss: 2.12157
[5]	valid_0's multi_logloss: 2.05797
[6]	valid_0's multi_logloss: 2.00075


[7]	valid_0's multi_logloss: 1.94975
[8]	valid_0's multi_logloss: 1.90306
[9]	valid_0's multi_logloss: 1.86067
[10]	valid_0's multi_logloss: 1.82124


[11]	valid_0's multi_logloss: 1.78447
[12]	valid_0's multi_logloss: 1.75044
[13]	valid_0's multi_logloss: 1.71874
[14]	valid_0's multi_logloss: 1.6888


[15]	valid_0's multi_logloss: 1.66095
[16]	valid_0's multi_logloss: 1.63442
[17]	valid_0's multi_logloss: 1.60946
[18]	valid_0's multi_logloss: 1.58567


[19]	valid_0's multi_logloss: 1.56317
[20]	valid_0's multi_logloss: 1.54212
[21]	valid_0's multi_logloss: 1.52187
[22]	valid_0's multi_logloss: 1.50252


[23]	valid_0's multi_logloss: 1.48391
[24]	valid_0's multi_logloss: 1.46624
[25]	valid_0's multi_logloss: 1.44941
[26]	valid_0's multi_logloss: 1.43327


[27]	valid_0's multi_logloss: 1.41755
[28]	valid_0's multi_logloss: 1.40241
[29]	valid_0's multi_logloss: 1.38759
[30]	valid_0's multi_logloss: 1.37351


[31]	valid_0's multi_logloss: 1.36009
[32]	valid_0's multi_logloss: 1.34719
[33]	valid_0's multi_logloss: 1.33469
[34]	valid_0's multi_logloss: 1.32263


[35]	valid_0's multi_logloss: 1.31086
[36]	valid_0's multi_logloss: 1.29954
[37]	valid_0's multi_logloss: 1.28863
[38]	valid_0's multi_logloss: 1.27816


[39]	valid_0's multi_logloss: 1.26777
[40]	valid_0's multi_logloss: 1.25773
[41]	valid_0's multi_logloss: 1.24794
[42]	valid_0's multi_logloss: 1.23882


[43]	valid_0's multi_logloss: 1.22965
[44]	valid_0's multi_logloss: 1.22119
[45]	valid_0's multi_logloss: 1.21288
[46]	valid_0's multi_logloss: 1.20462


[47]	valid_0's multi_logloss: 1.19663
[48]	valid_0's multi_logloss: 1.18895
[49]	valid_0's multi_logloss: 1.18158
[50]	valid_0's multi_logloss: 1.17425
Did not meet early stopping. Best iteration is:
[50]	valid_0's multi_logloss: 1.17425


ValueError: Classification metrics can't handle a mix of multiclass and continuous-multioutput targets

In [22]:
print(accuracy_score(y_val, np.argmax(predicted_LGBM,axis=1)))

0.7117406795863388


In [24]:
print(classification_report(y_val, np.argmax(predicted_LGBM,axis=1),target_names=data['profession'].unique()))

                   precision    recall  f1-score   support

          teacher       0.69      0.38      0.49       467
        professor       0.58      0.91      0.71      3477
         attorney       0.88      0.76      0.81       955
          surgeon       0.78      0.52      0.63       412
     photographer       0.86      0.68      0.76       719
          painter       0.88      0.55      0.68       250
     psychologist       0.87      0.50      0.64       500
        filmmaker       0.76      0.65      0.70       232
        physician       0.85      0.82      0.84      1164
interior_designer       0.84      0.37      0.52        43
        architect       0.73      0.40      0.52       304
        dietitian       0.79      0.61      0.69       115
           pastor       0.67      0.38      0.48        82
           rapper       0.88      0.57      0.69        37
       journalist       0.80      0.41      0.54       556
          dentist       0.91      0.93      0.92       