In [None]:
import pandas as pd
import numpy as np
import re
import nltk
import datetime
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk import SnowballStemmer
from nltk.stem import LancasterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.tokenize import WhitespaceTokenizer
import inflect
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

[nltk_data] Downloading package punkt to /home/jupyter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/jupyter/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/jupyter/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


## Generate Dataset

In [None]:
class Dataset(object):
    """Load the score and corpus CSV files and build cleaned text features.

    On construction the two files are read and every corpus text is
    tokenized, normalized and lemmatized.

    Attributes set by the constructor:
        score: DataFrame read from ``scoreSource`` with ``Name`` renamed to
            ``company``.
        corpus: DataFrame read from ``corpusSource`` with columns
            ``company`` and ``text``; rows containing NaN are dropped.
        x_feature: numpy array with one space-joined string of normalized
            tokens per corpus row.
    """

    def __init__(self, corpusSource, scoreSource):
        """Store both file paths and immediately build ``x_feature``."""
        self._corpusSource = corpusSource
        self._scoreSource = scoreSource
        self._genX()

    def _readData(self, filePath1, filePath2):
        """Read the score CSV (``filePath1``) and the corpus CSV (``filePath2``).

        The corpus file is read with explicit column names, i.e. it is
        expected to have no header row.
        """
        score = pd.read_csv(filePath1)
        # 'Unnamed: 0' is only present when the CSV was written with its
        # index; tolerate files that were saved without it.
        score = score.drop(columns=['Unnamed: 0'], errors='ignore')
        score = score.rename(columns={"Name": "company"})
        corpus = pd.read_csv(filePath2, names=['company', 'text'])
        self.score = score
        self.corpus = corpus.dropna()

    def get_wordnet_pos(self, pos_tag):
        """Map a POS tag string to a WordNet POS constant (default: noun)."""
        tag_to_wordnet = {
            'J': wordnet.ADJ,
            'V': wordnet.VERB,
            'N': wordnet.NOUN,
            'R': wordnet.ADV,
        }
        return tag_to_wordnet.get(pos_tag[:1], wordnet.NOUN)

    def remove_punctuation(self, words):
        """Remove punctuation from list of tokenized words.

        Tokens that become empty after stripping are dropped.
        """
        stripped = (re.sub(r'[^\w\s]', '', word) for word in words)
        return [word for word in stripped if word != '']

    def remove_special(self, words):
        """Remove special signs like &* and drop tokens that become empty."""
        stripped = (re.sub(r'[-,$()#+&*]', '', word) for word in words)
        return [word for word in stripped if word != '']

    def replace_numbers(self, words):
        """Replace all integer occurrences in list of tokenized words with textual representation.

        A token whose conversion fails is skipped (best effort).
        """
        engine = inflect.engine()
        new_words = []
        for word in words:
            try:
                if word.isdigit():
                    new_words.append(engine.number_to_words(word))
                else:
                    new_words.append(word)
            except Exception:
                # Deliberately best-effort: drop the offending token, but do
                # not swallow KeyboardInterrupt/SystemExit as a bare
                # ``except:`` would.
                continue
        return new_words

    def remove_stopwords(self, words):
        """Remove stop words from list of tokenized words."""
        myStopWords = []  # extension point for project-specific stop words
        # A set gives constant-time membership tests; the local name avoids
        # shadowing the imported ``stopwords`` module.
        stop_set = set(nltk.corpus.stopwords.words('english'))
        stop_set.update(myStopWords)
        return [word for word in words if word not in stop_set]

    def normalize_lemmatize(self, words):
        """Run the full cleaning pipeline on a list of tokens.

        Steps, in order: strip special signs, strip punctuation, spell out
        digit-only tokens, remove English stop words, POS-tag, lemmatize
        using the tag, and finally drop tokens of length one or less.
        """
        words = self.remove_special(words)
        words = self.remove_punctuation(words)
        words = self.replace_numbers(words)
        words = self.remove_stopwords(words)
        tagged = pos_tag(words)  # POS Tagging
        # One lemmatizer instance is enough for the whole token list.
        lemmatizer = WordNetLemmatizer()
        lemmas = [lemmatizer.lemmatize(token, self.get_wordnet_pos(tag))
                  for token, tag in tagged]
        # remove words with only one letter
        return [lemma for lemma in lemmas if len(lemma) > 1]

    def _genX(self):
        """Read both files and build ``self.x_feature`` from the corpus text.

        The text is taken from the second corpus column by position. The
        corpus DataFrame itself is left untouched: results are collected in
        a new list instead of being written back into ``corpus.values``.
        """
        self._readData(self._scoreSource, self._corpusSource)
        texts = self.corpus.iloc[:, 1].tolist()
        cleaned = [
            " ".join(self.normalize_lemmatize(nltk.word_tokenize(text)))
            for text in texts
        ]
        self.x_feature = np.array(cleaned)
        
        
# Keyword arguments make it explicit which file is the corpus and which holds the scores.
data = Dataset(corpusSource="real whole corpus.csv",
               scoreSource="real BB company.csv")

## Build Model

In [None]:
class LDA_XGB:
    """LDA topic features over a text corpus, fed into an XGBoost regressor.

    Typical call order: ``find_LDA`` -> ``gen_LDAfeature`` -> ``gen_XY`` ->
    ``select_model`` -> ``train`` / ``predict``.
    """

    def __init__(self, data):
        """Copy corpus, score and text features from ``data`` and vectorize.

        Args:
            data: object exposing ``corpus``, ``score`` and ``x_feature``
                attributes (e.g. a ``Dataset`` instance).
        """
        self.corpus = data.corpus
        self.score = data.score
        self.x_feature = data.x_feature
        cv = CountVectorizer(max_df=0.95, min_df=2,
                             stop_words='english')
        # Document-term count matrix used by every LDA fit below.
        self.df = cv.fit_transform(self.x_feature)

    def find_LDA(self, max_topics, random_state=None):
        """Fit one LDA model per topic count in ``2..max_topics`` and pick the best.

        "Best" is the model with the lowest perplexity on the same matrix it
        was fitted on. Perplexities are printed and plotted as a side effect.

        Args:
            max_topics: largest number of topics to try (must be >= 2).
            random_state: optional seed forwarded to every LDA model so that
                runs are reproducible; ``None`` keeps the unseeded behavior.

        Returns:
            Tuple ``(best_model, best_n_topic)``.

        Raises:
            ValueError: if ``max_topics`` is smaller than 2.
        """
        if max_topics < 2:
            raise ValueError("max_topics must be at least 2")
        n_topics = range(2, max_topics + 1)
        perplexityLst = []
        lda_models = []
        for n_topic in n_topics:
            lda = LatentDirichletAllocation(n_components=n_topic,
                                            max_iter=100,
                                            learning_method='batch',
                                            evaluate_every=200,
                                            random_state=random_state,
                                            verbose=0)
            lda.fit(self.df)
            perplexity = lda.perplexity(self.df)
            perplexityLst.append(perplexity)
            lda_models.append(lda)
            print ("# of Topic: %d, " % n_topic)
            print ("Perplexity Score %0.3f" % perplexity)
        fig, ax = plt.subplots()
        ax.plot(n_topics, perplexityLst)
        ax.set_xlabel("# of topics")
        ax.set_ylabel("Approximate Perplexity")
        ax.grid(True)
        plt.show()
        best_index = perplexityLst.index(min(perplexityLst))
        best_n_topic = n_topics[best_index]
        best_model = lda_models[best_index]
        print ("Best # of Topic: ", best_n_topic)
        return best_model, best_n_topic

    def gen_LDAfeature(self, model):
        """Store the document-topic matrix of ``model`` in ``self.LDA_features``.

        A model that is already fitted (has ``components_``) is only used to
        transform, so the features come from exactly the model that was
        selected by ``find_LDA`` rather than from a fresh refit. An unfitted
        model is fitted first.
        """
        if hasattr(model, 'components_'):
            self.LDA_features = model.transform(self.df)
        else:
            self.LDA_features = model.fit_transform(self.df)

    def gen_XY(self, best_n_topic):
        """Join topic features with the score table and return ``(X, y)``.

        Corpus columns and topic features are stacked side by side, merged
        with ``self.score`` on ``company`` (inner join), and de-duplicated
        per company.

        Args:
            best_n_topic: number of topic columns to take as features.

        Returns:
            ``X``: float array of the topic columns (columns 2 onward of the
            merged frame); ``y``: the last column of the merged frame.
        """
        df_new = pd.DataFrame(np.hstack((self.corpus.values, self.LDA_features)))
        df_new = df_new.rename(columns={0: 'company'})
        # Use the instance's score table; the bare name ``score`` is not
        # defined in this scope.
        merged = (pd.merge(df_new, self.score, on=['company'], how='inner')
                  .drop_duplicates('company')
                  .reset_index(drop=True))
        # hstack with the string columns yields object dtype; cast the topic
        # weights back to float for the regressor.
        X = merged.iloc[:, 2:2 + best_n_topic].values.astype(float)
        y = merged.iloc[:, -1]
        return X, y

    def select_model(self, x, y, nthread, objective, learning_rate, max_depth,
                     min_child_weight, subsample, colsample_bytree, n_estimators,
                     cv=2, n_jobs=5):
        """Grid-search an ``XGBRegressor`` and keep the best estimator.

        Every hyper-parameter argument is a list of candidate values. The
        best score and parameters are printed, and the best estimator is
        stored in ``self.best_model``.

        Args:
            x, y: training features and targets.
            cv: number of cross-validation folds (default 2).
            n_jobs: parallel workers for the grid search (default 5).
        """
        parameters = {'nthread': nthread,  # when use hyperthread, xgboost may become slower
                      'objective': objective,
                      'learning_rate': learning_rate,  # so called `eta` value
                      'max_depth': max_depth,
                      'min_child_weight': min_child_weight,
                      'subsample': subsample,
                      'colsample_bytree': colsample_bytree,
                      'n_estimators': n_estimators}

        xgb_grid = GridSearchCV(XGBRegressor(),
                                parameters,
                                cv=cv,
                                n_jobs=n_jobs,
                                verbose=True)
        xgb_grid.fit(x, y)
        print("best score:", xgb_grid.best_score_)
        print("best params:", xgb_grid.best_params_)
        self.best_model = xgb_grid.best_estimator_

    def train(self, x, y):
        """Fit ``self.best_model`` on ``x``/``y``; requires ``select_model`` first."""
        self.best_model.fit(x, y)

    def predict(self, x):
        """Return predictions of ``self.best_model`` for ``x``."""
        return self.best_model.predict(x)
            

In [None]:
# Vectorize the cleaned corpus and wrap it in the LDA + XGBoost pipeline.
model = LDA_XGB(data=data)

In [None]:
# Try 2..20 topics and keep the LDA model with the lowest perplexity.
best_LDA, best_n_topic = model.find_LDA(max_topics=20)

# of Topic: 2, 
Perplexity Score 1602.016
# of Topic: 3, 
Perplexity Score 1545.424
# of Topic: 4, 
Perplexity Score 1493.590
# of Topic: 5, 
Perplexity Score 1478.273
# of Topic: 6, 
Perplexity Score 1452.974
# of Topic: 7, 
Perplexity Score 1439.899
# of Topic: 8, 
Perplexity Score 1434.327
# of Topic: 9, 
Perplexity Score 1415.307
# of Topic: 10, 
Perplexity Score 1414.075
# of Topic: 11, 
Perplexity Score 1402.609
# of Topic: 12, 
Perplexity Score 1396.207
# of Topic: 13, 
Perplexity Score 1380.062
# of Topic: 14, 
Perplexity Score 1375.141
# of Topic: 15, 
Perplexity Score 1364.635
# of Topic: 16, 
Perplexity Score 1374.033
# of Topic: 17, 
Perplexity Score 1357.873
# of Topic: 18, 
Perplexity Score 1349.202
# of Topic: 19, 
Perplexity Score 1353.298
# of Topic: 20, 
Perplexity Score 1345.372


In [None]:
# Compute the document-topic features with the selected LDA model, then
# build the regression inputs. gen_XY is a method of `model`, not a
# module-level function.
model.gen_LDAfeature(best_LDA)
X, y = model.gen_XY(best_n_topic)

In [None]:
# train_test_split returns (X_train, X_test, y_train, y_test) in that order;
# unpack accordingly so features and targets are not mixed up.
# A fixed random_state makes the split reproducible across runs.
trainX, testX, trainY, testY = train_test_split(
    X, y, test_size=0.2, shuffle=True, random_state=0)

In [None]:
# Grid search over learning rate and tree depth; the other parameters are fixed.
model.select_model(
    trainX,
    trainY,
    nthread=[4],
    objective=['reg:squarederror'],
    learning_rate=[.03, 0.05, .07],
    max_depth=[5, 6, 7],
    min_child_weight=[4],
    subsample=[0.7],
    colsample_bytree=[0.7],
    n_estimators=[500],
)

Fitting 2 folds for each of 9 candidates, totalling 18 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  18 out of  18 | elapsed:   20.3s finished


GridSearchCV(cv=2, error_score=nan,
             estimator=XGBRegressor(base_score=None, booster=None,
                                    colsample_bylevel=None,
                                    colsample_bynode=None,
                                    colsample_bytree=None, gamma=None,
                                    gpu_id=None, importance_type='gain',
                                    interaction_constraints=None,
                                    learning_rate=None, max_delta_step=None,
                                    max_depth=None, min_child_weight=None,
                                    missing=nan, monotone_constraints=None,
                                    n_estima...
                                    validate_parameters=None, verbosity=None),
             iid='deprecated', n_jobs=5,
             param_grid={'colsample_bytree': [0.7],
                         'learning_rate': [0.03, 0.05, 0.07],
                         'max_depth': [5, 6, 7], 'min_chil

In [None]:
# Fit the estimator chosen by the grid search on the training split.
model.train(x=trainX, y=trainY)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints='',
             learning_rate=0.03, max_delta_step=0, max_depth=7,
             min_child_weight=4, missing=nan, monotone_constraints='()',
             n_estimators=500, n_jobs=4, nthread=4, num_parallel_tree=1,
             objective='reg:squarederror', random_state=0, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=0.7,
             tree_method='exact', validate_parameters=1, verbosity=None)

In [None]:
# Mean squared error on the training split
MSE = np.mean(np.square(model.predict(trainX) - trainY))
print(MSE)

2.8207264822547455


In [None]:
# Mean squared error on the held-out test split
MSE1 = np.mean(np.square(model.predict(testX) - testY))
print(MSE1)

159.43091710030242
