# import library

In [1]:
import nltk
import numpy as np
import pandas as pd
from nltk import tokenize
from nltk.stem import PorterStemmer 
from nltk.tag import StanfordPOSTagger
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import f1_score
from nltk.corpus import sentiwordnet as swn
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer



# module

In [2]:
def search_adj(sentence, keyword, skip_before_keyword=False, skip_after_keyword=False, prev_bag_of_word=1, after_bag_of_word=5) :    
    st = StanfordPOSTagger('./english-bidirectional-distsim.tagger', './stanford-postagger-3.9.2.jar')
    
    punctuation = ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'] # remove it if you need punctuation 
    
    word_tokens = word_tokenize(sentence) 
    filtered_sentence = [w for w in word_tokens if not w in punctuation] 
    filtered_sentence = [] 
  
    for w in word_tokens: 
        if w not in punctuation:
            filtered_sentence.append(w)
            
    list_word_tag = st.tag([(" ").join(filtered_sentence)])
    found_keyword = not skip_before_keyword
    list_sentiment_word = []
    found_adjective = 0
    keyword_idx = -1
    for idx in range (len(list_word_tag)) :
        if (list_word_tag[idx][0] == keyword) :
            keyword_idx = idx
            break
    
    if (skip_before_keyword) :
        prev_bag_of_word = 0
    
    if (skip_after_keyword) :
        after_bag_of_word = 0
        
    for idx in range (len(list_word_tag)) :
        if (idx > keyword_idx + after_bag_of_word) :
                break
                
        if ((idx >= keyword_idx - prev_bag_of_word) and list_word_tag[idx][1] == 'JJ') :
            found_adjective += 1
            sentiment_word = list_word_tag[idx][0]
            for adverb_idx in range (idx-1, -1, -1) :
                if (list_word_tag[adverb_idx][1] == 'RB') :
                    sentiment_word = list_word_tag[adverb_idx][0] + " " + sentiment_word
                else :
                    break
            list_sentiment_word.append(sentiment_word)
            
    return (list_sentiment_word, found_adjective)

In [3]:
def preprocessing(document, skip_before_keyword=False, skip_after_keyword=False, prev_bag_of_word=1, after_bag_of_word=5) :
    act = ['acting','role playing','act',' actress','actor','role','portray','character','villain','performance', 'play', 'perform', 'doing']
    plot = ['plot','story','storyline','tale','romance','dialog','script','storyteller',' ending','storytelling','revenge','betrayal','writing']
    
    sid = SentimentIntensityAnalyzer()
    
    act_sentence = []
    plot_sentence = []
    act_score = 0
    plot_score = 0
    tokenizer = tokenize.PunktSentenceTokenizer()
    ps = PorterStemmer()
    
    list_sentence = tokenizer.sentences_from_text(document)
    for sentence in list_sentence :
        is_sentence_act = False
        is_sentence_plot = False
        list_word = tokenize.word_tokenize(sentence)
        for word in list_word :
            if (ps.stem(word.lower()) in act or word.lower() in act):
                list_adj_sentence, adj_found = search_adj(sentence, word, prev_bag_of_word=prev_bag_of_word, after_bag_of_word=after_bag_of_word)
                for adj_word in (list_adj_sentence) :
                    act_sentence.append(adj_word)
                    act_score += sid.polarity_scores(adj_word)['pos'] - sid.polarity_scores(adj_word)['neg']
            if (ps.stem(word.lower()) in plot or word.lower() in plot):
                list_adj_sentence, adj_found = search_adj(sentence, word, prev_bag_of_word=prev_bag_of_word, after_bag_of_word=after_bag_of_word)
                for adj_word in (list_adj_sentence) :
                    plot_sentence.append(adj_word)
                    plot_score +=  sid.polarity_scores(adj_word)['pos'] - sid.polarity_scores(adj_word)['neg']
#     return ([act_sentence,plot_sentence])
    return ([act_score,plot_score])

In [4]:
def list_to_sentence(list_of_word) :
    return((" ").join(list_of_word))

# CONTOH PENGGUNAAN

In [5]:
import time
start = time.time()

In [6]:
# input data
data = pd.read_excel("../input/datasetnlp_nographic_no_neutral.xlsx")
data = data
data.drop(columns=['title'], inplace=True)
for column_name in ['acting', 'plot'] :
    data.loc[data[column_name] == 'positive', column_name] = 1
    data.loc[data[column_name] == 'negative', column_name] = -1

In [7]:
# preprocessing
tokenizer = tokenize.PunktSentenceTokenizer()
data.text = data.text.apply(lambda x: preprocessing(x, prev_bag_of_word=1, after_bag_of_word=10))

'many'

'important'

'squeaky mawkish'

'good'

'minor'

'minor'

'dull'

'real'

'wild'

'main'

'superb'

'excellent'

'own'

'uplifting'

'supporting'

'excellent'

'countless'

'different'

'male'

'new'

'fine'

'complete'

'subtle'

'last'

'serious'

'subtle'

'young'

'so natural'

'special'

'pivotal'

'racist'

'pivotal'

'racist'

'fluff'

'nice'

'little'

'terse'

'puzzling'

'remarkable'

'bad'

'little'

'usual'

'fine'

'fine'

'significant'

'tough'

'tough'

'constant'

'unrelenting'

'fine'

'simple'

'social'

'polar'

'fine'

'creaky'

'creaky'

'interesting'

'old'

'interesting'

'old'

'recently dead'

'always fascinating'

'usual'

'such'

'progressive'

'multiple'

'multiple'

'superb'

'organized'

'fascinating'

'superb'

'gifted'

'significant'

'only'

'significant'

'only'

'major'

'superfluous'

'gratuitous'

'wide'

'wide'

'similar'

'too much'

'Simple'

'too shallow'

'individual'

'past'

'mesmerizing'

'old'

'impossible'

'terrific'

'interested'

'former'

'former'

'visual'

'many'

'good'

'great'

'entertaining'

'expected'

'light'

'pedestrian'

'pedestrian'

'sensitive'

'acting'

'very special'

'warm'

'young'

'young'

'full'

'weak'

'good'

'head-nodding'

'own'

'breakout'

'brilliant'

'main'

'also exceptionally memorable'

'also exceptionally memorable'

'difficult'

'many'

'terrifyingly real'

'difficult'

'many'

'terrifyingly real'

'compelling'

'alcoholic'

'unreal'

'unreal'

'tiresome'

'current'

'so minimal'

'other'

'bad'

'just awful'

'minimal'

'classy'

'classy'

'tremendous'

'physical'

'complete'

'difficult'

'exceptional'

'exceptional'

'excellent'

'easy'

'bumpy'

'so trite'

'repetitive'

'stereotyped'

'so successfully enigmatic'

'as straightforward'

'as straightforward'

'superb'

'minor'

'significant'

'significant'

"'supernatural"

'normal'

'good'

'first'

'magnificent'

'first'

'rather implausible'

'rather implausible'

'previous'

'solid'

'leading'

'hauntingly real'

'superb'

'equally strong'

'equally strong'

'lush'

'lush'

'stunningly lovely'

'same'

'too much'

'illuminating'

'entire'

'entire'

'unbelievable'

'alive'

'little'

'very touching'

'truly brilliant'

'as dark'

'deserving'

'impossibly puerile'

'such'

'poor'

'quasi-true'

'veteran'

'not real'

'wretched'

'difficult'

'little'

'little'

'right'

'rather silly'

'overlong'

'credible'

'as usual'

'good'

'good'

'musical'

'musical'

'minor'

'strong'

'excellent'

'important'

'social'

'consummate'

'solid'

'solid'

'final'

'little'

'absurd'

'superb'

'equal'

'equal'

'bravura'

'facial'

'gifted'

'stunning'

'male'

'new'

'many'

'gifted'

'gifted'

'different'

'beloved'

'other'

'playful'

'incipient'

'strange'

'impossibly difficult'

'extraordinary'

'public'

'raw'

'dirty'

'raw'

'dirty'

'main'

'unfiltered'

'military'

'magnetic'

'evil'

'very strong'

'one-line'

'direct'

'internet-like'

'very straight'

'very straight'

'very straight'

'very straight'

'requisite'

'creepy'

'cold'

'exceptional'

'supporting'

'particularly fine'

'bizarre'

'very bright'

'brilliant'

'top'

'brilliant'

'top'

'brilliant'

'top'

'original'

'original'

'superlative'

'extraordinary'

'extraordinary'

'simple'

'somewhat despondent'

'elderly'

'due'

'stellar'

'stellar'

'little'

'remarkably fine'

'elderly'

'subtle'

'ambiguous'

'good'

'criminal'

'such'

'Swiss'

'full'

'disparate'

'Swiss'

'full'

'interesting'

'such'

'little'

'pretty'

'unknown'

'unknown'

'fine'

'able'

'fairly straightforward'

'such'

'iconic'

'dichotomous'

'British'

'Indian'

'significant'

'significant'

'many'

'visual'

'surreal'

'self-conscious'

'flat'

'high-octane'

'same'

'usual'

'impossible'

'impossible'

'repetitious'

'fine'

'fine'

'intelligent'

'superb'

'intelligent'

'intelligent'

'overall'

'worthwhile'

'worthwhile'

'implausible'

'worthwhile'

'lead'

'quite good'

'momentary'

'good'

'good'

'requisite'

'good'

'evil'

'good'

'requisite'

'good'

'evil'

'good'

'creepy'

'teenaged'

'sound'

'evocative'

'personal'

'beefy'

'fine'

'firm'

'fragmented'

'three-dimensional'

'strong'

'meaty'

In [82]:
data_act = pd.DataFrame([])
# data_act['text'] = [i[0] for i in data.iloc[:,0].tolist()]
# data_act.text = data_act.text.apply(lambda x: list_to_sentence(x))
data_act['sentiment_score'] = [i[0] for i in data.iloc[:,0].tolist()]
data_act['label'] = data.iloc[:,1]
data_act.loc[data_act.label == -1,'label'] = 0

In [81]:
data_plot = pd.DataFrame([])
# data_plot['text'] = [i[1] for i in data.iloc[:,0].tolist()]
# data_plot.text = data_plot.text.apply(lambda x: list_to_sentence(x))
data_plot['sentiment_score'] = [i[1] for i in data.iloc[:,0].tolist()]
data_plot['label'] = data.iloc[:,2]
data_plot.loc[data_plot.label == -1,'label'] = 0

In [10]:
# feature engineering
# from sklearn.feature_extraction.text import CountVectorizer
# count_vect = CountVectorizer()
# X_act_counts = count_vect.fit_transform(data_act.text)
# X_act_counts.shape

In [11]:
# X_plot_counts = count_vect.fit_transform(data_plot.text)
# X_plot_counts.shape

In [12]:
# from sklearn.feature_extraction.text import TfidfTransformer
# tfidf_transformer = TfidfTransformer()
# X_act_tfidf = tfidf_transformer.fit_transform(X_act_counts)
# X_act_tfidf.shape

In [13]:
# X_plot_tfidf = tfidf_transformer.fit_transform(X_plot_counts)
# X_plot_tfidf.shape

In [107]:
# feature_act = X_act_tfidf
# feature_plot = X_plot_tfidf
feature_act = pd.DataFrame(data_act['sentiment_score'])
feature_plot = pd.DataFrame(data_plot['sentiment_score'])

from imblearn.under_sampling import TomekLinks
from imblearn.over_sampling import SMOTE

# smote = SMOTE(k_neighbors=3)
# new_feature_act, new_label_act = smote.fit_sample(feature_act.values, np.array(data_act.label))
# new_feature_plot, new_label_plot = smote.fit_sample(feature_plot.values, np.array(data_plot.label))

tl = TomekLinks()
new_feature_act, new_label_act = tl.fit_sample(feature_act.values, np.array(data_act.label))
new_feature_plot, new_label_plot = tl.fit_sample(feature_plot.values, np.array(data_plot.label))

# new_feature_act, new_label_act = (feature_act.values, np.array(data_act.label))
# new_feature_plot, new_label_plot = (feature_plot.values, np.array(data_plot.label))
display(feature_plot.shape)

(64, 1)

In [15]:
# # learning
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.metrics import confusion_matrix
# clf1 = MultinomialNB().fit(feature_act, data_act.label.tolist())
# clf2 = MultinomialNB().fit(feature_plot, data_plot.label.tolist())

In [16]:
# naiv1 = clf1.predict(X_act_tfidf)
# naiv2 = clf2.predict(X_plot_tfidf)

In [17]:
# pred_svm1 = naiv1.predict(pd.DataFrame(feature_act))
# pred_svm2 = naiv2.predict(pd.DataFrame(feature_plot))

In [111]:
from sklearn import svm
svm1 = svm.SVC().fit(new_feature_act, new_label_act)
svm2 = svm.SVC().fit(new_feature_plot, new_label_plot)



In [112]:
pred_svm1 = svm1.predict(pd.DataFrame(new_feature_act))
pred_svm2 = svm2.predict(pd.DataFrame(new_feature_plot))

In [113]:
display(f1_score(pred_svm1, new_label_act),
        f1_score(pred_svm2, new_label_plot)
       )

0.967741935483871

0.8163265306122448

In [114]:
display(pred_svm1, pred_svm2)

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0,
       1, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1])

In [115]:
from sklearn.ensemble import RandomForestClassifier
rf1 = RandomForestClassifier(n_estimators=10)
rf1.fit(new_feature_act, new_label_act)
rf2 = RandomForestClassifier(n_estimators=10)
rf2.fit(new_feature_plot, new_label_plot)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [121]:
pred_rf1 = rf1.predict(new_feature_act)
pred_rf2 = rf2.predict(new_feature_plot)
cv10_pred_rf1 = cross_val_predict(rf1, new_feature_act, new_label_act, cv=10)
cv10_pred_rf2 = cross_val_predict(rf2, new_feature_plot, new_label_plot, cv=10)



In [122]:
rf1.score(new_feature_act, new_label_act)

0.9375

In [123]:
rf2.score(new_feature_plot, new_label_plot)

0.7301587301587301

In [125]:
display(f1_score(cv10_pred_rf1, new_label_act),
        f1_score(cv10_pred_rf2, new_label_plot)
       )

0.9508196721311476

0.7789473684210526

In [127]:
from xgboost import XGBClassifier

xgb1 = XGBClassifier(max_depth=5,objective='binary:logistic',learning_rate=0.001,min_child_weight=1,scale_pos_weight=1)
xgb2 = XGBClassifier(max_depth=5,objective='binary:logistic',learning_rate=0.001,min_child_weight=1,scale_pos_weight=1)

predict_xgb_1 = cross_val_predict(xgb1, new_feature_act, new_label_act, cv=10)
predict_xgb_2 = cross_val_predict(xgb2, new_feature_plot, new_label_plot, cv=10)

cv10_predict_xgb_1 = xgb1.fit(new_feature_act, new_label_act).predict(new_feature_act)
cv10_predict_xgb_2 = xgb2.fit(new_feature_plot, new_label_plot).predict(new_feature_plot)



In [128]:
display(f1_score(predict_xgb_1, new_label_act),
        f1_score(predict_xgb_2, new_label_plot),
        f1_score(cv10_predict_xgb_1, new_label_act),
        f1_score(cv10_predict_xgb_2, new_label_plot)
       )

0.967741935483871

0.7551020408163265

0.967741935483871

0.7912087912087912

In [134]:
def predict_sentiment(teks) :
    sentiment_score = preprocessing(teks)
    sentiment_score_act = pd.DataFrame([sentiment_score[0]], columns=["f0"])
    sentiment_score_plot = pd.DataFrame([sentiment_score[1]], columns=["f0"])
    act_sentiment = xgb1.predict(sentiment_score_act)
    plot_sentiment = xgb2.predict(sentiment_score_plot)
    
    return(act_sentiment[0], plot_sentiment[0])
    
    
act_sentiment, plot_sentiment = predict_sentiment('bad acting suck,bad,sad')
display(act_sentiment, plot_sentiment)
if (act_sentiment == 1) :
    print("acting => positive")
else :
    print("acting => negative")
if (plot_sentiment == 1) :
    print("plot => positive")
else :
    print("plot => negative")

'bad'

'acting'

'bad'

'sad'

1

1

acting => positive
plot => positive


In [136]:
display(data_act)

Unnamed: 0,sentiment_score,label
0,1.000,1
1,2.000,1
2,2.000,1
3,-1.264,1
4,0.000,1
5,1.000,1
6,-1.000,0
7,0.000,1
8,0.000,1
9,1.000,1


In [72]:
# from imblearn.over_sampling import SMOTE
# smote = SMOTE(k_neighbors=3)
# new_feature_act, new_label = smote.fit_sample(feature_act.values, np.array(data_act.label))
# new_feature_act, new_label = smote.fit_sample(feature_act.values, np.array(data_act.label))
# display(new_feature.shape)
# display(new_feature, new_label)

(120, 1)

In [None]:
print (time.time() - start)

In [None]:
from nltk.corpus import sentiwordnet as swn

a = list(swn.senti_synsets('not'))
pos = 0
neg = 0
pos=pos+a[0].pos_score()
neg=neg+a[0].neg_score()
display(neg)