# import library

In [53]:
import nltk
import numpy as np
import pandas as pd
from nltk import tokenize
from nltk.stem import PorterStemmer 
from nltk.tag import StanfordPOSTagger
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import f1_score

# module

In [2]:
def search_adj(sentence, keyword, skip_before_keyword=False, skip_after_keyword=False, prev_bag_of_word=5, after_bag_of_word=5) :    
    st = StanfordPOSTagger('./english-bidirectional-distsim.tagger', './stanford-postagger-3.9.2.jar')
    
#     stopword elemination
#     stop_words = set(stopwords.words('english')) 
#     stop_words.update(['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}']) # remove it if you need punctuation 
    punctuation = ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'] # remove it if you need punctuation 
    
    word_tokens = word_tokenize(sentence) 
#     filtered_sentence = [w for w in word_tokens if not w in stop_words] 
    filtered_sentence = [w for w in word_tokens if not w in punctuation] 
    filtered_sentence = [] 
  
    for w in word_tokens: 
#         if w not in stop_words:
        if w not in punctuation:
            filtered_sentence.append(w)
            
    list_word_tag = st.tag([(" ").join(filtered_sentence)])
    found_keyword = not skip_before_keyword
    list_sentiment_word = []
    found_adjective = 0
    keyword_idx = -1
    for idx in range (len(list_word_tag)) :
        if (list_word_tag[idx][0] == keyword) :
            keyword_idx = idx
            break
    
    if (skip_before_keyword) :
        prev_bag_of_word = 0
    
    if (skip_after_keyword) :
        after_bag_of_word = 0
        
    for idx in range (len(list_word_tag)) :
        if (idx > keyword_idx + after_bag_of_word) :
                break
                
        if ((idx >= keyword_idx - prev_bag_of_word) and list_word_tag[idx][1] == 'JJ') :
            found_adjective += 1
            sentiment_word = list_word_tag[idx][0]
            for adverb_idx in range (idx-1, -1, -1) :
                if (list_word_tag[adverb_idx][1] == 'RB') :
                    sentiment_word = list_word_tag[adverb_idx][0] + " " + sentiment_word
                else :
                    break
            list_sentiment_word.append(sentiment_word)
            
    return (list_sentiment_word, found_adjective)

In [3]:
def preprocessing(document, skip_before_keyword=False, skip_after_keyword=False, prev_bag_of_word=5, after_bag_of_word=5) :
    act = ['acting','role playing','act',' actress','actor','role','portray','character','villain','performance', 'play', 'perform', 'doing']
    plot = ['plot','story','storyline','tale','romance','dialog','script','storyteller',' ending','storytelling','revenge','betrayal','writing']
    
    act_sentence = []
    plot_sentence = []
    
    tokenizer = tokenize.PunktSentenceTokenizer()
    ps = PorterStemmer()
    
    list_sentence = tokenizer.sentences_from_text(document)
    for sentence in list_sentence :
        is_sentence_act = False
        is_sentence_plot = False
        list_word = tokenize.word_tokenize(sentence)
        for word in list_word :
            if (ps.stem(word.lower()) in act):
                list_adj_sentence, adj_found = search_adj(sentence, word, prev_bag_of_word=prev_bag_of_word, after_bag_of_word=after_bag_of_word)
                for adj_word in (list_adj_sentence) :
                    act_sentence.append(adj_word)
            if (ps.stem(word.lower()) in plot):
                list_adj_sentence, adj_found = search_adj(sentence, word, prev_bag_of_word=prev_bag_of_word, after_bag_of_word=after_bag_of_word)
                for adj_word in (list_adj_sentence) :
                    plot_sentence.append(adj_word)
    return ([act_sentence,plot_sentence])

In [4]:
def list_to_sentence(list_of_word) :
    return((" ").join(list_of_word))

# CONTOH PENGGUNAAN

In [5]:
import time
start = time.time()
#your code here    

In [6]:
# input data
data = pd.read_excel("../input/datasetnlp_nographic_no_neutral.xlsx")
data = data
data.drop(columns=['title'], inplace=True)
for column_name in ['acting', 'plot'] :
    data.loc[data[column_name] == 'positive', column_name] = 1
    data.loc[data[column_name] == 'negative', column_name] = -1

In [7]:
# preprocessing
tokenizer = tokenize.PunktSentenceTokenizer()
data.text = data.text.apply(lambda x: preprocessing(x, prev_bag_of_word=1, after_bag_of_word=10))

In [8]:
data_act = pd.DataFrame([])
data_act['text'] = [i[0] for i in data.iloc[:,0].tolist()]
data_act.text = data_act.text.apply(lambda x: list_to_sentence(x))
data_act['label'] = data.iloc[:,1]

In [9]:
data_plot = pd.DataFrame([])
data_plot['text'] = [i[1] for i in data.iloc[:,0].tolist()]
data_plot.text = data_plot.text.apply(lambda x: list_to_sentence(x))
data_plot['label'] = data.iloc[:,2]

In [10]:
# feature engineering
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_act_counts = count_vect.fit_transform(data_act.text)
X_act_counts.shape

(64, 121)

In [11]:
X_plot_counts = count_vect.fit_transform(data_plot.text)
X_plot_counts.shape

(64, 45)

In [12]:
from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_act_tfidf = tfidf_transformer.fit_transform(X_act_counts)
X_act_tfidf.shape

(64, 121)

In [13]:
X_plot_tfidf = tfidf_transformer.fit_transform(X_plot_counts)
X_plot_tfidf.shape

(64, 45)

In [14]:
# # learning
# from sklearn.naive_bayes import MultinomialNB
# from sklearn.metrics import confusion_matrix
# clf1 = MultinomialNB().fit(X_act_tfidf, data_act.label.tolist())
# clf2 = MultinomialNB().fit(X_plot_tfidf, data_plot.label.tolist())
# clf3 = MultinomialNB().fit(X_graphic_tfidf, data_graphic.label.tolist())

In [15]:
# pred1 = clf1.predict(X_act_tfidf)
# pred2 = clf2.predict(X_plot_tfidf)
# pred3 = clf3.predict(X_graphic_tfidf)

In [16]:
# # scoring
# from sklearn.metrics import adjusted_mutual_info_score
# display(adjusted_mutual_info_score(pred1, data_act.label))
# display(adjusted_mutual_info_score(pred2, data_plot.label))
# display(adjusted_mutual_info_score(pred3, data_graphic.label))

In [17]:
from sklearn import svm
svm1 = svm.SVC().fit(X_act_tfidf, data_act.label.tolist())
svm2 = svm.SVC().fit(X_plot_tfidf, data_plot.label.tolist())



In [18]:
pred_svm1 = svm1.predict(X_act_tfidf)
pred_svm2 = svm2.predict(X_plot_tfidf)

In [19]:
from sklearn.metrics import adjusted_mutual_info_score
display(adjusted_mutual_info_score(pred_svm1, data_act.label))
display(adjusted_mutual_info_score(pred_svm2, data_plot.label))



0.0

-4.3132773826476156e-16

In [48]:
from sklearn.ensemble import RandomForestClassifier
rf1 = RandomForestClassifier(n_estimators=10)
rf1.fit(X_act_tfidf, data_act.label.tolist())
rf2 = RandomForestClassifier(n_estimators=10)
rf2.fit(X_plot_tfidf, data_plot.label.tolist())

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [56]:
pred_rf1 = rf1.predict(X_act_tfidf)
pred_rf2 = rf2.predict(X_plot_tfidf)
cv10_pred_rf1 = cross_val_predict(rf1, X_act_tfidf, data_act.label.tolist(), cv=10)
cv10_pred_rf2 = cross_val_predict(rf2, X_plot_tfidf, data_plot.label.tolist(), cv=10)



In [50]:
rf1.score(X_act_tfidf, data_act.label.tolist())

0.984375

In [60]:
rf2.score(X_plot_tfidf, data_plot.label.tolist())

0.8125

In [62]:
display(f1_score(cv10_pred_rf1, data_act.label.tolist()),
        f1_score(cv10_pred_rf2, data_plot.label.tolist())
       )

0.9586776859504132

0.7961165048543689

In [67]:

from xgboost import XGBClassifier

xgb1 = XGBClassifier(max_depth=5,objective='binary:logistic',learning_rate=0.001,min_child_weight=1,scale_pos_weight=1)
xgb2 = XGBClassifier(max_depth=5,objective='binary:logistic',learning_rate=0.001,min_child_weight=1,scale_pos_weight=1)

predict_xgb_1 = cross_val_predict(xgb1, X_act_tfidf, data_act.label.tolist(), cv=10)
predict_xgb_2 = cross_val_predict(xgb2, X_plot_tfidf, data_plot.label.tolist(), cv=10)

cv10_predict_xgb_1 = xgb1.fit(X_act_tfidf, data_act.label.tolist()).predict(X_act_tfidf)
cv10_predict_xgb_2 = xgb2.fit(X_plot_tfidf, data_plot.label.tolist()).predict(X_plot_tfidf)



In [68]:
display(f1_score(predict_xgb_1, data_act.label.tolist()),
        f1_score(predict_xgb_2, data_plot.label.tolist()),
        f1_score(cv10_predict_xgb_1, data_act.label.tolist()),
        f1_score(cv10_predict_xgb_2, data_plot.label.tolist())
       )

0.967741935483871

0.7924528301886793

0.9666666666666667

0.7924528301886793

In [37]:
print (time.time() - start)

576.2254271507263
