# import library

In [5]:
import nltk
import numpy as np
import pandas as pd
from nltk import tokenize
from nltk.stem import PorterStemmer 
from nltk.tag import StanfordPOSTagger
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 

# module

In [6]:
def search_adj(sentence, keyword, skip_before_keyword=False, skip_after_keyword=False, prev_bag_of_word=5, after_bag_of_word=5) :    
    st = StanfordPOSTagger('./english-bidirectional-distsim.tagger', './stanford-postagger-3.9.2.jar')
    
#     stopword elemination
#     stop_words = set(stopwords.words('english')) 
#     stop_words.update(['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}']) # remove it if you need punctuation 
    punctuation = ['.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'] # remove it if you need punctuation 
    
    word_tokens = word_tokenize(sentence) 
#     filtered_sentence = [w for w in word_tokens if not w in stop_words] 
    filtered_sentence = [w for w in word_tokens if not w in punctuation] 
    filtered_sentence = [] 
  
    for w in word_tokens: 
#         if w not in stop_words:
        if w not in punctuation:
            filtered_sentence.append(w)
            
    list_word_tag = st.tag([(" ").join(filtered_sentence)])
    found_keyword = not skip_before_keyword
    list_sentiment_word = []
    found_adjective = 0
    keyword_idx = -1
    for idx in range (len(list_word_tag)) :
        if (list_word_tag[idx][0] == keyword) :
            keyword_idx = idx
            break
    
    if (skip_before_keyword) :
        prev_bag_of_word = 0
    
    if (skip_after_keyword) :
        after_bag_of_word = 0
        
    for idx in range (len(list_word_tag)) :
        if (idx > keyword_idx + after_bag_of_word) :
                break
                
        if ((idx >= keyword_idx - prev_bag_of_word) and list_word_tag[idx][1] == 'JJ') :
            found_adjective += 1
            sentiment_word = list_word_tag[idx][0]
            for adverb_idx in range (idx-1, -1, -1) :
                if (list_word_tag[adverb_idx][1] == 'RB') :
                    sentiment_word = list_word_tag[adverb_idx][0] + " " + sentiment_word
                else :
                    break
            list_sentiment_word.append(sentiment_word)
            
    return (list_sentiment_word, found_adjective)



def preprocessing(document, skip_before_keyword=False, skip_after_keyword=False, prev_bag_of_word=5, after_bag_of_word=5) :
    act = ['acting','role playing','act',' actress','actor','role','portray','character','villain','performance', 'play', 'perform', 'doing']
    plot = ['plot','story','storyline','tale','romance','dialog','script','storyteller',' ending','storytelling','revenge','betrayal','writing']
    graphic = ['movie',' film',' picture',' moving picture','',' motion picture',' show',' picture show',' pic',' flick',' romantic comedy', 'graphic', 'effect', 'cinematography', 'cinematographi']
    
    act_sentence = []
    plot_sentence = []
    graphic_sentence = []
    
    tokenizer = tokenize.PunktSentenceTokenizer()
    ps = PorterStemmer()
    
    list_sentence = tokenizer.sentences_from_text(document)
    for sentence in list_sentence :
        is_sentence_act = False
        is_sentence_plot = False
        is_sentence_graphic = False
        list_word = tokenize.word_tokenize(sentence)
        for word in list_word :
            if (ps.stem(word.lower()) in act):
                list_adj_sentence, adj_found = search_adj(sentence, word, prev_bag_of_word=prev_bag_of_word, after_bag_of_word=after_bag_of_word)
                for adj_word in (list_adj_sentence) :
                    act_sentence.append(adj_word)
            if (ps.stem(word.lower()) in plot):
                list_adj_sentence, adj_found = search_adj(sentence, word, prev_bag_of_word=prev_bag_of_word, after_bag_of_word=after_bag_of_word)
                for adj_word in (list_adj_sentence) :
                    plot_sentence.append(adj_word)
            if (ps.stem(word.lower()) in graphic):
                list_adj_sentence, adj_found = search_adj(sentence, word, prev_bag_of_word=prev_bag_of_word, after_bag_of_word=after_bag_of_word)
                for adj_word in (list_adj_sentence) :
                    graphic_sentence.append(adj_word)
    return ([act_sentence,plot_sentence, graphic_sentence])

def list_to_sentence(list_of_word) :
    return((" ").join(list_of_word))

# CONTOH PENGGUNAAN

In [16]:
import time
start = time.time()
#your code here    

# input data
data = pd.read_excel("../input/datasetnlp.xlsx")
data = data
data.drop(columns=['title'], inplace=True)
for column_name in ['acting', 'plot', 'graphic'] :
    data.loc[data[column_name] == 'positive', column_name] = 1
    data.loc[data[column_name] == 'neutral', column_name] = 0
    data.loc[data[column_name] == 'negative', column_name] = -1


# preprocessing
tokenizer = tokenize.PunktSentenceTokenizer()
data.text = data.text.apply(lambda x: preprocessing(x, prev_bag_of_word=1, after_bag_of_word=10))

data_act = pd.DataFrame([])
data_act['text'] = [i[0] for i in data.iloc[:,0].tolist()]
data_act.text = data_act.text.apply(lambda x: list_to_sentence(x))
data_act['label'] = data.iloc[:,1]

data_plot = pd.DataFrame([])
data_plot['text'] = [i[1] for i in data.iloc[:,0].tolist()]
data_plot.text = data_plot.text.apply(lambda x: list_to_sentence(x))
data_plot['label'] = data.iloc[:,2]

data_graphic = pd.DataFrame([])
data_graphic['text'] = [i[2] for i in data.iloc[:,0].tolist()]
data_graphic.text = data_graphic.text.apply(lambda x: list_to_sentence(x))
data_graphic['label'] = data.iloc[:,3]


# feature engineering
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_act_counts = count_vect.fit_transform(data_act.text)
X_act_counts.shape

X_plot_counts = count_vect.fit_transform(data_plot.text)
X_plot_counts.shape

X_graphic_counts = count_vect.fit_transform(data_graphic.text)
X_graphic_counts.shape

from sklearn.feature_extraction.text import TfidfTransformer
tfidf_transformer = TfidfTransformer()
X_act_tfidf = tfidf_transformer.fit_transform(X_act_counts)
X_act_tfidf.shape

X_plot_tfidf = tfidf_transformer.fit_transform(X_plot_counts)
X_plot_tfidf.shape

X_graphic_tfidf = tfidf_transformer.fit_transform(X_graphic_counts)
X_graphic_tfidf.shape

# learning
from sklearn.naive_bayes import MultinomialNB
clf1 = MultinomialNB().fit(X_act_tfidf, data_act.label.tolist())
clf2 = MultinomialNB().fit(X_plot_tfidf, data_plot.label.tolist())
clf3 = MultinomialNB().fit(X_graphic_tfidf, data_graphic.label.tolist())

pred1 = clf1.predict(X_act_tfidf)
pred2 = clf2.predict(X_plot_tfidf)
pred3 = clf3.predict(X_graphic_tfidf)

# scoring
from sklearn.metrics import adjusted_mutual_info_score
display(adjusted_mutual_info_score(pred1, data_act.label))
display(adjusted_mutual_info_score(pred2, data_plot.label))
display(adjusted_mutual_info_score(pred3, data_graphic.label))

print (time.time() - start)




array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1])

array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1])

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0])

0      1
1      1
2      1
3      1
4      1
5      1
6     -1
7      1
8      1
9      1
10     0
11     1
12     1
13     1
14     0
15     1
16     1
17     1
18    -1
19     1
20     1
21     1
22     0
23     1
24     1
25     1
26     1
27     1
28     1
29     1
30     1
31     0
32     1
33     1
34     0
35     0
36     1
37     0
38     1
39     1
40     0
41     0
42     1
43     1
44     1
45     0
46     1
47     1
48     1
49    -1
Name: label, dtype: object

0     -1
1      1
2      1
3      1
4      1
5      1
6      1
7     -1
8      1
9      1
10     0
11     0
12    -1
13     1
14     0
15     1
16     0
17     1
18     1
19     1
20     0
21    -1
22     0
23     1
24    -1
25     1
26     1
27     1
28     0
29    -1
30     0
31     0
32    -1
33     0
34     0
35    -1
36    -1
37     1
38    -1
39     1
40     1
41     1
42     1
43     1
44     0
45    -1
46     0
47     0
48     1
49     0
Name: label, dtype: object

0     0
1     0
2     1
3     0
4     1
5     0
6     0
7     0
8     0
9     0
10    0
11    0
12    1
13    1
14    0
15    1
16    0
17    0
18    0
19    0
20    0
21    0
22    0
23    0
24    0
25    0
26    0
27    0
28    0
29    0
30    1
31    0
32    0
33    0
34    0
35    0
36    0
37    0
38    0
39    0
40    1
41    0
42    0
43    0
44    0
45    0
46    0
47    0
48    0
49    0
Name: label, dtype: object

316.94719076156616
