In [1]:
import pandas as pd
import re
import numpy as np
import joblib
import pickle
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsRestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score

In [2]:
df = pd.read_csv('labeling.csv', sep=';')
# print(df)

In [3]:
#konversi label ke polaritas
def convert(polarity):
    if polarity == 'positif':
        return 1
    elif polarity == 'netral':
        return 0
    else:
        return -1


In [4]:
df['Polarity'] = df['label'].apply(convert)

In [5]:
X = df['STOP_REMOVAL']
y = df['Polarity']

In [7]:
#vectorizer = CountVectorizer()
#X = vectorizer.fit_transform(df['isi'])

bow_transformer = CountVectorizer()
print(df['STOP_REMOVAL'].shape)
X = bow_transformer.fit_transform(df['STOP_REMOVAL'])

# np.set_printoptions(threshold=np.inf)
# file = open('response.txt', 'w')
# file.write(str(np.array((X[0]))))
# file.close()

#word_list = bow_transformer.get_feature_names();    
#count_list = X.toarray().sum(axis=0)
# print('word_list = ', word_list)
# print('count_list = ', count_list)

#dict_count_word = dict(zip(word_list,count_list))
#sorted_dict_count_word = sorted(dict_count_word.items(), key=lambda kv: kv[1], reverse = True)[:5]
#print('sorted_dict_count_word = ', sorted_dict_count_word)

# print(X.toarray())
print('Shape of Sparse Matrix: ', X.shape)
print('Amount of Non-Zero occurrences: ', X.nnz)

# save the Count Vectorized to disk
filename1 = 'count_vectorized1.pkl'
pickle.dump(bow_transformer, open(filename1, 'wb'))

#TFID Transform
tf_transform = TfidfTransformer(use_idf=False).fit(X)
X = tf_transform.transform(X)
#print(X.shape)
#print(X)

# save the TFID to disk
filename1 = 'tfid_transform1.pkl'
pickle.dump(tf_transform, open(filename1, 'wb'))

(1545,)
Shape of Sparse Matrix:  (1545, 2542)
Amount of Non-Zero occurrences:  16393


In [8]:
density = (100.0 * X.nnz / (X.shape[0] * X.shape[1]))
print('Density: {}'.format((density)))


Density: 0.4174018877677032


In [9]:
#splitting data
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2)
# print(X_test)

In [10]:
#classifier data
nb = MultinomialNB()
nb.fit(X_train, y_train)

print(nb)

preds = nb.predict(X_test)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)


In [11]:
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

          -1       0.74      0.86      0.80       158
           0       0.79      0.70      0.74       140
           1       0.50      0.09      0.15        11

    accuracy                           0.76       309
   macro avg       0.68      0.55      0.56       309
weighted avg       0.76      0.76      0.75       309



In [12]:
from io import StringIO
classification = classification_report(y_test, preds)
s = StringIO(classification)
with open('classification.csv', 'w') as f:
    for line in s:
        f.write(line)

In [13]:
print(accuracy_score(y_test, preds))

0.7605177993527508


In [14]:
accuracy = accuracy_score(y_test, preds)
a = np.asarray([accuracy])
np.savetxt("accuracy.csv", a, delimiter=",", fmt='%s')

In [15]:
Final = df[['tanggal', 'user_name', 'author', 'isi', 'STOP_REMOVAL','label', 'Polarity']]

In [16]:
Final = Final.rename(columns={'tanggal':'Tanggal', 'user_name':'User', 'author':'Author', 'isi':'Isi', 'STOP_REMOVAL':'Stop_Removal', 'label':'Label'})

In [17]:
Final.to_csv('hasil_analysis.csv')

In [18]:
# Fit the model on training set
model = LogisticRegression()
model.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [33]:
# save the model to disk
filename = 'model_analisis.pkl'
pickle.dump(model, open(filename, 'wb'))

In [34]:
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, y_test)
print(result)

0.7119741100323624


In [35]:
loaded_model.predict(X_test)

array([-1,  1, -1, -1,  0, -1, -1,  0, -1, -1, -1, -1, -1,  0,  0,  0, -1,
       -1,  1,  0, -1, -1, -1, -1,  0,  0, -1, -1, -1, -1,  0,  0,  0,  0,
       -1, -1,  0,  0, -1, -1,  0, -1, -1,  0, -1, -1, -1, -1, -1, -1, -1,
       -1, -1,  1,  1,  1,  1,  0,  0, -1, -1, -1,  0,  0,  0, -1, -1, -1,
       -1,  0, -1, -1, -1, -1, -1,  0, -1, -1,  0, -1,  0, -1, -1, -1, -1,
        0, -1, -1,  0,  0, -1, -1, -1, -1, -1, -1,  0, -1,  0, -1, -1,  0,
       -1, -1, -1, -1,  0, -1,  0, -1, -1, -1, -1, -1, -1,  0,  0,  0, -1,
        0, -1, -1, -1, -1, -1,  0, -1,  0,  0, -1, -1,  0, -1,  0,  0, -1,
        0,  0, -1,  0,  0,  0,  0,  0, -1, -1,  0, -1, -1, -1,  0,  0, -1,
       -1,  0, -1, -1,  0, -1, -1,  0,  0, -1,  0, -1, -1,  0, -1, -1, -1,
        0,  0,  0, -1,  0,  0,  0,  0,  0, -1,  0, -1, -1, -1, -1, -1, -1,
       -1, -1, -1,  1,  0, -1, -1,  0, -1, -1, -1, -1, -1, -1,  0,  0, -1,
       -1, -1, -1, -1,  0, -1,  0, -1, -1,  1,  0,  0,  0, -1,  0,  0, -1,
        0, -1, -1, -1,  0