In [28]:
# -*- coding: utf-8 -*-
"""
Created on Wed Nov  8 19:58:25 2017
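Purpose: Given a set of labelled texts (negative examples in data/neg.txt, positive
examples in data/pos.txt), use 3 Supervised Learning algorithms (Naive Bayes,
Decision Tree, Random Forest) to predict the sentiment of each text.
Compare the accuracy of the three algorithms.
NOTE: this header originally described tweets directed at US airlines, but the cells
below stem the text with Sastrawi (an Indonesian stemmer) -- confirm the data source.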
@author: Simone
"""
#------------------------------------------------------------------------------Import the dataset and the libraries

# Import the libraries
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [29]:
# Import the Sastrawi stemmer factory.
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Build one stemmer instance; the corpus-loading cell below reuses `stemmer`.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Smoke test: stem a sample Indonesian sentence.
sentence = 'Perekonomian Indonesia sedang dalam pertumbuhan yang membanggakan'
output   = stemmer.stem(sentence)

print(output)
# expected output: ekonomi indonesia sedang dalam tumbuh yang bangga

# Second smoke test: a reduplicated, affixed word.
print(stemmer.stem('Mereka meniru-nirukannya'))
# expected output: mereka tiru

ekonomi indonesia sedang dalam tumbuh yang bangga
mereka tiru


In [30]:
# Build the corpus: every line of the negative file followed by every line of
# the positive file, each one passed through the Sastrawi stemmer.
# The order matters: the label cell further down assigns class 0 to the first
# half of the corpus and class 1 to the second half.
# NOTE(review): files are opened with the platform default encoding, as before;
# pass an explicit encoding= if the data is known to be UTF-8.
corpus = []
for path in ('data/neg.txt', 'data/pos.txt'):
    # `with` guarantees the handle is closed, even if the (slow) stemming
    # loop is interrupted part-way through.
    with open(path) as source:
        for line in source:
            corpus.append(stemmer.stem(line))

KeyboardInterrupt: 

In [None]:
# Number of stemmed lines collected from both input files.
len(corpus)

In [None]:
# Peek at the first ten stemmed entries (data/neg.txt is read first).
print(corpus[:10])

In [None]:
# Peek at the last ten stemmed entries (data/pos.txt is read second).
print(corpus[-10:])

In [None]:
# Bag of Words for the independent variable: one row per corpus entry,
# one column per vocabulary term, each cell holding a term count.
from sklearn.feature_extraction.text import CountVectorizer

cv = CountVectorizer()  # author's note: initially 15000+ returned (presumably the vocabulary size)

# Learn the vocabulary and count terms in one pass (sparse result) ...
bag_of_words_sparse = cv.fit_transform(corpus)
# ... then densify, since the matrix is handed to the classifiers as a plain array.
X = bag_of_words_sparse.toarray()

In [None]:
# (number of corpus entries, number of vocabulary terms) of the dense count matrix.
print(X.shape)

In [None]:
# First ten rows of the count matrix.
X[:10]

In [None]:
# Labels in corpus order: 1000 entries of class 0 followed by 1000 of class 1.
# The corpus is built from data/neg.txt first and data/pos.txt second, so
# 0 = negative and 1 = positive.
# NOTE(review): assumes each file contributes exactly 1000 lines -- confirm.
y = [0] * 1000 + [1] * 1000

In [None]:
# Show a compact summary (count, first five, last five) instead of dumping
# every label into the cell output.
print(len(y), y[:5], y[-5:])

In [None]:
# Encode the labels; this also turns the plain Python list into a NumPy array.
from sklearn.preprocessing import LabelEncoder

labelencoder_y = LabelEncoder()
labelencoder_y.fit(y)              # learn the set of distinct labels
y = labelencoder_y.transform(y)    # map each label to its encoded integer

In [None]:
# y is a NumPy array after label encoding, so its shape can be inspected.
y.shape

In [None]:
#------------------------------------------------------------------------------Model Training and Testing

# Split the dataset into the Training set (80%) and Test set (20%).
# sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed in
# 0.20; sklearn.model_selection is its replacement and exposes the same function.
from sklearn.model_selection import train_test_split

# random_state is fixed so the same rows land in train/test on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 123)

In [None]:
# Train a Gaussian Naive Bayes model on the training split.
from sklearn.naive_bayes import GaussianNB

bayes_classifier = GaussianNB()
bayes_classifier.fit(X=X_train, y=y_train)

In [None]:
# Fitting the Decision Tree model to the Training set.
from sklearn.tree import DecisionTreeClassifier

# random_state is pinned (same seed as the train/test split) so that re-running
# the notebook yields the same tree and therefore the same scores.
dec_tree_classifier = DecisionTreeClassifier(random_state = 123)
dec_tree_classifier.fit(X_train, y_train)

In [None]:
# Fitting the Random Forest model to the Training set.
from sklearn.ensemble import RandomForestClassifier

# random_state is pinned (same seed as the train/test split): the forest is
# built with randomness, so without a seed every run reports different scores.
ranforest_classifier = RandomForestClassifier(random_state = 123)
ranforest_classifier.fit(X_train, y_train)

In [None]:
# Run every fitted classifier over the held-out test features.
y_pred_bayes, y_pred_dec_tree, y_pred_ranforest = [
    fitted_model.predict(X_test)
    for fitted_model in (bayes_classifier, dec_tree_classifier, ranforest_classifier)
]

In [None]:
# Confusion matrix (true labels vs. predictions) for each model.
from sklearn.metrics import confusion_matrix

bayes_cm, dec_tree_cm, ranforest_cm = [
    confusion_matrix(y_test, predicted)
    for predicted in (y_pred_bayes, y_pred_dec_tree, y_pred_ranforest)
]

In [None]:
#------------------------------------------------------------------------------Evaluation:

# Per-class accuracy computed by hand from each confusion matrix.
# Matrices are indexed cm[true class][predicted class]; class 0 is negative and
# class 1 is positive. The figure reported for a class is the share of its true
# samples that the model labelled correctly (i.e. per-class recall), in percent.

def _class_accuracy_pct(cm, label):
    """Return the percentage of true-`label` samples predicted as `label`.

    cm    -- 2x2 confusion matrix indexed as cm[true][predicted]
    label -- class index, 0 (negative) or 1 (positive)
    """
    return cm[label][label]/(cm[label][0]+cm[label][1])*100

bayes_negative_accuracy = _class_accuracy_pct(bayes_cm, 0)
print("Bayes Negative Accuracy: " + str(bayes_negative_accuracy))

bayes_positive_accuracy = _class_accuracy_pct(bayes_cm, 1)
print("Bayes Positive Accuracy: " + str(bayes_positive_accuracy))

dec_tree_negative_accuracy = _class_accuracy_pct(dec_tree_cm, 0)
print("Decision Tree Negative Accuracy: " + str(dec_tree_negative_accuracy))

dec_tree_positive_accuracy = _class_accuracy_pct(dec_tree_cm, 1)
print("Decision Tree Positive Accuracy: " + str(dec_tree_positive_accuracy))

ranforest_negative_accuracy = _class_accuracy_pct(ranforest_cm, 0)
print("Random Forest Negative Accuracy: " + str(ranforest_negative_accuracy))

# Fixed: the numerator previously came from bayes_cm (copy-paste slip), which
# mixed Naive Bayes hits with Random Forest totals.
ranforest_positive_accuracy = _class_accuracy_pct(ranforest_cm, 1)
print("Random Forest Positive Accuracy: " + str(ranforest_positive_accuracy))

In [None]:
# Eyeball the first 37 predictions of each model against the true labels (last row).
for label_vector in (y_pred_bayes, y_pred_dec_tree, y_pred_ranforest, y_test):
    print(label_vector[:37])

In [None]:
# Classification report for each model, printed in the order
# Naive Bayes, Decision Tree, Random Forest.

from sklearn.metrics import classification_report

bayes_report, dec_tree_report, ranforest_report = [
    classification_report(y_test, predicted)
    for predicted in (y_pred_bayes, y_pred_dec_tree, y_pred_ranforest)
]

for model_report in (bayes_report, dec_tree_report, ranforest_report):
    print(model_report)

In [None]:
# Score of the Decision Tree on the held-out test split.
dec_tree = dec_tree_classifier.score(X=X_test, y=y_test)
print(dec_tree)

In [None]:
# Score of the Random Forest on the held-out test split.
ranforest = ranforest_classifier.score(X=X_test, y=y_test)
print(ranforest)

In [None]:
# import pickle
# # save the model to disk
# pickle.dump(dec_tree_classifier, open('dec_tree.sav', 'wb'))
# pickle.dump(ranforest_classifier, open('ranforest.sav', 'wb'))

In [None]:
# # load the model from disk
# loaded_model = pickle.load(open('ranforest.sav', 'rb'))
# result = loaded_model.score(X_test, y_test)
# print(result)