In [306]:
import pandas

df_train = pandas.read_csv('./train.csv', quotechar="\'")
df_test = pandas.read_csv('./test.labeled.csv', quotechar="\'")

In [307]:
import re
import string
import demoji
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import CountVectorizer

demoji.download_codes()

def clean_tweets(data):
    new_data = data.copy()

    username_hash = r'[#@]\w+'
    punctuation = '[%s]+' % re.escape(string.punctuation)
    special_char = r'[^0-9a-zA-Z\s]+'
    number = r'[0-9]+'
    space = r'\s{2,}'
    space_begin_end = r'^\s+|\s+$'
    url = r'https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{1,256}\.[a-z]{2,4}\b([-a-zA-Z0-9@:%_\+.~#?&//=]*)'
    char_ref = r'&\w+;'

    for i in range(len(new_data)):
        old_str = str(new_data[i])

        old_str = re.sub(username_hash, '', old_str)
        old_str = re.sub(url, '', old_str)
        old_str = re.sub(char_ref, ' ', old_str)
        old_str = re.sub(punctuation, '', old_str)
        old_str = re.sub(number, '', old_str)
        old_str = re.sub(space_begin_end, '', old_str)
        old_str = re.sub(space, '', old_str)
        old_str = demoji.replace(old_str, '')
        old_str = re.sub(special_char, '', old_str)

        new_data[i] = old_str

    return new_data

def case_fold(data):
    new_data = data.copy()
    return list(map(lambda s: s.lower(), new_data))

def tokenize(data):
    new_data = data.copy()
    return list(map(lambda s: s.split(' '), new_data))

def stem(data):
    new_data = data.copy()
    stemmer = StemmerFactory().create_stemmer()

    return list(map(lambda s: stemmer.stem(s), new_data))

Downloading emoji data ...
... OK (Got response in 0.43 seconds)
Writing emoji data to /Users/bangkodir/.demoji/codes.json ...
... OK


In [308]:
cleaned_train = clean_tweets(df_train['tweet'])
case_folded_train = case_fold(cleaned_train)
tweet_train = stem(case_folded_train)

cleaned_test = clean_tweets(df_test['tweet'])
case_folded_test = case_fold(cleaned_test)
tweet_test = stem(case_folded_test)

In [309]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(use_idf=True)
tweet_train_vect = tfidf_vect.fit_transform(tweet_train)
tweet_test_vect = tfidf_vect.transform(tweet_test)

In [310]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

svm_c = SVC(kernel='linear')
svm_c.fit(tweet_train_vect, df_train['category'])

pred = svm_c.predict(tweet_test_vect)

pos = []
neg = []
total_raw_tweets = len(tweet_test)

for i in range(total_raw_tweets):
    tweet = tweet_test[i]
    if pred[i] == 'positif':
        pos.append(tweet)
    elif pred[i] == 'negatif':
        neg.append(tweet)

pos_percent = (len(pos) / total_raw_tweets) * 100
neg_percent = (len(neg) / total_raw_tweets) * 100
accuracy = accuracy_score(df_test['category'], pred) * 100

print(f"Positive Percentage : {pos_percent}")
print(f"Negative Percentage : {neg_percent}")
print(f"Classification Accuracy : {accuracy}")

Positive Percentage : 25.0
Negative Percentage : 75.0
Classification Accuracy : 80.06912442396313


In [312]:
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
import numpy as np

def plot_coefficients(classifier, feature_names, top_features=20):
    coef = np.array(classifier.coef_).flatten()

    top_positive_coefficients = np.argsort(coef)[-top_features:]
    top_negative_coefficients = np.argsort(coef)[:top_features]
    top_coefficients = np.hstack([top_negative_coefficients, top_positive_coefficients])

    # create plot
    plt.figure(figsize=(15, 5))

    colors = []
    for c in coef[top_coefficients]:
        if c < 0:
            colors.append('red')
        else:
            colors.append('blue')

    plt.bar(np.arange(2 * top_features), coef[top_coefficients], color=colors)
    feature_names = np.array(feature_names)
    plt.xticks(np.arange(1, 1 + 2 * top_features), feature_names[top_coefficients], rotation=60, ha='right')

    plt.show()

cv = CountVectorizer()
cv.fit(tweet_test)

plot_coefficients(svm_c, cv.get_feature_names())

ValueError: WRITEBACKIFCOPY base is read-only

In [311]:
# from sklearn.feature_extraction.text import CountVectorizer

# def get_features(data):
#     cv = CountVectorizer()
#     cv.fit(data)

#     return cv.get_feature_names()

# pos_features = get_features(pos)
# neg_features = get_features(neg)

In [305]:
# TODO : test our model here