In [110]:
import pandas

df_train = pandas.read_csv('./train.csv', quotechar="\'")
df_test = pandas.read_csv('./test.labeled.csv', quotechar="\'")

In [111]:
import re
import string
import demoji
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import CountVectorizer

demoji.download_codes()
def clean_tweets(data):
    new_data = data.copy()

    username_hash = r'[#@]\w+'
    punctuation = '[%s]+' % re.escape(string.punctuation)
    special_char = r'[^0-9a-zA-Z\s]+'
    number = r'[0-9]+'
    space = r'\s{2,}'
    space_begin_end = r'^\s+|\s+$'
    url = r'(https?|www):\/{1,}\w+\W+\w+\/{1,}\w+'
    char_ref = r'&\w+;'

    for i in range(len(new_data)):
        old_str = str(new_data[i])

        new_data[i] = re.sub(char_ref, ' ', old_str)
        new_data[i] = re.sub(username_hash, '', old_str)
        new_data[i] = re.sub(url, '', old_str)
        new_data[i] = re.sub(punctuation, '', old_str)
        new_data[i] = re.sub(number, '', old_str)
        new_data[i] = re.sub(space_begin_end, '', old_str)
        new_data[i] = re.sub(space, '', old_str)
        new_data[i] = demoji.replace(old_str, '')
        new_data[i] = re.sub(special_char, '', old_str)

    return new_data

def case_fold(data):
    new_data = data.copy()
    return list(map(lambda s: s.lower(), new_data))

def tokenize(data):
    new_data = data.copy()
    return list(map(lambda s: s.split(' '), new_data))

def stem(data):
    new_data = data.copy()
    stemmer = StemmerFactory().create_stemmer()

    return list(map(lambda s: stemmer.stem(s), new_data))

Downloading emoji data ...
... OK (Got response in 0.46 seconds)
Writing emoji data to /home/kodir/.demoji/codes.json ...
... OK


In [112]:
tweet_train = stem(case_fold(clean_tweets(df_train['tweet'])))
tweet_test = stem(case_fold(clean_tweets(df_test['tweet'])))

In [113]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf_vect = TfidfVectorizer(use_idf=True)
tweet_train_vect = tfidf_vect.fit_transform(tweet_train)
tweet_test_vect = tfidf_vect.transform(tweet_test)

In [114]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score

svm_c = SVC(kernel='linear')
svm_c.fit(tweet_train_vect, df_train['category'])

pred = svm_c.predict(tweet_test_vect)

pos = list(filter(lambda s: s == 'positif', pred))
neg = list(filter(lambda s: s == 'negatif', pred))

total = len(df_test['tweet'])
pos_percent = (len(pos) / total) * 100
neg_percent = (len(neg) / total) * 100
accuracy = accuracy_score(df_test['category'], pred) * 100

print(f"Positive Percentage : {pos_percent}")
print(f"Negative Percentage : {neg_percent}")
print(f"Classification Accuracy : {accuracy}")

Positive Percentage : 24.88479262672811
Negative Percentage : 75.11520737327189
Classification Accuracy : 80.4147465437788


In [115]:
# from sklearn.feature_extraction.text import CountVectorizer
# import matplotlib.pyplot as plt
# import numpy as np

# def plot_coefficients(classifier, feature_names, top_features=20):
#     coef = np.array(classifier.coef_).flatten()

#     top_positive_coefficients = np.argsort(coef)[-top_features:]
#     top_negative_coefficients = np.argsort(coef)[:top_features]
#     top_coefficients = np.hstack([top_negative_coefficients, top_positive_coefficients])

#     # create plot
#     plt.figure(figsize=(15, 5))
#     colors = ['red' if c < 0 else 'blue' for c in coef[top_coefficients]]
#     plt.bar(np.arange(2 * top_features), coef[top_coefficients], color=colors)
#     feature_names = np.array(feature_names)
#     plt.xticks(np.arange(1, 1 + 2 * top_features), feature_names[top_coefficients], rotation=60, ha='right')

#     plt.show()

# cv = CountVectorizer()
# cv.fit(tweet_test)

# plot_coefficients(svm_c, cv.get_feature_names())

In [116]:
# TODO : test our model here