In [71]:
import pandas as pd
from collections import Counter, deque
import re
import numpy as np
import random

from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler, scale, MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split

MAX_LEN = 12
VALID_LEN_POS = 750
VALID_LEN_NEU = 2270
LIST_NAME = ['positive', 'negetive', 'neutral']

df = pd.read_csv("all-data.csv", encoding="latin1")
df.head()

Unnamed: 0,target,News
0,neutral,"According to Gran , the company has no plans t..."
1,neutral,Technopolis plans to develop in stages an area...
2,negative,The international electronic industry company ...
3,positive,With the new production plant the company woul...
4,positive,According to the company 's updated strategy f...


In [72]:
# target updating
def classify(data):
    if data=="positive":
        return 1
    elif data=="negative":
        return 2
    elif data=="neutral":
        return 3
    

df["target"] = list(map(classify, df["target"]))
# we must sort to can balace data
df.sort_values('target', inplace=True)
df.head()

Unnamed: 0,target,News
775,1,Our standardised services have met with a posi...
933,1,21 December 2010 - Finnish industrial machiner...
932,1,`` We 've been feeling quite positive about th...
931,1,`` We have tailored our solutions to meet Sole...
930,1,`` We are pleased to deliver the Basware Invoi...


In [63]:
Counter(df['target'])

Counter({1: 1363, 2: 604, 3: 2879})

In [64]:
# data must be balanced
df = df[VALID_LEN_POS:]
df = df[:-VALID_LEN_NEU]
Counter(df['target'])

Counter({1: 613, 2: 604, 3: 609})

In [65]:
lst = []
for t in df['News'].values:
    texts = t.lower()
    texts = re.sub("[%;,.]+", repl="", string=texts)
    #texts = "<start> "  + texts + "<end>"
    lst.append(texts)


strings_all = ''.join(i for i in lst)
strings_all = re.sub(' +', repl=' ', string=strings_all)
strings_all = Counter(strings_all.split(' '))

string_each = []
for l in lst:
    texts = re.sub(' +', repl=' ', string=l)
    each = texts.split(' ')
    string_each.append(Counter(each[:-1]))
    

# Tokenization with the formula available in the README.md file 
def Tokenizer(each, group):
    value = []
    for count in each:
        val = []
        for k in count:
            if group[k]!=0:
                point_each = count[k] / len(count)
                point_all = np.log(len(each) / group[k])
                val.append(point_each*point_all)
                
        value.append(val)
        
    datasets = []
    for target, features in zip(df['target'], value):
        features = features[:MAX_LEN]
        if len(features)<MAX_LEN:
            mean = np.mean(features)
            length = MAX_LEN - len(features)
            for i in range(length):
                features.append(mean)
            datasets.append([target, features])
    
    # exiting the sort mode 
    random.shuffle(datasets)
    
    return datasets
        

dataset = Tokenizer(string_each, strings_all)
dataset[0]

[2,
 [0.2801152823228134,
  0.052549498064015855,
  1.0728404373078446,
  0.6767563341307332,
  0.4638861691590854,
  0.27736265105625957,
  0.6599301861798212,
  0.49763436546008194,
  0.49763436546008194,
  0.49763436546008194,
  0.49763436546008194,
  0.49763436546008194]]

In [66]:
X = []
y = []
for label, feature in dataset:
    X.append(feature)
    y.append(label)
    

X = np.array(X, dtype=np.float64)
y = np.array(y)

# Scaling data 
scaler = StandardScaler()
X = scaler.fit_transform(X)

#X = scale(X)

x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

model = SVC(kernel="rbf")
model.fit(x_train, y_train)

In [67]:
model.score(x_test, y_test)

0.7

In [68]:
# show some sample predict
for real, pred in zip(y_test, model.predict(x_test)):
    print(f"real is {LIST_NAME[real-1]}, prediction is {LIST_NAME[real-1]}, \n")

real is positive, prediction is positive, 

real is neutral, prediction is neutral, 

real is neutral, prediction is neutral, 

real is positive, prediction is positive, 

real is neutral, prediction is neutral, 

real is neutral, prediction is neutral, 

real is negetive, prediction is negetive, 

real is neutral, prediction is neutral, 

real is neutral, prediction is neutral, 

real is neutral, prediction is neutral, 

real is neutral, prediction is neutral, 

real is negetive, prediction is negetive, 

real is positive, prediction is positive, 

real is negetive, prediction is negetive, 

real is neutral, prediction is neutral, 

real is neutral, prediction is neutral, 

real is neutral, prediction is neutral, 

real is neutral, prediction is neutral, 

real is neutral, prediction is neutral, 

real is positive, prediction is positive, 

real is neutral, prediction is neutral, 

real is neutral, prediction is neutral, 

real is neutral, prediction is neutral, 

real is negetive, pr

In [69]:
# for KNeighborsClassifier algoritm
model = KNeighborsClassifier()
model.fit(x_train, y_train)
model.score(x_test, y_test)

0.4

In [70]:
# for DecisionTreeClassifier algoritm
model = DecisionTreeClassifier()
model.fit(x_train, y_train)
model.score(x_test, y_test)

0.4666666666666667