In [1]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn import model_selection

In [2]:
# Read the train and test splits from CSV files in the working directory
# (train first, then test).
train_data, test_data = (
    pd.read_csv(csv_path) for csv_path in ("train.csv", "test.csv")
)

# Preview the first rows of the training split.
train_data.head()

Unnamed: 0,review,sentiment
0,Bromwell High is a cartoon comedy. It ran at t...,positive
1,Homelessness (or Houselessness as George Carli...,positive
2,Brilliant over-acting by Lesley Ann Warren. Be...,positive
3,This is easily the most underrated film inn th...,positive
4,This is not the typical Mel Brooks film. It wa...,positive


In [3]:
# Encode the string labels in place on both splits: 'positive' -> 1, 'negative' -> 0.
for frame in (train_data, test_data):
    frame.loc[frame['sentiment'] == 'positive', 'sentiment'] = 1
    frame.loc[frame['sentiment'] == 'negative', 'sentiment'] = 0

# Preview the training split after encoding.
train_data.head()

Unnamed: 0,review,sentiment
0,Bromwell High is a cartoon comedy. It ran at t...,1
1,Homelessness (or Houselessness as George Carli...,1
2,Brilliant over-acting by Lesley Ann Warren. Be...,1
3,This is easily the most underrated film inn th...,1
4,This is not the typical Mel Brooks film. It wa...,1


In [4]:
# Label vectors for both splits, cast to integers after the 1/0 encoding.
Y = train_data['sentiment'].astype('int')
Y_test = test_data['sentiment'].astype('int')

In [5]:
# Row counts of the train and test splits, one per line.
print(len(train_data), len(test_data), sep="\n")

25000
25000


In [6]:
import re, string
from sklearn.feature_extraction.text import TfidfVectorizer


In [7]:
# Escape the punctuation so every character is a literal inside the character
# class; unescaped, the `\]` pair in string.punctuation is parsed as an escaped
# `]` and the backslash itself is never matched.
re_punct = re.compile(f'([{re.escape(string.punctuation)}])')
_PUNCT_CHARS = frozenset(string.punctuation)


def tokenize(text):
    """Lower-case ``text`` and split it into tokens with punctuation removed.

    Every ASCII punctuation character (``string.punctuation``) is padded with
    spaces so that it becomes its own token, the text is split on whitespace,
    and the stand-alone punctuation tokens are dropped.

    Args:
        text: The string to tokenize.

    Returns:
        list[str]: The lower-cased tokens in their original order; an empty
        list when ``text`` holds only whitespace and/or punctuation.
    """
    pieces = re_punct.sub(r' \1 ', text.lower()).split()
    # After padding, a token is either one punctuation character or has none.
    return [piece for piece in pieces if piece not in _PUNCT_CHARS]

In [8]:
# TF-IDF over unigrams and bigrams built from the custom `tokenize` function,
# with the vocabulary capped via max_features=10000.
vectorizer = TfidfVectorizer(
    tokenizer=tokenize,
    ngram_range=(1, 2),
    max_features=10000,
)

# Fit on the training reviews, then apply the same fitted vectorizer to the test reviews.
X = vectorizer.fit_transform(train_data['review'])
X_test = vectorizer.transform(test_data['review'])

In [9]:
from sklearn.svm import SVC

In [10]:
# Support vector classifier with a linear kernel and C=0.5.
# NOTE(review): gamma presumably has no effect with kernel='linear' (scikit-learn
# documents it for the rbf/poly/sigmoid kernels) — confirm before removing it.
classif = SVC(
    kernel='linear',
    C=0.5,
    gamma='auto',
)


In [11]:
# Train the classifier on the TF-IDF training matrix X with the integer labels Y.
classif.fit(X, Y)

SVC(C=0.5, gamma='auto', kernel='linear')

In [12]:
from sklearn.metrics import accuracy_score

In [13]:
# Predict labels for the test split and report accuracy against Y_test.
y_pred_test = classif.predict(X_test)
test_accuracy = accuracy_score(Y_test, y_pred_test)
print("test accuracy with TF-IDF: ", test_accuracy)

test accuracy with TF-IDF:  0.89736


In [24]:
# Store the fitted model's coefficients as a dense array via `.toarray()`.
# NOTE(review): `coef_` is presumably a SciPy sparse matrix here because the
# model was fit on sparse TF-IDF input — confirm.
weight = classif.coef_.toarray()

In [23]:
# `weight` is already a dense NumPy array (it is assigned from
# `classif.coef_.toarray()`), and ndarrays have no `.toarray()` method, so
# display it directly instead of raising AttributeError on a fresh run.
weight

array([[-0.59429853, -0.21550691, -1.74646158, ..., -0.61180898,
        -0.0949914 , -0.14190561]])

In [15]:
import pickle

In [25]:
# Serialize `weight` with pickle into the file 'svm_weight' in the working directory.
with open('svm_weight', "wb") as weight_file:
    pickle.dump(weight, weight_file)

In [21]:
# `weight` is a dense NumPy array at this point, so read `.shape` directly;
# calling `.toarray()` on it would raise AttributeError on a fresh run.
weight.shape

(1, 10000)

In [19]:
# First element of the first row of `weight`.
# NOTE(review): the saved output below shows a sparse matrix, so it appears to
# come from an earlier run (In [19]) made before `weight` was converted with
# `.toarray()` in In [24]; re-run the cell to refresh it.
weight[0][0]

<1x10000 sparse matrix of type '<class 'numpy.float64'>'
	with 10000 stored elements in Compressed Sparse Row format>