In [121]:
import numpy as np
import pandas as pd

### Load data

In [122]:
# open train data 

with open('train_with_label.txt', 'r') as f: 
  data = f.readlines() 

X_train = [] 
y_train = [] 

for i in range(len(data)) : 
  cur = data[i].strip().split('\t') 
  X_train.append(cur[1]+cur[2]) 
  y_train.append(cur[3]) 


In [123]:
# open dev data 
 
with open('dev_with_label.txt', 'r') as f: 
  data = f.readlines() 
 
X_test = [] 
y_test = [] 

for i in range(len(data)) : 
  cur = data[i].strip().split('\t') 
  X_test.append(cur[1]+cur[2]) 
  y_test.append(cur[3]) 


In [124]:
# open test data
 
with open('test_without_label.txt', 'r') as f: 
  data = f.readlines() 

test = [] 
test_id = [] 
 
for i in range(len(data)) : 
  cur = data[i].strip().split('\t') 
  test_id.append(cur[0]) 
  test.append(cur[1]+cur[2]) 


### Data processing

In [125]:
import re 
import nltk 

from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer 
from nltk.util import ngrams 
from nltk.stem import WordNetLemmatizer 

nltk.download('punkt') 
nltk.download('stopwords') 
nltk.download('wordnet') 
nltk.download('omw-1.4') 

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [126]:
def data_processing(text): 
  pre_words = re.sub('[^A-Za-z]', ' ', text) 
  pre_words = pre_words.lower() 
  
  tokenized_words = word_tokenize(pre_words) 
  stops = set(stopwords.words('english')) 
  
  tokenized_words_remove = [] 
  for w in tokenized_words: 
    if w not in stops: 
      tokenized_words_remove.append(w) 
  
  lem = WordNetLemmatizer() 
  for i in range(len(tokenized_words_remove)): 
    tokenized_words_remove[i] = lem.lemmatize(tokenized_words_remove[i]) 

  return(" ".join(tokenized_words_remove)) 

In [127]:
X_train = [data_processing(i) for i in X_train] 
X_test = [data_processing(i) for i in  X_test] 
X = X_train + X_test 

test = [data_processing(i) for i in test] 

In [128]:
from sklearn.feature_extraction.text import CountVectorizer 

vectorizer = CountVectorizer(max_features=10000) 
             
X = np.asarray(X).astype("U")             
X_train = np.asarray(X_train).astype("U") 
X_test = np.asarray(X_test).astype("U") 
test = np.asarray(test).astype("U") 

X_features = vectorizer.fit_transform(X) 
X_train_features = vectorizer.transform(X_train) 
X_test_features = vectorizer.transform(X_test) 
test_features = vectorizer.transform(test) 

y = y_train + y_test 

### Modeling

In [129]:
from sklearn.svm import SVC 
from sklearn.model_selection import GridSearchCV 

params = { 
    'C' : [2, 3, 4, 5], 
    # 'gamma' : [0.01, 0.1, 1, 10,100, 1000], 
    'kernel' : ['rbf'] 
} 

clf = GridSearchCV(SVC(random_state = 0), params, cv=5) 
clf.fit(X_train_features, y_train)

print(clf.best_params_) 
print(clf.score(X_train_features, y_train)) 
print(clf.score(X_test_features, y_test))

{'C': 2, 'kernel': 'rbf'}
0.9835663478047584
0.600828729281768


In [130]:
clf = SVC(kernel = 'rbf', C=2, random_state=0) 
clf.fit(X_features, y) 

pred = clf.predict(test_features) 
 
result = [test_id, pred] 
result = pd.DataFrame(result).T 

result.head() 

Unnamed: 0,0,1
0,test_id_0,1
1,test_id_1,1
2,test_id_2,1
3,test_id_3,0
4,test_id_4,0


In [131]:
result.to_csv('test.txt', index=False, header=None, sep='\t') 