In [178]:
import numpy as np
import pandas as pd

## Load Data


In [179]:
# open train data 

with open('train_with_label.txt', 'r') as f: 
  data = f.readlines() 

X_train = [] 
y_train = [] 

for i in range(len(data)) : 
  cur = data[i].strip().split('\t') 
  X_train.append(cur[1]+cur[2]) 
  y_train.append(cur[3]) 


In [180]:
# open dev data 
 
with open('dev_with_label.txt', 'r') as f: 
  data = f.readlines() 
 
X_test = [] 
y_test = [] 

for i in range(len(data)) : 
  cur = data[i].strip().split('\t') 
  X_test.append(cur[1]+cur[2]) 
  y_test.append(cur[3]) 


In [181]:
# open test data
 
with open('test_without_label.txt', 'r') as f: 
  data = f.readlines() 

test = [] 
test_id = [] 
 
for i in range(len(data)) : 
  cur = data[i].strip().split('\t') 
  test_id.append(cur[0]) 
  test.append(cur[1]+cur[2]) 


## Data processing

In [182]:
import re 
import nltk 

from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords 
from nltk.stem.porter import PorterStemmer 
from nltk.util import ngrams 
from nltk.stem import WordNetLemmatizer 

nltk.download('punkt') 
nltk.download('stopwords') 
nltk.download('wordnet') 
nltk.download('omw-1.4') 

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [183]:
def data_processing(text): 
  pre_words = re.sub('[^A-Za-z]', ' ', text) 
  pre_words = pre_words.lower() 
  
  tokenized_words = word_tokenize(pre_words) 
  stops = set(stopwords.words('english')) 
  
  tokenized_words_remove = [] 
  for w in tokenized_words: 
    if w not in stops: 
      tokenized_words_remove.append(w) 
  
  lem = WordNetLemmatizer() 
  for i in range(len(tokenized_words_remove)): 
    tokenized_words_remove[i] = lem.lemmatize(tokenized_words_remove[i]) 

  return(" ".join(tokenized_words_remove)) 

In [184]:
X_train = [data_processing(i) for i in X_train] 
X_test = [data_processing(i) for i in  X_test] 
X = X_train + X_test 

test = [data_processing(i) for i in test] 

In [185]:
from sklearn.feature_extraction.text import CountVectorizer 

vectorizer = CountVectorizer(max_features=10000) 
             
X = np.asarray(X).astype("U")             
X_train = np.asarray(X_train).astype("U") 
X_test = np.asarray(X_test).astype("U") 
test = np.asarray(test).astype("U") 

X_features = vectorizer.fit_transform(X).todense() 
X_train_features = vectorizer.transform(X_train).todense()  
X_test_features = vectorizer.transform(X_test).todense()  
test_features = vectorizer.transform(test).todense()  

y_train = [int(i) for i in y_train]
y_test = [int(i) for i in y_test]
y = y_train + y_test 

## Modeling

In [186]:
import torch
import torch.optim as optim
torch.manual_seed(1)

import random
random.seed(1)

device = 'cuda' if torch.cuda.is_available() else 'cpu'

if device =='cuda':
    torch.cuda.manual_seed_all(1)

In [187]:
X_features = torch.FloatTensor(X_features).to(device)
X_train_features = torch.FloatTensor(X_train_features).to(device)
X_test_features = torch.FloatTensor(X_test_features).to(device)

y_train = torch.LongTensor(y_train).to(device)
y_test = torch.LongTensor(y_test).to(device)
y = torch.LongTensor(y).to(device)

test_features = torch.FloatTensor(test_features).to(device)

In [189]:
linear1 = torch.nn.Linear(10000, 512)
linear2 = torch.nn.Linear(512, 256)
linear3 = torch.nn.Linear(256, 2)

relu = torch.nn.ReLU()
dropout = torch.nn.Dropout(p=0.3)

model = torch.nn.Sequential(linear1, relu, dropout,
                            linear2, relu, dropout,
                            linear3).to(device)
                      

In [190]:
model.train()
loss = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

for stop in range(2000):
    optimizer.zero_grad()
    hypothesis = model(X_train_features)
    cost = loss(hypothesis, y_train)
    cost.backward()
    optimizer.step()

    if stop % 100 == 0:
        print(stop, cost.item())

0 0.6941550374031067
100 0.3217655420303345
200 0.08035217225551605
300 0.024747591465711594
400 0.011090864427387714
500 0.006890471093356609
600 0.0048704189248383045
700 0.003326995065435767
800 0.0026196730323135853
900 0.002280087675899267
1000 0.0018150962423533201
1100 0.001629456179216504
1200 0.0015871826326474547
1300 0.0015493796672672033
1400 0.0010908066760748625
1500 0.0013478504261001945
1600 0.0014300077455118299
1700 0.0013230439508333802
1800 0.0011023521656170487
1900 0.0014218721771612763


In [191]:
model.eval()
with torch.no_grad():
    predict = torch.nn.functional.softmax(model(X_test_features), dim=1)
    predict = torch.argmax(predict, dim=1)
    correct = y_test == predict.float()
    accuracy = correct.sum().item()/len(correct)
    print(accuracy)

0.70525


## Predict

In [192]:
final_model = torch.nn.Sequential(linear1, relu, dropout,
                            linear2, relu, dropout,
                            linear3).to(device)

final_model.train()
loss = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-4)

for stop in range(2000):
    optimizer.zero_grad()
    hypothesis = final_model(X_features)
    cost = loss(hypothesis, y)
    cost.backward()
    optimizer.step()

    if stop % 100 == 0:
        print(stop, cost.item())

0 1.1904971599578857
100 0.09577173739671707
200 0.030580949038267136
300 0.01669861562550068
400 0.010655930265784264
500 0.007596219889819622
600 0.005634463392198086
700 0.004892973694950342
800 0.003925013355910778
900 0.0031242305412888527
1000 0.002840819302946329
1100 0.002551558194682002
1200 0.002266542986035347
1300 0.002353779738768935
1400 0.0022114263847470284
1500 0.0021080458536744118
1600 0.0019282345892861485
1700 0.002005541929975152
1800 0.0017615482211112976
1900 0.0019121746299788356


In [193]:
final_model.eval()
with torch.no_grad():
    predict = torch.nn.functional.softmax(model(test_features), dim=1)
    predict = torch.argmax(predict, dim=1)

predict = predict.cpu().numpy()

In [195]:
result = [test_id, predict] 
result = pd.DataFrame(result).T 

result.head() 

Unnamed: 0,0,1
0,test_id_0,1
1,test_id_1,1
2,test_id_2,0
3,test_id_3,0
4,test_id_4,0


In [196]:
result.to_csv('test.txt', index=False, header=None, sep='\t') 