In [1]:
import pickle
import numpy as np
import pandas as pd
import numpy as np
import pandas as pd
import torch
from torch import nn
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from torch import Tensor
import sklearn
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
import copy

In [2]:
# ./test/test_texts.dat
# ./train/train_labels.txt
# ./train/train_texts.dat

In [3]:
file_name = './train/train_texts.dat'
with open(file_name,'rb') as f:
    train_texts = pickle.load(f)

In [4]:
file_name1 = './train/train_labels.txt'
# train_labels = np.loadtxt(file_name1)
train_label = pd.read_table(file_name1,header=None,dtype=int)

In [5]:
file_name = './test/test_texts.dat'
with open(file_name,'rb') as f:
    test_texts = pickle.load(f)

In [6]:
vectorizer = TfidfVectorizer(max_features = 10000)
vectors_train = vectorizer.fit_transform(train_texts)
vectors_train = vectors_train.toarray()
vectors_train = pd.DataFrame(vectors_train)
vectors_train.shape

(11314, 10000)

In [7]:
test = vectorizer.transform(test_texts)
test = test.toarray()
test = pd.DataFrame(test)

In [8]:
num_train = int(0.8 * vectors_train.shape[0]) # 划分训练样本和验证集样本
indices = np.arange( vectors_train.shape[0])
np.random.shuffle(indices)  # shuffle 顺序
train_indices = indices[:num_train]
valid_indices = indices[num_train:]

# 提取训练集和验证集的特征
train_features = vectors_train.iloc[train_indices].values.astype(np.float32)
train_features = torch.from_numpy(train_features)
valid_features = vectors_train.iloc[valid_indices].values.astype(np.float32)
valid_features = torch.from_numpy(valid_features)
train_valid_features = vectors_train[:vectors_train.shape[0]].values.astype(np.float32)
train_valid_features = torch.from_numpy(train_valid_features)

test = test.values.astype(np.float32)
x_test = torch.from_numpy(test)

# 提取训练集和验证集的label
train_labels = train_label.values[train_indices]#.astype(np.int16)
train_labels = torch.from_numpy(train_labels).squeeze()
valid_labels = train_label.values[valid_indices]#.astype(np.int16)
valid_labels = torch.from_numpy(valid_labels).squeeze()
train_valid_labels = train_label.values[:, None]#.astype(np.int16)
train_valid_labels = torch.from_numpy(train_valid_labels)

In [10]:
def get_model(feat_dim,output_dim = 20 ):
    net = nn.Sequential(
        nn.Linear(int(feat_dim), int(feat_dim/8)),
        nn.BatchNorm1d(int(feat_dim/8)),
        nn.ReLU(inplace=True),
        nn.Linear(int(feat_dim/8),int(feat_dim/16)),
        nn.BatchNorm1d(int(feat_dim/16)),
        nn.ReLU(inplace=True),
        nn.Linear(int(feat_dim/16), output_dim)
        #nn.Softmax(output_dim,20)
        )
    return net
           
        
def get_data(x,y,batch_size,shuffle):
    dataset = TensorDataset(x,y)
    return DataLoader(dataset, batch_size, shuffle=shuffle,num_workers=4)

def train_model(model,x_train,y_train,x_valid,y_valid,epochs,batch_size,lr,weight_decay,use_gpu):
    
    if use_gpu:
        model = model.cuda()
    train_data = get_data(x_train,y_train,batch_size,True)

    optimizer = torch.optim.Adam(model.parameters(), lr=lr, weight_decay=weight_decay)
    criterion = nn.CrossEntropyLoss()
    best_accu = 0
    for e in  range(epochs):
        model.train()
        for data in train_data:
            x,y = data
            if use_gpu:
                x = x.cuda()
                y = y.cuda()
                
            out = model(x)
            loss = criterion(out, y)
            
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
        if use_gpu:
                x_valid = x_valid.cuda()
        with torch.no_grad():
            output = model(x_valid)
        output = Tensor.cpu(output)
        out = output.numpy()
        print('epoch:{}'.format(e))
        accu = (np.argmax(out,axis=1) == y_valid.numpy()).sum()/len(y_valid)
        print('accuracy:{}'.format(accu))
        '''
        if accu > best_accu:
            best_model = copy.deepcopy(model)
    torch.save(best_model,"model_1.t7")'''

def predict(x_test):
    net = torch.load('./model_1.t7')
    a = net(x_test).detach().numpy()
    a = np.argmax(a,axis=1)
    with open('./ans.txt','w') as f:
        for i in a:
            f.write('{}'.format(i))
            f.write('\n')
    

In [11]:
# 可以调整的超参
batch_size = 64
epochs = 100
lr = 0.001
wd = 0
use_gpu = False
feat_dim = vectors_train.shape[1]
net = get_model(feat_dim)
train_model(net, train_features, train_labels, valid_features, valid_labels, epochs, 
            batch_size, lr, wd, use_gpu)
#predict(x_test)

epoch:0
accuracy:0.894387980556783
epoch:1
accuracy:0.894387980556783
epoch:2
accuracy:0.9001325673884224
epoch:3
accuracy:0.898806893504198
epoch:4
accuracy:0.9049933716305789
epoch:5
accuracy:0.9027839151568714
epoch:6
accuracy:0.9072028281042863
epoch:7
accuracy:0.9049933716305789
epoch:8
accuracy:0.9058771542200619
epoch:9
accuracy:0.9063190455148034
epoch:10
accuracy:0.9072028281042863
epoch:11
accuracy:0.9067609368095448
epoch:12
accuracy:0.9058771542200619
epoch:13
accuracy:0.9063190455148034
epoch:14
accuracy:0.9054352629253204
epoch:15
accuracy:0.9058771542200619
epoch:16
accuracy:0.9080866106937693
epoch:17
accuracy:0.9080866106937693
epoch:18
accuracy:0.9098541758727353
epoch:19
accuracy:0.9080866106937693
epoch:20
accuracy:0.9076447193990278
epoch:21
accuracy:0.9080866106937693
epoch:22
accuracy:0.9085285019885109
epoch:23
accuracy:0.8144056562085726
epoch:24
accuracy:0.8550596553247901
epoch:25
accuracy:0.8784798939460893
epoch:26
accuracy:0.8815731330092798
epoch:27
accur

In [103]:
#predict(x_test)

In [None]:
a = [[1,2,3],[4,6,3]]
a = np.array(a)
a = np.argmax(a,axis=1)
with open('./ans.txt','w') as f:
        for i in a:
            f.write('{}'.format(i))
            f.write('\n')

In [None]:
'''
a=[[1,2,3],[4,5,6],[9,8,7]]
a=np.array(a)
np.argmax(a,axis=1)
b = np.array([1,2,0])
(np.argmax(a,axis=1) == b).sum()/len(b)'''

In [81]:
def accu(a,b):
    c = a.numpy().astype(np.int16)
    d = b.numpy().astype(np.int16)
    num = d.shape[0]
    count = 0
    for i in range(num):
        if c[i] == d[i]:
            count += 1
    return count/num

In [None]:
        #print(out)
        ''' 
        print(output.shape) 
        
        print('epoch:{}'.format(e))
        m = torch.mean(torch.eq(torch.argmax(out, dim=1), torch.argmax(y_valid, dim=1)).double())
        print(m)
        #print('accuracy:{}'.format(accu(output,y_valid)))'''