## NLP 数据处理+ 分类
### 采用了fake news dataset <https://www.kaggle.com/clmentbisaillon/fake-and-real-news-dataset>
### 使用pyTorch 框架或TF 主要学习数据处理，怎么处理文本到向量的转变
### 部分代码参考 <https://www.kaggle.com/rushinaik/mission-torch-1#Train-Test-Splitting>

In [2]:
#import
import torch 
import torch.nn as nn 
import numpy as np 
import pandas as pd 

#TfidVectorizer
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer

import nltk
import random
import scipy
from torch import optim
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

seed_val = 1234
random.seed(seed_val)
np.random.seed(seed_val)
torch.manual_seed(seed_val)
torch.cuda.manual_seed_all(seed_val)

In [3]:
#overview the data
df1 = pd.read_csv('D:/ML_data_sql/news/True.csv')
df2 = pd.read_csv('D:/ML_data_sql/news/Fake.csv')
df1['label'] = 1
df2['label'] = 0
df = pd.concat([df1, df2], axis=0)

del df1 
del df2
df.head()

Unnamed: 0,title,text,subject,date,label
0,"As U.S. budget fight looms, Republicans flip t...",WASHINGTON (Reuters) - The head of a conservat...,politicsNews,"December 31, 2017",1
1,U.S. military to accept transgender recruits o...,WASHINGTON (Reuters) - Transgender people will...,politicsNews,"December 29, 2017",1
2,Senior U.S. Republican senator: 'Let Mr. Muell...,WASHINGTON (Reuters) - The special counsel inv...,politicsNews,"December 31, 2017",1
3,FBI Russia probe helped by Australian diplomat...,WASHINGTON (Reuters) - Trump campaign adviser ...,politicsNews,"December 30, 2017",1
4,Trump wants Postal Service to charge 'much mor...,SEATTLE/WASHINGTON (Reuters) - President Donal...,politicsNews,"December 29, 2017",1


In [4]:
print(f'Shape of the dataset: {df.shape}')
print(f'\nSum of nulls:\n{df.isna().sum()}')
#发现没有空的 空的可能要补上之类的

Shape of the dataset: (44898, 5)

Sum of nulls:
title      0
text       0
subject    0
date       0
label      0
dtype: int64


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 44898 entries, 0 to 23480
Data columns (total 5 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   title    44898 non-null  object
 1   text     44898 non-null  object
 2   subject  44898 non-null  object
 3   date     44898 non-null  object
 4   label    44898 non-null  int64 
dtypes: int64(1), object(4)
memory usage: 2.1+ MB


In [6]:
#preprocessing 
def normalise_text(text):
    #re  to normalise
    text = text.str.lower()
    text = text.str.replace(r"\#",'')
    text = text.str.replace(r'http\S+','URL')
    text = text.str.replace(r'@','')
    text = text.str.replace(r"[^A-Za-z0-9()!?\'\`\"]",' ')
    text = text.str.replace("\s{2,}"," ")

    return text

df['text'] = df['title'] +" "+df['text']
df['text'] = normalise_text(df['text'])
del df['title']
del df['subject']
del df['date']
#split train and test
#7:3
#word -> vector  simple
X_train,X_test,y_train,y_test = train_test_split(df['text'],df['label'],test_size=0.3)
#create vector 
vectorize = TfidfVectorizer(ngram_range=(1,1),max_features=10000)

X_train = vectorize.fit_transform(X_train)
X_test = vectorize.fit_transform(X_test)
 
#稠密矩阵
X_train = torch.tensor(X_train.todense()).float() 
X_test = torch.Tensor(X_test.todense()).float()
y_train = torch.tensor(y_train.values)
y_test = torch.tensor(y_test.values)
print(X_train.shape)

torch.Size([31428, 10000])


In [7]:
# %%time
# model = nn.Sequential(
#                 nn.Linear(X_train.shape[1],128 ),
#                 nn.ReLU(),
#                 nn.Dropout(0.1),
#                 nn.Linear(128, df['label'].nunique()),
#                 nn.LogSoftmax(dim=1)
# )
# # defining the loss 
# criterion = nn.NLLLoss()

# # Forward pass, get our logits
# logps = model(X_train)

# # Calculate the loss with the logits and the labels
# loss = criterion(logps, y_train)


# loss.backward()

# # Optimizers require the parameters to optimize and a learning rate
# optimizer = optim.Adam(model.parameters(), lr=0.002)

# train_losses = []
# test_losses = []
# test_accuracies = []

# epochs = 100
# for e in range(epochs):
#     optimizer.zero_grad()

#     output = model.forward(X_train)
#     loss = criterion(output, y_train)
#     loss.backward()
#     train_loss = loss.item()
#     train_losses.append(train_loss)
    
#     optimizer.step()
    
    
#     # Turn off gradients for validation, saves memory and computations
#     with torch.no_grad():
#         model.eval()
#         log_ps = model(X_test)
#         test_loss = criterion(log_ps, y_test)
#         test_losses.append(test_loss)

#         ps = torch.exp(log_ps)
#         top_p, top_class = ps.topk(1, dim=1)
#         equals = top_class == y_test.view(*top_class.shape)
#         test_accuracy = torch.mean(equals.float())
#         test_accuracies.append(test_accuracy)

#     model.train()
#     if (e+1)%10==0:
#         print(f"Epoch: {e+1}/{epochs}.. ",
#               f"Training Loss: {train_loss:.3f}.. ",
#               f"Test Loss: {test_loss:.3f}.. ",
#               f"Test Accuracy: {test_accuracy:.3f}")


In [8]:
%%time
#train acc = 1  test acc = 0.62
from torch.utils.data import DataLoader,Dataset
class NLPDataset(Dataset):
    def __init__(self, x, y=None):
        self.x = x
        # label is required to be a LongTensor
        self.y = y
        if y is not None:
            self.y = torch.LongTensor(y)
    def __len__(self):
        return len(self.x)
    def __getitem__(self, index):
        X = self.x[index]
        if self.y is not None:
            Y = self.y[index]
            return X, Y
        else:
            return X

batch_size= 1024
train_set = NLPDataset(X_train,y_train)
test_set = NLPDataset(X_test,y_test)
train_loader = DataLoader(train_set,batch_size = batch_size,shuffle=True)
test_loader = DataLoader(test_set,batch_size=batch_size,shuffle=False)
#model 
import torch.nn.functional as F
class LR(nn.Module):
    def __init__(self):
        super(LR,self).__init__()
        #tensor 50* 50*1
        self.fc = nn.Sequential(
            nn.Linear(10000, 1024),
            nn.ReLU(),
            nn.Linear(1024, 512),
            nn.ReLU(),
            nn.Linear(512, 128), 
            nn.ReLU(), 
            nn.Linear(128,2)
        )
    def forward(self,x):
        out = self.fc(x)
        #out = F.softmax(out,dim=1)
        return out 
class ANN(nn.Module):
    
    def __init__(self):
        
        super(ANN,self).__init__() # Inhertiting
        
        self.linear1 = nn.Linear(10000,2000) # IN 5008 OUT 2000
        self.relu1 = nn.ReLU() # Actfunc 1
        
        self.linear2 = nn.Linear(2000,500) # IN 2000 OUT 500
        self.relu2 = nn.ReLU()
        
        self.linear3 = nn.Linear(500,100) # IN 500 OUT 100
        self.relu3 = nn.ReLU()
        
        self.linear4 = nn.Linear(100,20) # IN 100 OUT 20
        self.relu4 = nn.ReLU()
        
        self.linear5 = nn.Linear(20,2) # IN 20 OUT 2 | OUTPUT 
        
    
    def forward(self,x):
        
        out = self.linear1(x) # Input Layer 
        out = self.relu1(out)
        
        out = self.linear2(out) # Hidden Layer 1 
        out = self.relu2(out)
        
        out = self.linear3(out) # Hidden Layer 2 
        out = self.relu3(out)
        
        out = self.linear4(out) # Hidden Layer 3 
        out = self.relu4(out)

        
        out = self.linear5(out) # Output Layer
        
        return out
# class TextCNN(nn.Module):
#     def __init__(self):
#         super(TextCNN,self).__init__()
#         self.cnn = nn.Sequential(
#             nn.Conv1d(in_channels=1,out_channels=1,kernel_size=3,padding=1),
#             nn.ReLU(),
#             nn.Conv1d(in_channels=1,out_channels=1,kernel_size=3,padding=1),
#             nn.ReLU(),
#             nn.Conv1d(in_channels=1,out_channels=1,kernel_size=3,padding=1),
#             nn.ReLU(),
#         )
#         self.fc = nn.Linear(10000,2)
#     def forward(self,x):
#         x = self.cnn(x)
#         x = x.view(x.size(0),-1)
#         out = self.fc(x)
#         return out
    
#settings 
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# device = 'cpu'
model = ANN().to(device)
cirection = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(),lr =0.002)
import time
epochs = 100


Wall time: 2.61 s


In [9]:
from sklearn.metrics import accuracy_score,confusion_matrix
for epoch in range(epochs):
    epoch_start_time = time.time()
    train_acc =0.0
    val_acc =0.0
    train_loss = 0.0
    val_loss =0.0
    y_pred_total = []
    y_true= []
    y_val_total = []
    y_val_true= []
    model.train()
    for i,data in enumerate(train_loader):
        optimizer.zero_grad()
        x,y = data[0].to(device),data[1].to(device)
        #print(y.shape)
        y_pred = model(x)
        #print(y_pred.shape)
        loss = cirection(y_pred,y.long())
        loss.backward()
        optimizer.step()
        y_pred_total.append(np.argmax(y_pred.cpu().detach().numpy(),axis=1))
        # print(len(y_pred_total))
        y_true.append(y.cpu().detach().numpy())
        # print(len(y_true))
        # break
        #train_acc +=np.sum(np.argmax(y_pred.cpu().data.numpy(),axis=1)== y.cpu().numpy())
        train_loss +=loss.item()
    model.eval()
    with torch.no_grad():
        for i,data in enumerate(test_loader):
            valx ,valy = data[0].to(device),data[1].to(device)
            val_pred = model(valx)
            y_val_total.append(val_pred.cpu().detach().numpy())
            y_val_true.append(valy.cpu().detach().numpy())
            batch_loss = cirection(val_pred,valy.long())
            #val_acc +=np.sum(np.argmax(val_pred.cpu().data.numpy(),axis=1)== valy.cpu().numpy())
            val_loss +=batch_loss.item()
    #print(y_pred_total.size,y_true.shape)
    mm = [j for i in y_pred_total for j in i]
    mm2 = [j for i in y_true for j in i]
    #print(len(merge_list),len(merge_list2))
    #train_acc = accuracy_score(merge_list,merge_list2)
    #print(train_acc)
    #print(len(y_pred_total[0]),len(y_true[0]))
    #break
    m1 = [j for i in y_val_total for j in i]
    m2 = [j for i in y_val_true for j in i]
    #print(len(mm),len(mm2))
    train_acc = accuracy_score(mm,mm2)
    test_acc = accuracy_score(m1,m2)
    if epoch %10==0: 
        print('[%03d/%03d] %2.2f sec(s) Train Acc: %3.6f Loss: %3.6f | Val Acc: %3.6f loss: %3.6f' % \
            (epoch + 1, epochs, time.time()-epoch_start_time, \
                train_acc, train_loss/train_set.__len__(), val_acc, val_loss/test_set.__len__()))


ValueError: Classification metrics can't handle a mix of continuous-multioutput and binary targets