In [157]:
!pip install torchsummary



In [None]:
from google.colab import drive
drive.mount("/content/drive")

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import random
import pandas as pd
import numpy as np
import os
from copy import deepcopy

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import preprocessing
from sklearn.metrics import f1_score

import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader

from tqdm.auto import tqdm

import warnings
warnings.filterwarnings(action='ignore')

In [None]:
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')

In [None]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(42) # Seed 고정

In [None]:
os.getcwd()

'/content'

In [None]:
train = pd.read_csv("/content/drive/MyDrive/Project/open/train.csv")
test = pd.read_csv("/content/drive/MyDrive/Project/open/test.csv")

In [None]:
train.head()

Unnamed: 0,ID,first_party,second_party,facts,first_party_winner
0,TRAIN_0000,Phil A. St. Amant,Herman A. Thompson,"On June 27, 1962, Phil St. Amant, a candidate ...",1
1,TRAIN_0001,Stephen Duncan,Lawrence Owens,Ramon Nelson was riding his bike when he suffe...,0
2,TRAIN_0002,Billy Joe Magwood,"Tony Patterson, Warden, et al.",An Alabama state court convicted Billy Joe Mag...,1
3,TRAIN_0003,Linkletter,Walker,Victor Linkletter was convicted in state court...,0
4,TRAIN_0004,William Earl Fikes,Alabama,"On April 24, 1953 in Selma, Alabama, an intrud...",1


In [None]:
test.head()

Unnamed: 0,ID,first_party,second_party,facts
0,TEST_0000,Salerno,United States,The 1984 Bail Reform Act allowed the federal c...
1,TEST_0001,Milberg Weiss Bershad Hynes and Lerach,"Lexecon, Inc.",Lexecon Inc. was a defendant in a class action...
2,TEST_0002,No. 07-582\t Title: \t Federal Communications ...,"Fox Television Stations, Inc., et al.","In 2002 and 2003, Fox Television Stations broa..."
3,TEST_0003,Harold Kaufman,United States,During his trial for armed robbery of a federa...
4,TEST_0004,Berger,Hanlon,"In 1993, a magistrate judge issued a warrant a..."


In [None]:
vectorizer = TfidfVectorizer(min_df = 4, analyzer = 'word', ngram_range=(3,5))

def make_vector(vectorizer,dataframe,mode):
    if mode:
        x_facts = vectorizer.fit_transform(dataframe['facts'])

    else:
        x_facts = vectorizer.transform(dataframe['facts'])

    x_party1 = vectorizer.transform(dataframe['first_party'])
    x_party2 = vectorizer.transform(dataframe['second_party'])

    x = np.concatenate([x_party1.todense(), x_party2.todense(), x_facts.todense()], axis=1)

    return x


In [None]:
def class_weights(df):
    class_counts = df["first_party_winner"].value_counts()
    class_weights = 1.0/class_counts
    class_weights = class_weights/class_weights.min()
    class_weights = class_weights.to_dict()
    class_weights = {k: v for k, v in sorted(class_weights.items(), key=lambda item: item[0])}
    class_weights = list(class_weights.values())
    class_weights = torch.FloatTensor(class_weights).to(device)
    return class_weights
class_weights = class_weights(train)

In [None]:
x_train = make_vector(vectorizer,train,True)
y_train = train['first_party_winner']
x_test = make_vector(vectorizer,test,False)

In [None]:
input_size_first = x_train.shape[1]
print(input_size_first)

40671


In [None]:
class CustomDataset(Dataset):

    def __init__(self,data,label,mode = "Train"):
        self.data = data
        self.label = label
        self.mode = mode

    def __getitem__(self,idx):
        vectorized_data = torch.FloatTensor(self.data[idx]).squeeze(0)
        if self.mode == "Train":
            answer = self.label.values[idx].item()
            return vectorized_data,answer

        else:
            return vectorized_data

    def __len__(self):
        return len(self.data)


In [None]:
train_dataset = CustomDataset(data = x_train,label = y_train,mode = "Train")
test_dataset = CustomDataset(data = x_test,label = y_train,mode = "Test")



In [None]:
batch_size = 32


train_dataloader = DataLoader(train_dataset,batch_size = batch_size,shuffle= True)
test_dataloader = DataLoader(test_dataset,batch_size = batch_size,shuffle = False )


In [None]:
class BaseModel(nn.Module):

    def __init__(self,input_dim):
        super().__init__()
        self.feature_extraction_layer = nn.Sequential(
            nn.Linear(in_features = input_dim,out_features = 2048),
            nn.BatchNorm1d(2048),
            nn.LeakyReLU(),
            nn.Linear(in_features = 2048,out_features = 1024),
            nn.BatchNorm1d(1024),
            nn.LeakyReLU(),
            nn.Linear(in_features = 1024,out_features = 1024),
            nn.BatchNorm1d(1024),
            nn.LeakyReLU(),
            nn.Linear(in_features = 1024,out_features = 512),
            nn.BatchNorm1d(512),
            nn.LeakyReLU(),
            nn.Linear(in_features = 512,out_features = 256))

        self.classifier = nn.Sequential(
            nn.Dropout(p = 0.3),
            nn.Linear(in_features = 256,out_features = 2),

        )

    def forward(self,x):
        output = self.feature_extraction_layer(x)
        output = self.classifier(output)

        return output


In [None]:
model = BaseModel(input_dim = input_size_first)
criterion = nn.CrossEntropyLoss(weight = class_weights)
optimizer  = optim.Adam(model.parameters(),lr= 0.0001)
num_epochs = 40


In [None]:
def train(model,dataloader,num_epochs,criterion,optimizer,device):
    model.to(device)
    model.train()
    lowest_loss = np.inf
    for epoch in range(num_epochs):
        epoch_loss = 0.0

        for batch in dataloader:
            x = batch[0].to(device)
            y = batch[1].to(device)
            pred = model(x)
            #print(pred.size())
            #print(y.size())
            loss = criterion(pred.squeeze(),y.long())

            loss.backward()
            optimizer.step()


            epoch_loss += loss.item()
        epoch_loss /= len(dataloader)
        if epoch_loss < lowest_loss:
            best_model = deepcopy(model)
            lowest_loss = epoch_loss

            print(epoch)
        print(f"epoch: {epoch+1} loss : {epoch_loss}")
    return best_model

In [None]:
trained_model = train(model,train_dataloader,num_epochs,criterion,optimizer,device)

0
epoch: 1 loss : 1.0446024162647052
epoch: 2 loss : 1.048695356800006
2
epoch: 3 loss : 0.5024668100552682
3
epoch: 4 loss : 0.3969330063376289
4
epoch: 5 loss : 0.1867145565105546
5
epoch: 6 loss : 0.11125074926044216
epoch: 7 loss : 0.1859285829605512
epoch: 8 loss : 0.16783460174281992
8
epoch: 9 loss : 0.10489585238851686
epoch: 10 loss : 0.13622127308581883
epoch: 11 loss : 0.11979019064575624
epoch: 12 loss : 0.13778837446200973
epoch: 13 loss : 0.11240550234222797
epoch: 14 loss : 0.12909618631690534
14
epoch: 15 loss : 0.04519527257143944
epoch: 16 loss : 0.05906453457756756
16
epoch: 17 loss : 0.028100253641726695
epoch: 18 loss : 0.030731014173380408
epoch: 19 loss : 0.06260786989871003
19
epoch: 20 loss : 0.02136762438045173
epoch: 21 loss : 0.055783631541825404
21
epoch: 22 loss : 0.009929354480302419
epoch: 23 loss : 0.024565942950032033
epoch: 24 loss : 0.018657327544769904
epoch: 25 loss : 0.009939054231480102
epoch: 26 loss : 0.0355341484556028
epoch: 27 loss : 0.01421

In [None]:
def test(model,dataloader,criterion,optimizer,device):

    model.to(device)
    model.eval()
    answer_list = []

    with torch.no_grad():
        for sentence in iter(dataloader):
            x = sentence.to(device)
            answer = model(x)
            answer_list += answer.argmax(1).detach().cpu().numpy().tolist()

    return answer_list



In [None]:
answer_list = test(trained_model,test_dataloader,criterion,optimizer,device)

In [None]:
submit1 = pd.read_csv("/content/drive/MyDrive/Project/open/sample_submission.csv")
submit1.head()


Unnamed: 0,ID,first_party_winner
0,TEST_0000,0
1,TEST_0001,0
2,TEST_0002,0
3,TEST_0003,0
4,TEST_0004,0


In [None]:
submit1['first_party_winner'] = answer_list


In [None]:
submit1.head()


Unnamed: 0,ID,first_party_winner
0,TEST_0000,0
1,TEST_0001,1
2,TEST_0002,1
3,TEST_0003,1
4,TEST_0004,1


In [None]:
submit1["first_party_winner"].value_counts()

1    890
0    350
Name: first_party_winner, dtype: int64

In [None]:
submit1.to_csv("answer4.csv",index = False)