In [2]:
import pandas as pd
import numpy as np
from torch import nn
import torch
from torch.autograd import Variable
from torch.utils.data import DataLoader, Dataset, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
import torch.nn.functional as F

In [3]:
data = pd.read_csv('YoutubeComplete.csv',delimiter='\t')
data = data[['CONTENT','CLASS']]

# Drop na
data = data.dropna()
data.head()

Unnamed: 0,CONTENT,CLASS
0,i love this so much. AND also I Generate Free ...,1.0
1,http://www.billboard.com/articles/columns/pop-...,1.0
2,Hey guys! Please join me in my fight to help a...,1.0
3,http://psnboss.com/?ref=2tGgp3pV6L this is the...,1.0
4,Hey everyone. Watch this trailer!!!!!!!! http...,1.0


In [4]:
features = data.CONTENT.values
labels = data.CLASS.values
num_words = 1000

In [5]:
t = Tokenizer(num_words=1000)
t.fit_on_texts(features)

In [6]:
features = t.texts_to_matrix(features, mode='tfidf')

In [7]:
features.shape

(1606, 1000)

In [8]:
# Building model
class Model(nn.Module):
    def __init__(self, input, hidden, output):
        super(Model, self).__init__()
        self.l1 = nn.Linear(input, hidden)
        self.l2 = nn.Linear(hidden , hidden)
        self.l3 = nn.Linear(hidden, 2)
    
    def forward(self, x):
        out = F.relu(self.l1(x))
        out = F.relu(self.l2(out))
        out = self.l3(out)
        return out        

In [9]:
input = 1000
hidden = 100
output = 2

In [10]:
model = Model(input, hidden, output)

In [11]:
features_train, features_test, labels_train, labels_test = train_test_split(features, labels, shuffle=True, random_state=34)


In [25]:
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)
criterion = nn.CrossEntropyLoss()

In [20]:
def train(epochs):
    x_train = Variable(torch.from_numpy(features_train)).float()
    y_train = Variable(torch.from_numpy(labels_train)).long()
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        y_pred = model(x_train)
        loss = criterion(y_pred, y_train)
        print ("epoch #",epoch)
        print ("loss: ", loss.item())
        pred = torch.max(y_pred, 1)[1].eq(y_train).sum()
        print ("acc:(%) ", 100*pred/len(x_train))
        loss.backward()
        optimizer.step()

In [21]:
def test(epochs):
    model.eval()
    x_test = Variable(torch.from_numpy(features_test)).float()
    y_test = Variable(torch.from_numpy(labels_test)).long()
    for epoch in range(epochs):
        with torch.no_grad():
            y_pred = model(x_test)
            loss = criterion(y_pred, y_test)
            print ("epoch #",epoch)
            print ("loss: ", loss.item())
            pred = torch.max(y_pred, 1)[1].eq(y_test).sum()
            print ("acc (%): ", 100*pred/len(x_test))

In [22]:
train(10)

epoch # 0
loss:  0.6999464631080627
acc:(%)  tensor(40.0332)
epoch # 1
loss:  0.6065702438354492
acc:(%)  tensor(50.0831)
epoch # 2
loss:  0.4859074056148529
acc:(%)  tensor(77.9900)
epoch # 3
loss:  0.37152785062789917
acc:(%)  tensor(94.8505)
epoch # 4
loss:  0.27058711647987366
acc:(%)  tensor(97.1761)
epoch # 5
loss:  0.18117663264274597
acc:(%)  tensor(98.5880)
epoch # 6
loss:  0.10746495425701141
acc:(%)  tensor(99.0864)
epoch # 7
loss:  0.05896689370274544
acc:(%)  tensor(99.3355)
epoch # 8
loss:  0.033976756036281586
acc:(%)  tensor(99.5017)
epoch # 9
loss:  0.02282559685409069
acc:(%)  tensor(99.5017)


In [23]:
test(10)

epoch # 0
loss:  0.9145722389221191
acc (%):  tensor(91.2935)
epoch # 1
loss:  0.9145722389221191
acc (%):  tensor(91.2935)
epoch # 2
loss:  0.9145722389221191
acc (%):  tensor(91.2935)
epoch # 3
loss:  0.9145722389221191
acc (%):  tensor(91.2935)
epoch # 4
loss:  0.9145722389221191
acc (%):  tensor(91.2935)
epoch # 5
loss:  0.9145722389221191
acc (%):  tensor(91.2935)
epoch # 6
loss:  0.9145722389221191
acc (%):  tensor(91.2935)
epoch # 7
loss:  0.9145722389221191
acc (%):  tensor(91.2935)
epoch # 8
loss:  0.9145722389221191
acc (%):  tensor(91.2935)
epoch # 9
loss:  0.9145722389221191
acc (%):  tensor(91.2935)


In [None]:
pred = model(torch.from_numpy(features_test).float())

In [None]:
pred = torch.max(pred,1)[1]

In [None]:
p_train = model(torch.from_numpy(features_train).float())
p_train = torch.max(p_train,1)[1]
p_train = p_train.data.numpy()
accuracy_score(labels_train, p_train)

In [None]:
from sklearn.metrics import confusion_matrix

In [None]:
cm = confusion_matrix(labels_test, pred)
cm

In [13]:
import sys,os
sys.path.insert(0,os.path.join('Code/Preprocess'))

In [14]:
from DataPreparation import dataPrepare
fetchData = dataPrepare()

In [15]:
X,y =  fetchData.deepLearningInput('YoutubeComplete.csv')

In [23]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.3,shuffle=True, random_state=34)
x_val,_,y_val,_ = train_test_split(X_test,y_test, test_size=0.25, random_state=1)

In [27]:
x_train = Variable(torch.from_numpy(X_train)).float()
y_train = Variable(torch.from_numpy(y_train.numpy())).long()
# y_train = y_train.unsqueeze(1)
model.train()
for epoch in range(20):
    optimizer.zero_grad()
    y_pred = model(x_train)
    loss = criterion(y_pred, y_train)
    print ("epoch #",epoch)
    print ("loss: ", loss.item())
    pred = torch.max(y_pred, 1)[1].eq(y_train).sum()
    print ("acc:(%) ", (100*pred/len(x_train)).item())
    loss.backward()
    optimizer.step()

epoch # 0
loss:  0.7040508389472961
acc:(%)  42.542015075683594
epoch # 1
loss:  0.6144607067108154
acc:(%)  94.32772827148438
epoch # 2
loss:  0.4574204981327057
acc:(%)  95.69327545166016
epoch # 3
loss:  0.28568124771118164
acc:(%)  96.21849060058594
epoch # 4
loss:  0.15670649707317352
acc:(%)  96.32353210449219
epoch # 5
loss:  0.09152691066265106
acc:(%)  96.42857360839844
epoch # 6
loss:  0.06836235523223877
acc:(%)  96.74369812011719
epoch # 7
loss:  0.06219467520713806
acc:(%)  96.84873962402344
epoch # 8
loss:  0.06022853031754494
acc:(%)  96.84873962402344
epoch # 9
loss:  0.056714434176683426
acc:(%)  96.95378112792969
epoch # 10
loss:  0.05257551744580269
acc:(%)  97.05882263183594
epoch # 11
loss:  0.05144520103931427
acc:(%)  97.05882263183594
epoch # 12
loss:  0.05420515686273575
acc:(%)  96.42857360839844
epoch # 13
loss:  0.05541239678859711
acc:(%)  96.53361511230469
epoch # 14
loss:  0.05333348363637924
acc:(%)  96.53361511230469
epoch # 15
loss:  0.0513363592326641

In [30]:
model.eval()
x_test = Variable(torch.from_numpy(x_val)).float()
y_test = Variable(torch.from_numpy(y_val.to_numpy())).long()
for epoch in range(10):
    with torch.no_grad():
        y_pred = model(x_test)
        loss = criterion(y_pred, y_test)
        print ("epoch #",epoch)
        print ("loss: ", loss.item())
        pred = torch.max(y_pred, 1)[1].eq(y_test).sum()
        print ("acc (%): ", (100*pred/len(x_test)).item())

epoch # 0
loss:  0.3851505219936371
acc (%):  86.60131072998047
epoch # 1
loss:  0.3851505219936371
acc (%):  86.60131072998047
epoch # 2
loss:  0.3851505219936371
acc (%):  86.60131072998047
epoch # 3
loss:  0.3851505219936371
acc (%):  86.60131072998047
epoch # 4
loss:  0.3851505219936371
acc (%):  86.60131072998047
epoch # 5
loss:  0.3851505219936371
acc (%):  86.60131072998047
epoch # 6
loss:  0.3851505219936371
acc (%):  86.60131072998047
epoch # 7
loss:  0.3851505219936371
acc (%):  86.60131072998047
epoch # 8
loss:  0.3851505219936371
acc (%):  86.60131072998047
epoch # 9
loss:  0.3851505219936371
acc (%):  86.60131072998047
