In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

In [2]:
DIR = 'data/YouTube-Spam-Collection-v1'
FILE_NAMES = ['Youtube01-Psy.csv', 'Youtube02-KatyPerry.csv', 'Youtube03-LMFAO.csv', 'Youtube04-Eminem.csv','Youtube04-Eminem.csv']

In [3]:
dfs = []

for f in FILE_NAMES:
    path = DIR+'/'+f
    df = pd.read_csv(path)
    new_df = pd.DataFrame(columns=['text','label'])
    new_df['text'] = df['CONTENT']
    new_df['label'] = df['CLASS']
    dfs.append(new_df)

df_train = pd.concat(dfs[:4])
df_val_test = dfs[4]

df_val, df_test = train_test_split(df_val_test,
                                     test_size=250, 
                                     random_state=123, 
                                     stratify=df_val_test.label)

df_dev = df_train.sample(100, random_state=123)


In [4]:
LABELED_PATH = DIR+'/'+'L_data.csv'
UNLABELED_PATH = DIR+'/'+'U_data.csv'
VAL_PATH = DIR +'/' +'V_data.csv'
TEST_PATH = DIR+ '/'+'test_data.csv'

df_train.to_csv(UNLABELED_PATH)
df_dev.to_csv(LABELED_PATH)
df_val.to_csv(VAL_PATH)
df_test.to_csv(TEST_PATH)

In [5]:
df_train

Unnamed: 0,text,label
0,"Huh, anyway check out this you[tube] channel: ...",1
1,Hey guys check out my new channel and our firs...,1
2,just for test I have to say murdev.com,1
3,me shaking my sexy ass on my channel enjoy ^_^ ﻿,1
4,watch?v=vtaRGgvGtWQ Check this out .﻿,1
...,...,...
443,SUBSCRIBE TO MY CHANNEL X PLEASE!. SPARE,1
444,Check out my videos guy! :) Hope you guys had ...,1
445,3 yrs ago I had a health scare but thankfully ...,1
446,Rihanna looks so beautiful with red hair ;)﻿,0


In [6]:
df_dev

Unnamed: 0,text,label
84,Nice﻿,0
332,Almost a billion﻿,0
276,http://vimeo.com/106865403﻿,1
10,I love this song so much﻿,0
343,"Something to dance to, even if your sad JUST ...",0
...,...,...
156,"Search ""Chubbz Dinero - Ready Or Not "" Thanks ﻿",1
237,Check out this video on YouTube:﻿,1
17,Check out our Channel for nice Beats!!﻿,1
406,"Like this comment, guys i just started up a ne...",1


In [7]:
df_test

Unnamed: 0,text,label
327,im M.E.S an aspiring young rapper with high ho...,1
54,Eminem and Rihanna sing the song very well.﻿,0
299,Lol thats the guy from animal planet and lost....,0
156,watch youtube video &quot;EMINEM -YTMA artist ...,1
181,You guys should check out this EXTRAORDINARY w...,1
...,...,...
142,Share Eminem&#39;s Artist of the Year video so...,1
446,Rihanna looks so beautiful with red hair ;)﻿,0
312,Hey guys I&#39;m 87 cypher im 11 years old and...,1
382,Lil m !!!!! Check hi out!!!!! Does live the wa...,1


In [8]:
df_val

Unnamed: 0,text,label
392,awesome﻿,0
114,Do you need more instagram followers or photo ...,1
97,eminem is a ginius stop!﻿,0
414,Not bad﻿,0
160,Tell us the title so i can like and subscribe ...,1
...,...,...
205,I know that maybe no one will read this but PL...,1
198,2015 and more....﻿,0
353,Fire..﻿,0
335,Okay trust me I&#39;m doing a favor. You NEED ...,1


In [108]:
vectorizer = CountVectorizer(ngram_range=(1, 2))

all_text = np.append(np.append(np.append(np.array(df_train['text']), np.array(df_dev['text'])),np.array(df_val['text'])),np.array(df_test['text']))
vectorizer.fit(all_text)

CountVectorizer(ngram_range=(1, 2))

In [109]:
U_feats = vectorizer.transform(np.array(df_train['text']))
U_feats_label = np.array(df_train['label'])
print(U_feats.shape)

with open(DIR+'/'+'U_feats.npy', 'wb') as f:
    np.save(f, U_feats.todense())
    np.save(f, U_feats_label)

(1586, 16634)


In [110]:
L_feats = vectorizer.transform(np.array(df_dev['text']))
L_feats_label = np.array(df_dev['label'])
print(L_feats.shape)

with open(DIR+'/'+'L_feats.npy', 'wb') as f:
    np.save(f, L_feats.todense())
    np.save(f, L_feats_label)

(100, 16634)


In [111]:
V_feats = vectorizer.transform(np.array(df_val['text']))
V_feats_label = np.array(df_val['label'])
print(V_feats.shape)

with open(DIR+'/'+'V_feats.npy', 'wb') as f:
    np.save(f, V_feats.todense())
    np.save(f, V_feats_label)

(198, 16634)


In [112]:
test_feats = vectorizer.transform(np.array(df_test['text']))
test_feats_label = np.array(df_test['label'])
print(test_feats.shape)

with open(DIR+'/'+'test_feats.npy', 'wb') as f:
    np.save(f, test_feats.todense())
    np.save(f, test_feats_label)

(250, 16634)


In [113]:
def create_prcoessed_files(feats_file, rule_labels_file, processed_file):
    with open(feats_file,'rb') as f:
        feats = np.load(f,allow_pickle=True)
        feats_label = np.load(f,allow_pickle=True)
    with open(rule_labels_file,'rb') as f:
        rule_labels = np.load(f,allow_pickle=True)


    f = open(processed_file,'wb')

#     x : feature representation of instances
#     shape : [num_instances, num_features]
    np.save(f, feats)
    
#     l : Class Labels assigned by rules
#     shape : [num_instances, num_rules]
    np.save(f, rule_labels)
    
#     m : Rule coverage mask
#     A binary matrix of shape [num_instances, num_rules]
    m = rule_labels>=0 + 0
    np.save(f, m)
    
#     L : Instance labels
#     shape : [num_instances, ]
    np.save(f, feats_label)
    
#     r : A binary matrix of shape [num_instances, num_rules]
#     r[i][j]=1 if jth rule was associated with ith instance
    r = np.zeros(rule_labels.shape)
    np.save(f, r)
    
    
    print("feats : ", feats.shape)
    print("rule_labels : ",rule_labels.shape)
    print("m : ",m.shape)
    print("feats_label : ",feats_label.shape)
    print("r : ", r.shape)
    
    
    

In [114]:
feats_file = 'data/YouTube-Spam-Collection-v1/L_feats.npy'
rule_labels_file = 'data/YouTube-Spam-Collection-v1/L_L.npy'
preprocessed_file = 'data/YouTube-Spam-Collection-v1/L_preprocess.npy'
create_prcoessed_files(feats_file, rule_labels_file,preprocessed_file)

feats :  (100, 16634)
rule_labels :  (100, 9)
m :  (100, 9)
feats_label :  (100,)
r :  (100, 9)


In [115]:
feats_file = 'data/YouTube-Spam-Collection-v1/U_feats.npy'
rule_labels_file = 'data/YouTube-Spam-Collection-v1/U_L.npy'
preprocessed_file = 'data/YouTube-Spam-Collection-v1/U_preprocess.npy'
create_prcoessed_files(feats_file, rule_labels_file,preprocessed_file)

feats :  (1586, 16634)
rule_labels :  (1586, 9)
m :  (1586, 9)
feats_label :  (1586,)
r :  (1586, 9)


In [116]:
feats_file = 'data/YouTube-Spam-Collection-v1/V_feats.npy'
rule_labels_file = 'data/YouTube-Spam-Collection-v1/V_L.npy'
preprocessed_file = 'data/YouTube-Spam-Collection-v1/V_preprocess.npy'
create_prcoessed_files(feats_file, rule_labels_file,preprocessed_file)

feats :  (198, 16634)
rule_labels :  (198, 9)
m :  (198, 9)
feats_label :  (198,)
r :  (198, 9)


In [117]:
feats_file = 'data/YouTube-Spam-Collection-v1/test_feats.npy'
rule_labels_file = 'data/YouTube-Spam-Collection-v1/test_L.npy'
preprocessed_file = 'data/YouTube-Spam-Collection-v1/test_preprocess.npy'
create_prcoessed_files(feats_file, rule_labels_file,preprocessed_file)

feats :  (250, 16634)
rule_labels :  (250, 9)
m :  (250, 9)
feats_label :  (250,)
r :  (250, 9)
