## Split dataset in Train, Test and Validation 

In [2]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

### Load the data. 

In [2]:
df1 = pd.read_json("./data/Sarcasm_Headlines_Dataset.json", lines=True)
df2 = pd.read_json("./data/Sarcasm_Headlines_Dataset_v2.json", lines=True)
# re-order attibute columns in df2
df2 = df2[['article_link','headline','is_sarcastic']]
df = pd.concat([df1, df2], axis=0)
df = df.drop(['article_link'], axis=1)
print(len(df))
df.head()

55328


Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0


### Drop duplicates.

In [3]:
df.sort_values("headline", inplace=True)
df.drop_duplicates(subset="headline",
                   keep='first', inplace=True)
df.head()

Unnamed: 0,headline,is_sarcastic
5891,"""eco-warrior"" vandana shiva, at $40,000 a spee...",0
18992,"""how do we allow a gunman to come into our chi...",0
2387,"""i am equal to any man,"" says stern woman who ...",1
3144,"""i woke up like dis"": why my disability is the...",0
1777,"""remembering george haley: the greatest americ...",0


In [4]:
len(df)

28503

### Reset the index.

In [5]:
df.reset_index(inplace=True, drop=True)

In [6]:
df.head()

Unnamed: 0,headline,is_sarcastic
0,"""eco-warrior"" vandana shiva, at $40,000 a spee...",0
1,"""how do we allow a gunman to come into our chi...",0
2,"""i am equal to any man,"" says stern woman who ...",1
3,"""i woke up like dis"": why my disability is the...",0
4,"""remembering george haley: the greatest americ...",0


### Split the data into Train, Validation and Test sets.

- Train: 70% -> 19.952
- Validation: 10% -> 2.850
- Test: 20% -> 5.700




In [7]:
train, test = train_test_split(df, test_size=0.20)
train, val = train_test_split(train, test_size=0.12498)

(22802, 2)


In [8]:
print('train:',train.shape)
print('test:',test.shape)
print('val:',val.shape)

train: (19952, 2)
test: (5701, 2)
val: (2850, 2)


### Store datasets in separate CSVs

In [9]:
train.to_csv('./data/train.csv', index=True)
test.to_csv('./data/test.csv', index=True)
val.to_csv('./data/val.csv', index=True)

In [3]:
train = pd.read_csv("./data/train.csv")
val = pd.read_csv("./data/val.csv")
test = pd.read_csv("./data/test.csv")

In [4]:
train

Unnamed: 0.1,Unnamed: 0,headline,is_sarcastic
0,23789,the great vanishing,0
1,15323,mccain gets hammered at local vfw,1
2,18908,pissed off from a lack of sleep? you might be ...,0
3,21878,sitting inside cardboard box the safest 6-year...,1
4,15129,marcellus williams prosecutor drew scrutiny in...,0
...,...,...,...
19947,6769,deadlocked supreme court: 'someone's voting tw...,1
19948,18686,pennsylvania's congressional delegation will n...,0
19949,11805,humiliated team of cuban doctors forced to con...,1
19950,6052,condo board member thinks bylaw cover-up might...,1


In [10]:
(val.is_sarcastic == 0).sum()

1508

In [11]:
(val.is_sarcastic == 1).sum()

1342