# Split dataset in Train, Test and Validation 

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

### Load the data. 

In [2]:
df1 = pd.read_json("./data/Sarcasm_Headlines_Dataset.json", lines=True)
df2 = pd.read_json("./data/Sarcasm_Headlines_Dataset_v2.json", lines=True)
# re-order attibute columns in df2
df2 = df2[['article_link','headline','is_sarcastic']]
df = pd.concat([df1, df2], axis=0)
df = df.drop(['article_link'], axis=1)
print(len(df))
df.head()

55328


Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0


### Drop duplicates.

In [3]:
df.sort_values("headline", inplace=True)
df.drop_duplicates(subset="headline",
                   keep='first', inplace=True)
df.head()

Unnamed: 0,headline,is_sarcastic
5891,"""eco-warrior"" vandana shiva, at $40,000 a spee...",0
18992,"""how do we allow a gunman to come into our chi...",0
2387,"""i am equal to any man,"" says stern woman who ...",1
3144,"""i woke up like dis"": why my disability is the...",0
1777,"""remembering george haley: the greatest americ...",0


In [4]:
len(df)

28503

### Reset the index.

In [5]:
df.reset_index(inplace=True, drop=True)

In [6]:
df.head()

Unnamed: 0,headline,is_sarcastic
0,"""eco-warrior"" vandana shiva, at $40,000 a spee...",0
1,"""how do we allow a gunman to come into our chi...",0
2,"""i am equal to any man,"" says stern woman who ...",1
3,"""i woke up like dis"": why my disability is the...",0
4,"""remembering george haley: the greatest americ...",0


### Split the data into Train, Validation and Test sets.

- Train: 70% -> 19.952
- Validation: 10% -> 2.850
- Test: 20% -> 5.700




In [7]:
train, test = train_test_split(df, test_size=0.20)
train, val = train_test_split(train, test_size=0.12498)

(22802, 2)


In [8]:
print('train:',train.shape)
print('test:',test.shape)
print('val:',val.shape)

train: (19952, 2)
test: (5701, 2)
val: (2850, 2)


### Store datasets in separate CSVs

In [9]:
train.to_csv('./data/train.csv', index=True)
test.to_csv('./data/test.csv', index=True)
val.to_csv('./data/val.csv', index=True)

# Check sizes of the different partitions

In [11]:
train = pd.read_csv("./data/train.csv")
val = pd.read_csv("./data/val.csv")
test = pd.read_csv("./data/test.csv")

In [15]:
test

Unnamed: 0.1,Unnamed: 0,headline,is_sarcastic
0,16840,nevada secretary of state unveils new 'i voted...,1
1,8605,famed chef homaro cantu found dead,0
2,28447,your high school boyfriend still smoking cigar...,1
3,576,'the chew' co-host wants her kids to know moms...,0
4,10348,gummi bear emerges from digestive tract unharmed,1
...,...,...,...
5696,16929,new comic features aquaman as 45-year-old sing...,1
5697,26143,"uber driver denies ride to woman in labor, sti...",0
5698,23763,the gift is giving,0
5699,26593,voters shocked christie botched such an easy p...,1


In [21]:
(test.is_sarcastic == 0).sum()

2989

In [22]:
(test.is_sarcastic == 1).sum()

2712

# Analysis of the data

In [4]:
df1 = pd.read_json("./data/Sarcasm_Headlines_Dataset.json", lines=True)
df2 = pd.read_json("./data/Sarcasm_Headlines_Dataset_v2.json", lines=True)
# re-order attibute columns in df2
df2 = df2[['article_link','headline','is_sarcastic']]
df = pd.concat([df1, df2], axis=0)
df = df.drop(['article_link'], axis=1)
print(len(df))
df.head()

55328


Unnamed: 0,headline,is_sarcastic
0,former versace store clerk sues over secret 'b...,0
1,the 'roseanne' revival catches up to our thorn...,0
2,mom starting to fear son's web series closest ...,1
3,"boehner just wants wife to listen, not come up...",1
4,j.k. rowling wishes snape happy birthday in th...,0


In [5]:
df.sort_values("headline", inplace=True)
df.drop_duplicates(subset="headline",
                   keep='first', inplace=True)
df.head()

Unnamed: 0,headline,is_sarcastic
5891,"""eco-warrior"" vandana shiva, at $40,000 a spee...",0
18992,"""how do we allow a gunman to come into our chi...",0
2387,"""i am equal to any man,"" says stern woman who ...",1
3144,"""i woke up like dis"": why my disability is the...",0
1777,"""remembering george haley: the greatest americ...",0


In [6]:
df.reset_index(inplace=True, drop=True)

Let's see how many instances of each type we have after cleaning the data.

In [7]:
(df.is_sarcastic == 0).sum()

14951

In [8]:
(df.is_sarcastic == 1).sum()

13552

We have 14951 non-sarcastic instances and 13552 sarcastic instances.

In [10]:
df.head()

Unnamed: 0,headline,is_sarcastic
0,"""eco-warrior"" vandana shiva, at $40,000 a spee...",0
1,"""how do we allow a gunman to come into our chi...",0
2,"""i am equal to any man,"" says stern woman who ...",1
3,"""i woke up like dis"": why my disability is the...",0
4,"""remembering george haley: the greatest americ...",0


### There are duplicates in the 2nd dataset:

In [33]:
df2 = pd.read_json("./data/Sarcasm_Headlines_Dataset_v2.json", lines=True)
df2 = df2[['article_link','headline','is_sarcastic']]
df2 = df2.drop(['article_link'], axis=1)
df2.sort_values(by=['headline'], inplace=True)


In [34]:
print(df2['headline'].duplicated().any()) # True
print(df2["headline"].is_unique)


True
False


### There are duplicates in the 1st dataset:

In [35]:
df1 = pd.read_json("./data/Sarcasm_Headlines_Dataset.json", lines=True)
df1 = df1.drop(['article_link'], axis=1)
df1.sort_values(by=['headline'], inplace=True)


In [36]:
print(df1['headline'].duplicated().any()) # True
print(df1["headline"].is_unique)


True
False


Therefore, as there are duplicates in both datasets, I first join them just in case there are some different instances between both datasets and then I remove the duplicates.