In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
# Load the tab-separated SMS corpus. Column names are supplied explicitly,
# so the first line of the file is read as data rather than as a header.
# NOTE(review): with pandas' default quote handling, lines containing an
# unbalanced '"' may be merged into one row -- confirm the row count against
# the dataset's own documentation.
df = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    names=['label', 'text'],
)
print("Total number of messages: ", df.shape[0])
df.head()

Total number of messages:  5572


Unnamed: 0,label,text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


# Label Encoding

In [3]:
def encode(text):
    """Convert a message label to its integer class id.

    Parameters
    ----------
    text : str or int
        ``'spam'`` or ``'ham'``. An already-encoded ``1`` or ``0`` is also
        accepted and returned as-is, so applying the encoding a second time
        (e.g. re-running the cell) does not wipe the column.

    Returns
    -------
    int
        ``1`` for spam, ``0`` for ham.

    Raises
    ------
    ValueError
        If ``text`` is neither a known label nor an already-encoded value.
        The previous version fell through and returned ``None`` in that
        case, silently producing missing labels.
    """
    codes = {'spam': 1, 'ham': 0}
    if isinstance(text, str):
        if text in codes:
            return codes[text]
    elif text in (0, 1):
        # Already encoded: normalise to a plain int and pass it through.
        return int(text)
    raise ValueError(f"Unexpected label: {text!r}")
# Overwrite the label column in place with the value encode() returns for
# each entry.
df['label'] = df['label'].map(encode)

In [None]:
# Save the current frame to raw_data.csv, leaving out the row index.
df.to_csv(path_or_buf='raw_data.csv', index=False)

In [7]:
# Report the frame's dimensions, then preview its first five rows.
print("Raw data shape", df.shape)
df.head()

Raw data shape (5572, 2)


Unnamed: 0,label,text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


# Train/Validation/Test Split

In [11]:
# Stage 1: hold out 30% of the rows, stratifying on the label column.
train_df, temp_df = train_test_split(
    df,
    test_size=0.30,
    stratify=df["label"],
    random_state=7,
)

# Stage 2: divide the held-out rows evenly into validation and test sets,
# again stratifying on the label column.
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.50,
    stratify=temp_df["label"],
    random_state=7,
)

print(*(part.shape for part in (train_df, val_df, test_df)))


(3900, 2) (836, 2) (836, 2)


In [12]:
# Show the normalised label frequencies of every split. The leading "\n" in
# the later headings puts a blank line between consecutive sections.
for heading, split_frame in (
    ("Train distribution:", train_df),
    ("\nValidation distribution:", val_df),
    ("\nTest distribution:", test_df),
):
    print(heading)
    print(split_frame.value_counts("label", normalize=True))

Train distribution:
label
0    0.865897
1    0.134103
Name: proportion, dtype: float64

Validation distribution:
label
0    0.866029
1    0.133971
Name: proportion, dtype: float64

Test distribution:
label
0    0.866029
1    0.133971
Name: proportion, dtype: float64


# Storing the splits as train.csv/validate.csv/test.csv


In [13]:
# Write each split to its own CSV file, omitting the row index.
for split_frame, csv_name in (
    (train_df, "train.csv"),
    (val_df, "validate.csv"),
    (test_df, "test.csv"),
):
    split_frame.to_csv(csv_name, index=False)

In [14]:
# Read each saved split back from disk and print its label counts.
# The frame read inside the loop gets its own name: binding it to `df` would
# replace the full dataset with the last file read, so any earlier cell that
# is re-run afterwards would operate on the wrong data.
for file in ["train.csv","validate.csv","test.csv"]:
    saved_split = pd.read_csv(file)
    print("\n", file)
    # Select the label column by name rather than by position.
    print(saved_split["label"].value_counts())



 train.csv
label
0    3377
1     523
Name: count, dtype: int64

 validate.csv
label
0    724
1    112
Name: count, dtype: int64

 test.csv
label
0    724
1    112
Name: count, dtype: int64


In [15]:
# Reload every split file and print how many rows carry each label.
# A dedicated loop variable is used instead of `df`, so the full dataset
# loaded at the top of the notebook is not overwritten by the last file read.
for file in ["train.csv","validate.csv","test.csv"]:
    reloaded_split = pd.read_csv(file)
    print("\n", file)
    # Address the label column by name instead of relying on column order.
    print(reloaded_split["label"].value_counts())


 train.csv
label
0    3377
1     523
Name: count, dtype: int64

 validate.csv
label
0    724
1    112
Name: count, dtype: int64

 test.csv
label
0    724
1    112
Name: count, dtype: int64


In [16]:
import hashlib
def file_hash(path, chunk_size=1 << 20):
    """Return the MD5 hex digest of the file at ``path``.

    Parameters
    ----------
    path : str or path-like
        File to hash; it is opened in binary mode.
    chunk_size : int, optional
        Number of bytes read per iteration (default 1 MiB). The file is
        hashed incrementally so it never has to fit in memory at once.

    Returns
    -------
    str
        Hexadecimal MD5 digest of the file's bytes.
    """
    digest = hashlib.md5()
    with open(path, 'rb') as f:
        # iter() with a sentinel stops at the empty bytes object returned at EOF.
        for chunk in iter(lambda: f.read(chunk_size), b''):
            digest.update(chunk)
    return digest.hexdigest()

# Print the digest of each saved split file.
for caption, csv_path in (
    ("Train hash:", "train.csv"),
    ("Test hash:", "test.csv"),
    ("Validate hash:", "validate.csv"),
):
    print(caption, file_hash(csv_path))

Train hash: 7504d5c77d03a8d9830e32b0cd70f86f
Test hash: 5bae99ecb2b43673b939536fc92453f1
Validate hash: fdc127bba804c051dd61d060b06dc147


In [17]:
import hashlib
def file_hash(path, chunk_size=1 << 20):
    """Compute the MD5 hex digest of a file's contents.

    Parameters
    ----------
    path : str or path-like
        File to hash; it is opened in binary mode.
    chunk_size : int, optional
        Bytes to read per step (default 1 MiB). Reading in pieces keeps
        memory use bounded regardless of the file's size.

    Returns
    -------
    str
        Hexadecimal MD5 digest of the file's bytes.
    """
    md5 = hashlib.md5()
    with open(path, 'rb') as f:
        while True:
            block = f.read(chunk_size)
            if not block:
                # An empty read means end of file.
                break
            md5.update(block)
    return md5.hexdigest()

# Report the digest of each split file, one line per file.
hash_targets = {
    "Train hash:": "train.csv",
    "Test hash:": "test.csv",
    "Validate hash:": "validate.csv",
}
for caption, csv_path in hash_targets.items():
    print(caption, file_hash(csv_path))

Train hash: d1343521f676b4aac5b433bd071751c4
Test hash: 8caf3d770b12cd8b9bc62a40a5bc57a7
Validate hash: 54af1dc6dfe38c671d5b1d42e5a864b1
