In [20]:
import pandas as pd
import csv
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import sklearn
from sklearn.model_selection import train_test_split 
from scipy import sparse
import nltk
import random

In [2]:
!pip install nltk


[notice] A new release of pip available: 22.1.2 -> 23.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip


Loading the raw data

In [6]:
# The corpus file is tab-separated; column names are supplied here, so the
# first line of the file is read as data. QUOTE_NONE keeps quote characters
# inside a message as literal text instead of treating them as delimiters.
messages = pd.read_csv(
    'SMSSpamCollection',
    sep='\t',
    names=["label", "message"],
    quoting=csv.QUOTE_NONE,
)

In [6]:
# Fetch the NLTK stopword corpus. Failure is reported through the return value
# rather than an exception: the recorded run below hit a connection error and
# returned False.
# NOTE(review): no visible cell uses the stopword list — confirm this download
# is still needed.
nltk.download('stopwords')

[nltk_data] Error loading stopwords: <urlopen error [WinError 10054]
[nltk_data]     An existing connection was forcibly closed by the
[nltk_data]     remote host>


False

In [4]:
def split_into_lemmas(message):
    """Tokenize one message into lower-cased lemmas.

    Parameters
    ----------
    message : str or bytes
        Text of a single message. ``bytes`` input is decoded as UTF-8 first.

    Returns
    -------
    list
        ``word.lemma`` for each entry of ``TextBlob(text).words``, in order.
    """
    # The previous str.format(message, 'utf-8') call used the message itself
    # as a format template: text containing '{' or '}' either raised
    # (ValueError / IndexError / KeyError) or had '{}' replaced by 'utf-8'.
    # The 'utf-8' argument suggests a decode was intended, so decode bytes
    # explicitly and leave str input untouched.
    if isinstance(message, bytes):
        message = message.decode('utf-8')
    text = message.lower()
    return [word.lemma for word in TextBlob(text).words]

In [12]:
# Learn the bag-of-words vocabulary, with split_into_lemmas as the analyzer so
# that every token is a lower-cased lemma.
# NOTE(review): the vocabulary is learned from every message, i.e. before the
# train/test split performed further down — confirm this is intended.
bow_transformer = CountVectorizer(analyzer=split_into_lemmas)
bow_transformer = bow_transformer.fit(messages['message'])

In [13]:
# Term counts for every message, followed by TF-IDF re-weighting of those
# counts. The fitted transformer is kept in `tfidf_transformer`.
messages_bow = bow_transformer.transform(messages['message'])
tfidf_transformer = TfidfTransformer()
messages_tfidf = tfidf_transformer.fit_transform(messages_bow)

In [21]:
# Seeds Python's built-in `random` module only.
# NOTE(review): scikit-learn's train_test_split takes its randomness from its
# `random_state` argument / NumPy's RNG, not from the stdlib `random` module,
# so this call does not appear to make the splits below repeatable — confirm.
random.seed(100)

In [22]:
# Hold out 20% of the TF-IDF rows (and their labels) as the test set.
# random_state pins the shuffle so the split is identical on every run; the
# stdlib random.seed(100) call earlier in the notebook does not influence
# train_test_split.
messages_train, messages_test, y_train, y_test = train_test_split(
    messages_tfidf,
    messages['label'],
    test_size=0.2,
    random_state=100,
)

In [23]:
# Carve 20% of the remaining training rows off as the validation set.
# random_state pins the shuffle so the split is identical on every run; the
# stdlib random.seed(100) call earlier in the notebook does not influence
# train_test_split.
message_train, messages_valid, classify_train, classify_valid = train_test_split(
    messages_train,
    y_train,
    test_size=0.2,
    random_state=100,
)

In [24]:
# Training table: dense TF-IDF features side by side with the labels.
# The label index is reset so both frames share a 0..n-1 index and concat
# pairs them up row by row.
train_dataset = pd.DataFrame(message_train.todense())
classify_df = classify_train.to_frame().reset_index(drop=True)
train_data = pd.concat([train_dataset, classify_df], axis=1)

In [25]:
# Test table: dense TF-IDF features side by side with the labels.
# The label index is reset so both frames share a 0..n-1 index and concat
# pairs them up row by row.
test_dataset = pd.DataFrame(messages_test.todense())
test_classify = y_test.to_frame().reset_index(drop=True)
test_data = pd.concat([test_dataset, test_classify], axis=1)

In [26]:
# Validation table: dense TF-IDF features side by side with the labels.
# The label index is reset so both frames share a 0..n-1 index and concat
# pairs them up row by row.
valid_dataset = pd.DataFrame(messages_valid.todense())
valid_classify = classify_valid.to_frame().reset_index(drop=True)
valid_data = pd.concat([valid_dataset, valid_classify], axis=1)

In [27]:
# Write each split to its own CSV (to_csv's default also writes the row index
# as the first column).
# NOTE(review): these files land in the working directory, whereas the DVC
# cells below track ./data/*.csv — confirm the intended location.
for csv_name, split_frame in (
    ('train.csv', train_data),
    ('test.csv', test_data),
    ('validation.csv', valid_data),
):
    split_frame.to_csv(csv_name)

Initialising dvc

In [None]:
# Run `dvc init` from the parent directory, so the DVC repository is created
# there rather than in the notebook's own folder.
!cd .. && dvc init

Initialized DVC repository.

You can now commit the changes to git.

+---------------------------------------------------------------------+
|                                                                     |
|        DVC has enabled anonymous aggregate usage analytics.         |
|     Read the analytics documentation (and how to opt-out) here:     |
|             <https://dvc.org/doc/user-guide/analytics>              |
|                                                                     |
+---------------------------------------------------------------------+

What's next?
------------
- Check out the documentation: <https://dvc.org/doc>
- Get help and share ideas: <https://dvc.org/chat>
- Star us on GitHub: <https://github.com/iterative/dvc>


Adding all three split CSV files to DVC

In [None]:
# Put each split file under DVC tracking. Per the output below, every call
# produces a matching <name>.csv.dvc file and updates data/.gitignore, which
# are the files to commit to git.
!dvc add ./data/train.csv
!dvc add ./data/validation.csv
!dvc add ./data/test.csv


To track the changes with git, run:

	git add 'data\.gitignore' train.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true

To track the changes with git, run:

	git add 'data\.gitignore' validation.csv.dvc

To enable auto staging, run:

	dvc config core.autostage true

To track the changes with git, run:

	git add test.csv.dvc 'data\.gitignore'

To enable auto staging, run:

	dvc config core.autostage true


In [None]:
# Enable DVC auto staging — the option suggested by the `dvc add` output above
# as an alternative to running `git add` by hand.
!dvc config core.autostage true

Adding a Google Drive folder as remote data storage

In [None]:
# Register the Google Drive folder as a DVC remote named `myremote`;
# --default makes it the default remote (confirmed by the output below).
# NOTE(review): the Drive folder ID is hard-coded in the notebook — confirm it
# is meant to be shared with everyone who can read this file.
!cd .. && dvc remote add --default myremote gdrive://1WM_n-19W7nOe2Pyr-3yHzAUOFyvEgPOy

Setting 'myremote' as a default remote.


In [None]:
# Set the gdrive_acknowledge_abuse option on `myremote`.
# NOTE(review): presumably so that files Google Drive flags as potentially
# abusive can still be downloaded — verify against the DVC gdrive remote docs.
!dvc remote modify myremote gdrive_acknowledge_abuse true

Pushing dvc tracked files to remote storage

In [None]:
# Upload the DVC-tracked files to the default remote (the recorded run pushed
# 3 files).
!dvc push

3 files pushed


In [None]:
# Second data split; 121 is presumably the random seed (the matching commit is
# titled "Second Split Random Seed 121").
# NOTE(review): neither data_split nor raw_data is defined in any visible cell,
# so this cell fails on a fresh kernel unless they are defined elsewhere —
# confirm and add the missing definitions.
data_split(raw_data, 121)

In [None]:
# Check which DVC-tracked files differ from their recorded versions; after the
# second split all three CSVs are reported as modified (see output).
!dvc status

test.csv.dvc:
	changed outs:
		modified:           Assignment 2\data\test.csv
train.csv.dvc:
	changed outs:
		modified:           Assignment 2\data\train.csv
validation.csv.dvc:
	changed outs:
		modified:           Assignment 2\data\validation.csv


In [None]:
# Upload the files of the second split to the default remote (the recorded run
# pushed 3 files).
!dvc push

3 files pushed


Checking out the different versions of the data split

In [None]:
# List the commit history to find the hashes of the commits that recorded each
# data split.
!git log

commit b4e31b9f1fb6fd99a4f0f5eaee62322c65d0393f
Author: Soham Biswas <biswassoham434@gmail.com>
Date:   Tue Feb 28 00:16:57 2023 +0530

    Second Split Random Seed 121

commit 250db97be7ad21504f968454f6cf9cd55cad6bd1
Author: Soham Biswas <biswassoham434@gmail.com>
Date:   Tue Feb 28 00:07:45 2023 +0530

    First Split Random Seed 42

commit efe6d6c31d44abd0961ce32546915d7381ae2d97
Author: Soham Biswas <biswassoham434@gmail.com>
Date:   Tue Feb 28 00:07:15 2023 +0530

    dvc remote added and pushed

commit 2683509f5b8a4d5735d52e8e6ef0dba594985f9d
Author: Soham Biswas <biswassoham434@gmail.com>
Date:   Tue Feb 28 00:02:41 2023 +0530

    dvc init

commit 32d0797cf3e7baea5831db53520785188d28fe9a
Author: Soham Biswas <biswassoham434@gmail.com>
Date:   Tue Feb 28 00:01:44 2023 +0530

    dvc deleted for fresh start

commit a8cd5cea8129ba08b71b89fe78f42aa63563b417
Author: Soham Biswas <biswassoham434@gmail.com>
Date:   Mon Feb 27 23:59:51 2023 +0530

    dvc deleted for fresh start

commi

Checkout for 1st version

In [None]:
# Check out the commit titled "First Split Random Seed 42". Checking out a bare
# hash leaves the repository in detached-HEAD state (see output).
!git checkout 250db97be7ad21504f968454f6cf9cd55cad6bd1

Note: switching to '250db97be7ad21504f968454f6cf9cd55cad6bd1'.

You are in 'detached HEAD' state. You can look around, make experimental
changes and commit them, and you can discard any commits you make in this
state without impacting any branches by switching back to a branch.

If you want to create a new branch to retain commits you create, you may
do so (now or later) by using -c with the switch command. Example:

  git switch -c <new-branch-name>

Or undo this operation with:

  git switch -

Turn off this advice by setting config variable advice.detachedHead to false

HEAD is now at 250db97 First Split Random Seed 42


In [None]:
# Bring the DVC-tracked data files in line with the .dvc files of the commit
# just checked out; the output lists the three CSVs as modified.
!dvc checkout

M       Assignment 2\data\train.csv
M       Assignment 2\data\test.csv
M       Assignment 2\data\validation.csv


Function to get the distribution of SMS labels in the split data

In [None]:
def label_dist(path):
    """Print how many "ham" and "spam" labels a split CSV contains.

    The labels are taken from the CSV's second column (position 1).

    Parameters
    ----------
    path : str
        Location of the CSV file to summarise.
    """
    labels = pd.read_csv(path).iloc[:, 1].tolist()
    counts = {name: labels.count(name) for name in ("ham", "spam")}

    print("Ham: {}, Spam: {}".format(counts["ham"], counts["spam"]))

For 1st Split (Random Seed: 42)

In [None]:
# Locations of the three DVC-tracked split files (reused by the report for the
# second split further down).
train_path = './data/train.csv'
val_path = './data/validation.csv'
test_path = './data/test.csv'

# Print the ham/spam counts of each split file currently checked out.
print("First Split (Random Seed: 42)")
for heading, csv_path in (
    ("\nTraining dataset:", train_path),
    ("\nValidation dataset:", val_path),
    ("\nTesting dataset:", test_path),
):
    print(heading)
    label_dist(csv_path)

First Split (Random Seed: 42)

Training dataset:
Ham: 3909, Spam: 605

Validation dataset:
Ham: 440, Spam: 62

Testing dataset:
Ham: 478, Spam: 80


Checkout for 2nd Split

In [None]:
# Check out the commit titled "Second Split Random Seed 121".
!git checkout b4e31b9f1fb6fd99a4f0f5eaee62322c65d0393f

Previous HEAD position was 250db97 First Split Random Seed 42
HEAD is now at b4e31b9 Second Split Random Seed 121


In [None]:
# Bring the DVC-tracked data files in line with the .dvc files of the commit
# just checked out; the output lists the three CSVs as modified.
!dvc checkout

M       Assignment 2\data\train.csv
M       Assignment 2\data\validation.csv
M       Assignment 2\data\test.csv


For 2nd Split (Random Seed: 121)

In [None]:
# Print the ham/spam counts of each split file currently checked out, using
# the path variables defined in the first-split report cell.
print("2nd Split (Random Seed: 121)")
for heading, csv_path in (
    ("\nTraining dataset:", train_path),
    ("\nValidation dataset:", val_path),
    ("\nTesting dataset:", test_path),
):
    print(heading)
    label_dist(csv_path)

2nd Split (Random Seed: 121)

Training dataset:
Ham: 3910, Spam: 604

Validation dataset:
Ham: 430, Spam: 72

Testing dataset:
Ham: 487, Spam: 71
