In [88]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [89]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import nltk
import csv
import re
import string

nltk.download('stopwords')
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer

nltk.download('omw-1.4')

! pip install wordcloud
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [90]:
raw_data = pd.read_csv('/content/drive/MyDrive/smsspamcollection/SMSSpamCollection',sep='\t',header=None)
raw_data.columns=['Label','Text']
raw_data.head()

Unnamed: 0,Label,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [91]:
raw_data.isna().sum()

Label    0
Text     0
dtype: int64

In [92]:
raw_data.groupby('Label').describe()

Unnamed: 0_level_0,Text,Text,Text,Text
Unnamed: 0_level_1,count,unique,top,freq
Label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4825,4516,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


In [93]:
raw_data.loc[raw_data.Label == 'spam', 'Label'] = 1
raw_data.loc[raw_data.Label == 'ham', 'Label'] = 0

In [94]:
raw_data.head()

Unnamed: 0,Label,Text
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


**Splitting data into Training, Validation and Test Set and saving them as csv**

In [95]:
X_train, X_test, y_train, y_test = train_test_split(raw_data.Text, raw_data['Label'], test_size=0.15, random_state=101)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.15, random_state=101)

In [96]:
## checking the data profile with seed 101
print("Train set - number of 0s:", sum(y_train == 0), "number of 1s:", sum(y_train == 1))
print("Validation set - number of 0s:", sum(y_val == 0), "number of 1s:", sum(y_val== 1))
print("Test set - number of 0s:", sum(y_test== 0), "number of 1s:", sum(y_test== 1))

Train set - number of 0s: 3479 number of 1s: 546
Validation set - number of 0s: 614 number of 1s: 97
Test set - number of 0s: 732 number of 1s: 104


In [97]:
df_train = pd.DataFrame(list(zip(X_train,y_train)),columns=["Text",'Label'])
df_test = pd.DataFrame(list(zip(X_test,y_test)),columns=["Text",'Label'])
df_val = pd.DataFrame(list(zip(X_val,y_val)),columns=["Text",'Label'])
df_val.to_csv('data/Validation_Data.csv',index=False) 
df_test.to_csv('data/Test_Data.csv',index=False) 
df_train.to_csv('data/Training_Data.csv',index=False)

In [98]:
raw_data.to_csv('/content/drive/MyDrive/Raw_Data.csv',index=False)

**Working with Git and DVC to track data**

In [99]:
! pip install dvc

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [100]:
! git init

Reinitialized existing Git repository in /content/.git/


In [102]:
#! dvc init

[31mERROR[39m: failed to initiate DVC - '.dvc' exists. Use `-f` to force.
[0m

In [103]:
!dvc add data

!If DVC froze, see `hardlink_lock` in <[36mhttps://man.dvc.org/config#core[39m>                                                                      !Collecting targets          |0.00 [00:00,     ?file/s]                                                      [?25l[32m⠋[0m Checking graph[2K[32m⠋[0m Checking graph
[?25h[1A[2KAdding...:   0% 0/1 [00:00<?, ?file/s]Adding...:   0% 0/1 [00:00<?, ?file/s{'info': ''}]Adding...:   0% 0/1 [00:00<?, ?file/s{'info': ''}]
![A
Building data objects from data          |0.00 [00:00,      ?obj/s][A
                                                                   [A
![A
Building data objects from data          |0.00 [00:00,      ?obj/s][A
                                                                   [A
![A
  0% |          |0/? [00:00<?,    ?files/s][A
                                           [A
![A
Building data objects from data          |0.00 [00:00,      ?obj/s][A
                                

In [104]:
! git add data.dvc

In [106]:
 ! git config --global user.email "ayush.srivastava.mnnit@gmail.com"

In [108]:
! git config --global user.name "Ayush Srivastava"

In [109]:
! git commit -m "Seed Changes to 105"

[master (root-commit) 0add420] Seed Changes to 105
 4 files changed, 11 insertions(+)
 create mode 100644 .dvc/.gitignore
 create mode 100644 .dvc/config
 create mode 100644 .dvcignore
 create mode 100644 data.dvc


In [70]:
#!dvc config core.autostage true

[0m

**Adding google drive folder as a remote data storage**

In [72]:
#!dvc remote add --default myremote gdrive://1MKDgCZxNyYrFNLhGYNIfOK8mQpD1rpmr

Setting 'myremote' as a default remote.
[0m

In [73]:
#!dvc remote modify myremote gdrive_acknowledge_abuse true

[0m

Pushing dvc tracked files to remote storage


In [74]:
#!dvc push

!If DVC froze, see `hardlink_lock` in <[36mhttps://man.dvc.org/config#core[39m>                                                                      [31mERROR[39m: unexpected error - gdrive is supported, but requires 'dvc-gdrive' to be installed: No module named 'dvc_gdrive'

[33mHaving any troubles?[0m Hit us up at [34mhttps://dvc.org/support[0m, we are always happy to help!
[0m

In [75]:
!pip install dvc-gdrive

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Collecting dvc-gdrive
  Downloading dvc_gdrive-2.19.1-py3-none-any.whl (11 kB)
Collecting pydrive2[fsspec]>=1.15.0
  Downloading PyDrive2-1.15.1-py3-none-any.whl (44 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.6/44.6 KB[0m [31m5.0 MB/s[0m eta [36m0:00:00[0m
Collecting pyOpenSSL>=19.1.0
  Downloading pyOpenSSL-23.0.0-py3-none-any.whl (57 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m57.3/57.3 KB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pyOpenSSL, pydrive2, dvc-gdrive
Successfully installed dvc-gdrive-2.19.1 pyOpenSSL-23.0.0 pydrive2-1.15.1


In [76]:
!dvc push

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?client_id=710796635688-iivsgbgsb6uv1fap6635dhvuei09o66c.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8090%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.appdata&access_type=offline&response_type=code&approval_prompt=force

[31mERROR[39m: interrupted by the user
Traceback (most recent call last):
  File "/usr/local/bin/dvc", line 8, in <module>
    sys.exit(main())
  File "/usr/local/lib/python3.8/dist-packages/dvc/cli/__init__.py", line 242, in main
    analytics.collect_and_send_report(args, ret)
  File "/usr/local/lib/python3.8/dist-packages/dvc/analytics.py", line 37, in collect_and_send_report
    daemon(["analytics", fobj.name])
  File "/usr/local/lib/python3.8/dist-packages/dvc/daemon.py", line 106, in daemon
    daemonize(["daemon", "-q", *args])
  File "/usr/local/lib/python3.8/dist-packages/dvc/daemon.py", line 11

In [78]:
import dvc_gdrive

In [83]:
!dvc push

Your browser has been opened to visit:

    https://accounts.google.com/o/oauth2/auth?client_id=710796635688-iivsgbgsb6uv1fap6635dhvuei09o66c.apps.googleusercontent.com&redirect_uri=http%3A%2F%2Flocalhost%3A8090%2F&scope=https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive+https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.appdata&access_type=offline&response_type=code&approval_prompt=force

[31mERROR[39m: interrupted by the user
[0m

In [110]:
!dvc status

!If DVC froze, see `hardlink_lock` in <[36mhttps://man.dvc.org/config#core[39m>                                                                      !Building data objects from data          |0.00 [00:00,      ?obj/s]                                                                   Data and pipelines are up to date.
[0m

In [111]:
!git log

[33mcommit 0add420a54876fcf081b884c4cb7378577c7f0eb[m[33m ([m[1;36mHEAD -> [m[1;32mmaster[m[33m)[m
Author: Ayush Srivastava <ayush.srivastava.mnnit@gmail.com>
Date:   Mon Feb 27 08:53:11 2023 +0000

    Seed Changes to 105


In [112]:
! git log --oneline

[33m0add420[m[33m ([m[1;36mHEAD -> [m[1;32mmaster[m[33m)[m Seed Changes to 105


In [113]:
X_train, X_test, y_train, y_test = train_test_split(raw_data.Text, raw_data['Label'], test_size=0.15, random_state=105)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.15, random_state=105)

In [114]:
df_train = pd.DataFrame(list(zip(X_train,y_train)),columns=["Text",'Label'])
df_test = pd.DataFrame(list(zip(X_test,y_test)),columns=["Text",'Label'])
df_val = pd.DataFrame(list(zip(X_val,y_val)),columns=["Text",'Label'])
df_val.to_csv('data/Validation_Data.csv',index=False) 
df_test.to_csv('data/Test_Data.csv',index=False) 
df_train.to_csv('data/Training_Data.csv',index=False)

In [115]:
train = pd.read_csv('data/Training_Data.csv')
val = pd.read_csv('data/Validation_Data.csv')
test = pd.read_csv('data/Test_Data.csv')

In [116]:
## checking the data profile before update
print("Train set - number of 0s:", sum(train['Label'] == 0), "number of 1s:", sum(train['Label'] == 1))
print("Validation set - number of 0s:", sum(val['Label'] == 0), "number of 1s:", sum(val['Label'] == 1))
print("Test set - number of 0s:", sum(test['Label'] == 0), "number of 1s:", sum(test['Label'] == 1))

Train set - number of 0s: 3482 number of 1s: 543
Validation set - number of 0s: 607 number of 1s: 104
Test set - number of 0s: 736 number of 1s: 100


In [117]:
! dvc add data

!If DVC froze, see `hardlink_lock` in <[36mhttps://man.dvc.org/config#core[39m>                                                                      !Collecting targets          |0.00 [00:00,     ?file/s]                                                      [?25l[32m⠋[0m Checking graph[2K[32m⠋[0m Checking graph
[?25h[1A[2KAdding...:   0% 0/1 [00:00<?, ?file/s]Adding...:   0% 0/1 [00:00<?, ?file/s{'info': ''}]Adding...:   0% 0/1 [00:00<?, ?file/s{'info': ''}]
![A
Building data objects from data          |0.00 [00:00,      ?obj/s][A
                                                                   [A
![A
Building data objects from data          |0.00 [00:00,      ?obj/s][A
                                                                   [A
![A
  0% |          |0/? [00:00<?,    ?files/s][A
                                           [A
![A
  0%|          |Transferring                          0/? [00:00<?,     ?file/s][A
Transferring:   0% 

In [119]:
! git add data.dvc

In [120]:
! git commit -m "changed to 105 random state"

[master 673efb2] changed to 105 random state
 1 file changed, 1 insertion(+), 1 deletion(-)


In [122]:
! git checkout HEAD^1 data.dvc      
! dvc checkout 

Updated 1 path from 2fa9119
Checkout:   0% 0/3 [00:00<?, ?file/s{'info': ''}]  
![A
Building data objects from data          |0.00 [00:00,      ?obj/s][A
Checkout:   0% 0/1 [00:00<?, ?file/s{'info': ''}]
![A
  0%|          |/content/.dvc/cache/4e/fb18e77773850.00/? [00:00<?,        ?B/s][A
  0% 0.00/60.6k [00:00<?, ?B/s{'info': ''}]                                     [A
Checkout: 100% 1/1 [00:00<00:00, 83.14file/s{'info': ''}]
![A
  0%|          |/content/.dvc/cache/cb/04bcdf531d3e0.00/? [00:00<?,        ?B/s][A
  0% 0.00/332k [00:00<?, ?B/s{'info': ''}]                                      [A
  0% Checkout|          |2/? [00:00<00:00, 143.16file/s] 
![A
  0%|          |/content/.dvc/cache/41/b0261ce9302a0.00/? [00:00<?,        ?B/s][A
  0% 0.00/65.2k [00:00<?, ?B/s{'info': ''}]                                     [A
[33mM[0m       data/
[0m

In [85]:
!dvc checkout

!If DVC froze, see `hardlink_lock` in <[36mhttps://man.dvc.org/config#core[39m>                                                                      !  0% Checkout|          |0/? [00:00<?,     ?file/s]Checkout:   0% 0/3 [00:00<?, ?file/s{'info': ''}]  
![A
Building data objects from data          |0.00 [00:00,      ?obj/s][A
                                                                   [A                                                 [0m

In [123]:
train = pd.read_csv('data/Training_Data.csv')
val = pd.read_csv('data/Validation_Data.csv')
test = pd.read_csv('data/Test_Data.csv')

In [124]:
print("Train set - number of 0s:", sum(train['Label'] == 0), "number of 1s:", sum(train['Label'] == 1))
print("Validation set - number of 0s:", sum(val['Label'] == 0), "number of 1s:", sum(val['Label'] == 1))
print("Test set - number of 0s:", sum(test['Label'] == 0), "number of 1s:", sum(test['Label'] == 1))

Train set - number of 0s: 3479 number of 1s: 546
Validation set - number of 0s: 614 number of 1s: 97
Test set - number of 0s: 732 number of 1s: 104


In [125]:
! git checkout HEAD data.dvc      
! dvc checkout 

Updated 1 path from a01df26
Checkout:   0% 0/3 [00:00<?, ?file/s{'info': ''}]  
![A
Building data objects from data          |0.00 [00:00,      ?obj/s][A
Checkout:   0% 0/1 [00:00<?, ?file/s{'info': ''}]
![A
  0%|          |/content/.dvc/cache/e7/a0749ec82b2b0.00/? [00:00<?,        ?B/s][A
  0% 0.00/58.8k [00:00<?, ?B/s{'info': ''}]                                     [A
Checkout: 100% 1/1 [00:00<00:00, 75.58file/s{'info': ''}]
![A
  0%|          |/content/.dvc/cache/48/28a74e10d58e0.00/? [00:00<?,        ?B/s][A
  0% 0.00/333k [00:00<?, ?B/s{'info': ''}]                                      [A
  0% Checkout|          |2/? [00:00<00:00, 132.04file/s] 
![A
  0%|          |/content/.dvc/cache/00/5d263fd061a90.00/? [00:00<?,        ?B/s][A
  0% 0.00/66.2k [00:00<?, ?B/s{'info': ''}]                                     [A
[33mM[0m       data/
[0m

In [126]:
train = pd.read_csv('data/Training_Data.csv')
val = pd.read_csv('data/Validation_Data.csv')
test = pd.read_csv('data/Test_Data.csv')

In [127]:
print("Train set - number of 0s:", sum(train['Label'] == 0), "number of 1s:", sum(train['Label'] == 1))
print("Validation set - number of 0s:", sum(val['Label'] == 0), "number of 1s:", sum(val['Label'] == 1))
print("Test set - number of 0s:", sum(test['Label'] == 0), "number of 1s:", sum(test['Label'] == 1))

Train set - number of 0s: 3482 number of 1s: 543
Validation set - number of 0s: 607 number of 1s: 104
Test set - number of 0s: 736 number of 1s: 100
