## Importing Libraries

In [1]:
# importing required modules
import os
from io import BytesIO
from urllib.request import urlopen
from zipfile import ZipFile

In [None]:
# !pip install wordcloud

In [2]:
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import nltk
import csv
import re
import string


nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from wordcloud import WordCloud

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\shrey\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\shrey\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\shrey\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\shrey\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Reading data

In [3]:
# Direct-download URL (note `?raw=true`) of the zipped dataset, smsspamcollection.zip, hosted on GitHub.
git = "https://github.com/Deadshot-07/Applied-Machine-Learning/blob/T/Assignment_1/smsspamcollection.zip?raw=true"

In [4]:
def download_and_unzip(url, extract_to='.'):
    """Download a zip archive and extract all of its members.

    Parameters
    ----------
    url : str
        Location of the zip archive; anything ``urlopen`` accepts.
    extract_to : str, default '.'
        Directory that receives the extracted files.

    Returns
    -------
    None

    Notes
    -----
    The complete archive is read into memory before it is extracted.
    """
    # Context managers guarantee the HTTP response and the archive handle are
    # closed even when the download or the extraction raises.
    with urlopen(url) as http_response:
        payload = BytesIO(http_response.read())
    with ZipFile(payload) as archive:
        archive.extractall(path=extract_to)

In [5]:
download_and_unzip(git)

In [6]:
%ls

 Volume in drive C is OS
 Volume Serial Number is 8CB0-500E

 Directory of c:\Users\shrey\OneDrive\Documents\Semester-4\AML\Applied-Machine-Learning\Assignment_2

27-02-2023  20:29    <DIR>          .
27-02-2023  20:28    <DIR>          ..
27-02-2023  20:29    <DIR>          Data
27-02-2023  20:07            51,946 prepare.ipynb
27-02-2023  20:28                14 Read
27-02-2023  20:29             5,868 readme
27-02-2023  20:29           477,907 SMSSpamCollection
27-02-2023  20:28           203,415 smsspamcollection.zip
               5 File(s)        739,150 bytes
               3 Dir(s)  36,424,224,768 bytes free


In [7]:
# Load the extracted file as a two-column frame. Fields are tab-separated and
# the column names are supplied here; QUOTE_NONE tells the parser not to give
# quote characters any special meaning, so quotes inside a message stay as text.
messages = pd.read_csv('./SMSSpamCollection', sep='\t', quoting=csv.QUOTE_NONE, names=["Label", "Message"])

In [8]:
messages.head()

Unnamed: 0,Label,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [9]:
messages.isna().sum()

Label      0
Message    0
dtype: int64

In [10]:
messages.groupby('Label').describe()

Unnamed: 0_level_0,Message,Message,Message,Message
Unnamed: 0_level_1,count,unique,top,freq
Label,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
ham,4827,4518,"Sorry, I'll call later",30
spam,747,653,Please call our customer service representativ...,4


Convert the labels to integers: ham becomes 0 and spam becomes 1.  
Note that this mapping could also be applied manually.

In [None]:
# Replace the string labels with integer codes, overwriting the 'Label' column in place.
encode = LabelEncoder()
messages['Label'] = encode.fit_transform(messages['Label'])
# Class balance after encoding; the counts match ham -> 0 and spam -> 1.
messages['Label'].value_counts()

0    4827
1     747
Name: Label, dtype: int64

In [12]:
messages.head()

Unnamed: 0,Label,Message
0,0,"Go until jurong point, crazy.. Available only ..."
1,0,Ok lar... Joking wif u oni...
2,1,Free entry in 2 a wkly comp to win FA Cup fina...
3,0,U dun say so early hor... U c already then say...
4,0,"Nah I don't think he goes to usf, he lives aro..."


## Preprocessing data

In [13]:
# Import the corpus under an alias: the `stopwords` function defined further
# down in this cell rebinds the name `stopwords`, so a plain `stopwords.words`
# lookup here fails with AttributeError whenever this cell is run a second time.
from nltk.corpus import stopwords as nltk_stopwords

# Set for O(1) membership tests while filtering tokens.
STOPWORDS = set(nltk_stopwords.words('english'))

def remove_punct(text):
    """Delete every character that is neither a word character nor whitespace.

    Word characters follow the regex ``\\w`` class, so letters, digits and
    underscores are kept.
    """
    return re.sub(r'[^\w\s]', r'', text)

def to_lower(text):
    """Return ``text`` with all cased characters converted to lower case."""
    lowered = text.lower()
    return lowered

def stopwords(text):
    """Remove stop words from ``text``.

    The input is coerced with ``str`` and split on whitespace; every token
    contained in the module-level ``STOPWORDS`` set is discarded.

    Returns the surviving tokens joined by single spaces, or ``np.nan`` when
    no token survives.
    """
    kept = [token for token in str(text).split() if token not in STOPWORDS]
    if not kept:
        return np.nan
    return " ".join(kept)

def word_tokenizer(text):
    """Tokenize ``text`` by delegating to ``word_tokenize`` and return the result."""
    tokens = word_tokenize(text)
    return tokens

def lemmatization(text):
    """Lemmatize each token of ``text`` and return the lemmas joined by spaces.

    Tokens come from ``word_tokenize``; each one is passed individually to a
    freshly created ``WordNetLemmatizer``.
    """
    lemmatizer = WordNetLemmatizer()
    lemmas = []
    for token in word_tokenize(text):
        lemmas.append(lemmatizer.lemmatize(token))
    return ' '.join(lemmas)

In [14]:
corpus = messages['Message'].apply(to_lower)

In [15]:
corpus = corpus.map(remove_punct)

In [16]:
corpus = corpus.map(stopwords)

In [17]:
# `stopwords` returns NaN for messages left with no tokens; turn those back
# into empty strings so that every entry passed to `lemmatization` is a str.
corpus = corpus.fillna('')

In [18]:
messages['Text'] = corpus.map(lemmatization)

In [19]:
messages.head()

Unnamed: 0,Label,Message,Text
0,0,"Go until jurong point, crazy.. Available only ...",go jurong point crazy available bugis n great ...
1,0,Ok lar... Joking wif u oni...,ok lar joking wif u oni
2,1,Free entry in 2 a wkly comp to win FA Cup fina...,free entry 2 wkly comp win fa cup final tkts 2...
3,0,U dun say so early hor... U c already then say...,u dun say early hor u c already say
4,0,"Nah I don't think he goes to usf, he lives aro...",nah dont think go usf life around though


## Splitting data into Training, Validation and Test Set and saving them as csv

In [20]:
# Two-stage split with a fixed seed: first hold out 15% of all messages as the
# test set, then hold out 15% of the remaining 85% as the validation set
# (roughly 72.25% train / 12.75% validation / 15% test overall).
# NOTE(review): no `stratify=` is passed, so the ham/spam ratio is not
# guaranteed to be the same in every split.
X_train, X_test, y_train, y_test = train_test_split(messages.Text, messages['Label'], test_size=0.15, random_state=11)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.15, random_state=11)

In [21]:
## checking the data profile with seed 11
# Count ham (0) and spam (1) labels in each of the three splits.
for header, labels in (
    ("Train set - number of 0s:", y_train),
    ("Validation set - number of 0s:", y_val),
    ("Test set - number of 0s:", y_test),
):
    print(header, sum(labels == 0), "number of 1s:", sum(labels == 1))

Train set - number of 0s: 3483 number of 1s: 543
Validation set - number of 0s: 622 number of 1s: 89
Test set - number of 0s: 722 number of 1s: 115


In [22]:
# Make sure the output folder exists first: on a fresh checkout there is no
# ./Data directory yet and `to_csv` would fail instead of creating it.
os.makedirs('./Data', exist_ok=True)

# Pair each text with its label positionally; the original row index is dropped.
df_train = pd.DataFrame(list(zip(X_train, y_train)), columns=["Text", 'Label'])
df_test = pd.DataFrame(list(zip(X_test, y_test)), columns=["Text", 'Label'])
df_val = pd.DataFrame(list(zip(X_val, y_val)), columns=["Text", 'Label'])

# One CSV per split, written without the index column.
df_val.to_csv('./Data/Validation Data.csv', index=False)
df_test.to_csv('./Data/Test Data.csv', index=False)
df_train.to_csv('./Data/Training Data.csv', index=False)

## Working with Git and DVC to track data

In [29]:
# ! pip install dvc
# ! git init
! dvc init


[notice] A new release of pip available: 22.1.2 -> 23.0.1
[notice] To update, run: python.exe -m pip install --upgrade pip
Initialized empty Git repository in C:/Users/shrey/OneDrive/Documents/Semester-4/AML/Applied-Machine-Learning/Assignment_2/.git/
Initialized DVC repository.

You can now commit the changes to git.

+---------------------------------------------------------------------+
|                                                                     |
|        DVC has enabled anonymous aggregate usage analytics.         |
|     Read the analytics documentation (and how to opt-out) here:     |
|             <https://dvc.org/doc/user-guide/analytics>              |
|                                                                     |
+---------------------------------------------------------------------+

What's next?
------------
- Check out the documentation: <https://dvc.org/doc>
- Get help and share ideas: <https://dvc.org/chat>
- Star us on GitHub: <https://github.com/it

In [30]:
! dvc add Data
! git add Data.dvc
! git commit -m "Tracking Data"


To track the changes with git, run:

	git add Data.dvc .gitignore

To enable auto staging, run:

	dvc config core.autostage true
[master (root-commit) 82dfe80] Tracking Data
 4 files changed, 11 insertions(+)
 create mode 100644 .dvc/.gitignore
 create mode 100644 .dvc/config
 create mode 100644 .dvcignore
 create mode 100644 Data.dvc


In [48]:
! dvc remote add -d -f myremote gdrive://1LdQMltsMk95fJoO5RnM821C7EO5Wvj7k

Setting 'myremote' as a default remote.


In [49]:
! dvc push

4 files pushed


### Splitting the data into three parts again

In [50]:
# Same two-stage 15% / 15% split as before, but with seed 100, so the rows
# land in different splits and the tracked CSVs change.
X_train, X_test, y_train, y_test = train_test_split(messages.Text, messages['Label'], test_size=0.15, random_state=100)
X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.15, random_state=100)

In [51]:
## checking the data profile with seed 100
# Count ham (0) and spam (1) labels in each of the three splits.
for header, labels in (
    ("Train set - number of 0s:", y_train),
    ("Validation set - number of 0s:", y_val),
    ("Test set - number of 0s:", y_test),
):
    print(header, sum(labels == 0), "number of 1s:", sum(labels == 1))

Train set - number of 0s: 3470 number of 1s: 556
Validation set - number of 0s: 617 number of 1s: 94
Test set - number of 0s: 740 number of 1s: 97


In [52]:
# Rebuild the three splits as two-column frames (text paired with label positionally).
df_train, df_test, df_val = (
    pd.DataFrame(list(zip(texts, labels)), columns=["Text", 'Label'])
    for texts, labels in ((X_train, y_train), (X_test, y_test), (X_val, y_val))
)

# Overwrite the CSVs in the Data folder; write order is validation, test, train.
for frame, path in (
    (df_val, 'Data/Validation Data.csv'),
    (df_test, 'Data/Test Data.csv'),
    (df_train, 'Data/Training Data.csv'),
):
    frame.to_csv(path, index=False)

In [53]:
# Record the regenerated Data folder as a second version:
# refresh Data.dvc, commit it to Git, then upload the data to the DVC remote.
! dvc add Data
! git add Data.dvc
! git commit -m "Seed Changes"
! dvc push


To track the changes with git, run:

	git add Data.dvc

To enable auto staging, run:

	dvc config core.autostage true
[master 7328f9a] Seed Changes
 1 file changed, 1 insertion(+), 1 deletion(-)
4 files pushed


In [54]:
! git log --oneline

7328f9a Seed Changes
82dfe80 Tracking Data


Getting the first version, i.e., seed 11

In [None]:
! git checkout HEAD^1 Data.dvc      
! dvc checkout 

In [None]:
train = pd.read_csv('./Data/Training Data.csv')
val = pd.read_csv('./Data/Validation Data.csv')
test = pd.read_csv('./Data/Test Data.csv')

In [None]:
## checking the data profile before update
# Count ham (0) and spam (1) labels in each CSV that was just loaded.
for header, frame in (
    ("Train set - number of 0s:", train),
    ("Validation set - number of 0s:", val),
    ("Test set - number of 0s:", test),
):
    labels = frame['Label']
    print(header, sum(labels == 0), "number of 1s:", sum(labels == 1))

Train set - number of 0s: 3483 number of 1s: 543
Validation set - number of 0s: 622 number of 1s: 89
Test set - number of 0s: 722 number of 1s: 115



Getting the second version, i.e., seed 100

In [None]:
! git checkout HEAD Data.dvc      
! dvc checkout 

Updated 0 paths from acb22c4


In [None]:
train = pd.read_csv('Data/Training Data.csv')
val = pd.read_csv('Data/Validation Data.csv')
test = pd.read_csv('Data/Test Data.csv')

In [None]:
## checking the data profile after update
# Count ham (0) and spam (1) labels in each CSV that was just loaded.
for header, frame in (
    ("Train set - number of 0s:", train),
    ("Validation set - number of 0s:", val),
    ("Test set - number of 0s:", test),
):
    labels = frame['Label']
    print(header, sum(labels == 0), "number of 1s:", sum(labels == 1))

Train set - number of 0s: 3470 number of 1s: 556
Validation set - number of 0s: 617 number of 1s: 94
Test set - number of 0s: 740 number of 1s: 97
