# N-Gram Text Classification

## Name: Srinitish Srinivasn
## Reg.No: 21BAI1394

In [1]:
import pandas as pd 
import numpy as np 
import nltk 
from nltk.tokenize import word_tokenize 

from nltk.corpus import stopwords,wordnet
from nltk.stem import SnowballStemmer,WordNetLemmatizer

nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")
nltk.download("wordnet")
nltk.download("stopwords")

from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report,f1_score,accuracy_score,confusion_matrix,roc_curve,auc,roc_auc_score



[nltk_data] Downloading package punkt to /Users/smudge/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/smudge/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /Users/smudge/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/smudge/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Using N-Gram Method

In [2]:
#Get the dataset
#Spam classification Dataset from UCI Repository

import os 
from dotenv import load_dotenv

load_dotenv('.env')
path=os.getenv("spam_classification")

dataset=pd.read_csv(path,encoding='ISO-8859-1')
dataset.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [3]:
#Remove punctuations
import re
import string

def preprocess(text):
    text=text.lower()
    text=text.strip()

    text=re.compile('<.&?').sub('',text)
    text=re.compile('[%s]'% re.escape(string.punctuation)).sub(' ', text)
    text = re.sub('\s+', ' ', text)  
    text = re.sub(r'\[[0-9]*\]',' ',text) 
    text=re.sub(r'[^\w\s]', '', str(text).lower().strip())
    text = re.sub(r'\d',' ',text) 
    text = re.sub(r'\s+',' ',text) 

    return text

def stopword(text):
    a=[i for i in text.split() if i not in stopwords.words('english')]

    return ' '.join(a)

def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    
wl=WordNetLemmatizer()

def lemmatizer(word):
    word_pos_tags=nltk.pos_tag(word_tokenize(word))

    a=[wl.lemmatize(tag[0], get_wordnet_pos(tag[1])) for idx, tag in enumerate(word_pos_tags)]
    return " ".join(a)



  text = re.sub('\s+', ' ', text)


In [4]:
#Preprocess the text

def total_preprocess(text):
    return lemmatizer(stopword(preprocess(text)))

dataset['v2']=dataset['v2'].apply(lambda x:total_preprocess(x))

dataset.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,go jurong point crazy available bugis n great ...,,,
1,ham,ok lar joking wif u oni,,,
2,spam,free entry wkly comp win fa cup final tkts st ...,,,
3,ham,u dun say early hor u c already say,,,
4,ham,nah think go usf life around though,,,


In [5]:
#Apply N-Grams
from sklearn.feature_extraction.text import CountVectorizer

#Map Target train and test to 0 and 1 from no spam and spam
mapping={
    'spam':1,
    'ham':0
}

dataset['v1']=dataset['v1'].map(mapping)

X_train,X_test,Y_train,Y_test=train_test_split(dataset['v2'],dataset['v1'],test_size=0.20,shuffle=True)

#Using Bi Grams
ngram_vector=CountVectorizer(ngram_range=(2,2)) #Bigrams
X_train_vectors=ngram_vector.fit_transform(X_train)
X_test_vectors=ngram_vector.transform(X_test)

dataset.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,0,go jurong point crazy available bugis n great ...,,,
1,0,ok lar joking wif u oni,,,
2,1,free entry wkly comp win fa cup final tkts st ...,,,
3,0,u dun say early hor u c already say,,,
4,0,nah think go usf life around though,,,


In [6]:
logistic_reg=LogisticRegression(solver='liblinear',C=10,penalty='l2')
logistic_reg.fit(X_train_vectors,Y_train)

In [7]:
y_pred=logistic_reg.predict(X_test_vectors)
y_prb=logistic_reg.predict_proba(X_test_vectors)[:,1]

print("Results with N-Grams")
print(classification_report(Y_test,y_pred))

Results with N-Grams
              precision    recall  f1-score   support

           0       0.95      1.00      0.98       981
           1       0.99      0.65      0.78       134

    accuracy                           0.96      1115
   macro avg       0.97      0.82      0.88      1115
weighted avg       0.96      0.96      0.95      1115



# Without Applying N-Grams

In [8]:

import os 
from dotenv import load_dotenv

load_dotenv('.env')
path=os.getenv("spam_classification")

dataset=pd.read_csv(path,encoding='ISO-8859-1')
dataset.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [9]:
#Remove punctuations
import re
import string

def preprocess(text):
    text=text.lower()
    text=text.strip()

    text=re.compile('<.&?').sub('',text)
    text=re.compile('[%s]'% re.escape(string.punctuation)).sub(' ', text)
    text = re.sub('\s+', ' ', text)  
    text = re.sub(r'\[[0-9]*\]',' ',text) 
    text=re.sub(r'[^\w\s]', '', str(text).lower().strip())
    text = re.sub(r'\d',' ',text) 
    text = re.sub(r'\s+',' ',text) 

    return text

def stopword(text):
    a=[i for i in text.split() if i not in stopwords.words('english')]

    return ' '.join(a)

def get_wordnet_pos(tag):
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN
    
wl=WordNetLemmatizer()

def lemmatizer(word):
    word_pos_tags=nltk.pos_tag(word_tokenize(word))

    a=[wl.lemmatize(tag[0], get_wordnet_pos(tag[1])) for idx, tag in enumerate(word_pos_tags)]
    return " ".join(a)


  text = re.sub('\s+', ' ', text)


In [10]:
#Preprocess the text

def total_preprocess(text):
    return lemmatizer(stopword(preprocess(text)))

dataset['v2']=dataset['v2'].apply(lambda x:total_preprocess(x))

dataset.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,go jurong point crazy available bugis n great ...,,,
1,ham,ok lar joking wif u oni,,,
2,spam,free entry wkly comp win fa cup final tkts st ...,,,
3,ham,u dun say early hor u c already say,,,
4,ham,nah think go usf life around though,,,


In [11]:
#No N-Grams Used
from sklearn.feature_extraction.text import CountVectorizer

#Map Target train and test to 0 and 1 from no spam and spam
mapping={
    'spam':1,
    'ham':0
}

dataset['v1']=dataset['v1'].map(mapping)

X_train,X_test,Y_train,Y_test=train_test_split(dataset['v2'],dataset['v1'],test_size=0.20,shuffle=True)

ngram_vector=CountVectorizer()
X_train_vectors=ngram_vector.fit_transform(X_train)
X_test_vectors=ngram_vector.transform(X_test)

dataset.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,0,go jurong point crazy available bugis n great ...,,,
1,0,ok lar joking wif u oni,,,
2,1,free entry wkly comp win fa cup final tkts st ...,,,
3,0,u dun say early hor u c already say,,,
4,0,nah think go usf life around though,,,


In [12]:
logistic_reg=LogisticRegression(solver='liblinear',C=10,penalty='l2')
logistic_reg.fit(X_train_vectors,Y_train)

In [13]:
y_pred=logistic_reg.predict(X_test_vectors)
y_prb=logistic_reg.predict_proba(X_test_vectors)[:,1]

print("Results without N-Grams")
print(classification_report(Y_test,y_pred))

Results without N-Grams
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       947
           1       0.99      0.92      0.95       168

    accuracy                           0.99      1115
   macro avg       0.99      0.96      0.97      1115
weighted avg       0.99      0.99      0.99      1115

