<a href="https://colab.research.google.com/github/Aeagon07/Natural-Language-Processing-/blob/main/Text_Classification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd

url = "https://raw.githubusercontent.com/justmarkham/pycon-2016-tutorial/master/data/sms.tsv"

df = pd.read_csv(url, sep='\t', names=['label', 'message'])

df.head()

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


## **Text Preproccessing**

In [2]:
df['message'][1]

'Ok lar... Joking wif u oni...'

In [4]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
ham,4825
spam,747


In [5]:
df.isnull().sum()

Unnamed: 0,0
label,0
message,0


In [6]:
df.duplicated().sum()

np.int64(403)

In [7]:
df.drop_duplicates(inplace=True)

In [8]:
df.duplicated().sum()

np.int64(0)

In [9]:
import re
def remove_tag(raw_text):
  clean_text = re.sub(re.compile('<.*?>'), ' ', raw_text)
  return clean_text

In [10]:
df['message'] = df['message'].apply(remove_tag)

In [11]:
df['message'] = df['message'].apply(lambda x: x.lower())

In [12]:
from nltk.corpus import stopwords
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [13]:
from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

def remove_stopwords(text):
  return ' '.join([word for word in text.split() if word not in stop_words])

In [14]:
df['message'] = df['message'].apply(remove_stopwords)
display(df.head())

Unnamed: 0,label,message
0,ham,"go jurong point, crazy.. available bugis n gre..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry 2 wkly comp win fa cup final tkts 2...
3,ham,u dun say early hor... u c already say...
4,ham,"nah think goes usf, lives around though"


In [33]:
X = df['message']
y = df['label']

In [34]:
X

Unnamed: 0,message
0,"go jurong point, crazy.. available bugis n gre..."
1,ok lar... joking wif u oni...
2,free entry 2 wkly comp win fa cup final tkts 2...
3,u dun say early hor... u c already say...
4,"nah think goes usf, lives around though"
...,...
5567,2nd time tried 2 contact u. u £750 pound prize...
5568,ü b going esplanade fr home?
5569,"pity, * mood that. so...any suggestions?"
5570,guy bitching acted like interested buying some...


In [17]:
y

Unnamed: 0,label
0,ham
1,ham
2,spam
3,ham
4,ham
...,...
5567,spam
5568,ham
5569,ham
5570,ham


In [39]:
# convert y into numbers or binary
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
y = le.fit_transform(y)

In [40]:
y

array([0, 0, 1, ..., 0, 0, 0])

In [41]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [42]:
X_train.shape

(4135,)

## Using BoW

In [43]:
# Applying BoW
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()

In [46]:
X_train_BoW = cv.fit_transform(X_train).toarray()
X_test_BoW = cv.transform(X_test).toarray()

In [47]:
X_train_BoW.shape

(4135, 7559)

In [48]:
# Uses the Guasian Modeling
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

gnb.fit(X_train_BoW, y_train)

In [49]:
y_pred = gnb.predict(X_test_BoW)

from sklearn.metrics import accuracy_score,confusion_matrix
print(accuracy_score(y_test, y_pred))

0.9119922630560928


In [50]:
print(confusion_matrix(y_test, y_pred))

[[819  75]
 [ 16 124]]


In [51]:
# Another technique is Random Forest
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier()

rf.fit(X_train_BoW, y_train) # apply
y_pred = rf.predict(X_test_BoW) # predict

print(accuracy_score(y_test, y_pred))

0.9748549323017408


## Using N-Grams with Randome Forest

In [52]:
# You can use the N-Grams also but if you have a lots of data contains lots of words then convert them into bi or tri grams may exceed the meomery limit
# so instead of running all you run on the max_features of the data
cv = CountVectorizer(ngram_range=(1,2))

X_train_bow = cv.fit_transform(X_train).toarray()
X_test_bow = cv.transform(X_test).toarray()

rf = RandomForestClassifier()

rf.fit(X_train_bow,y_train)
y_pred = rf.predict(X_test_bow)
accuracy_score(y_test,y_pred)

0.9671179883945842

## Using Tf-Idf

In [53]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()

In [54]:
X_train_tfidf = tfidf.fit_transform(X_train).toarray()
X_test_tfidf = tfidf.transform(X_test).toarray()

In [55]:
rf = RandomForestClassifier()

rf.fit(X_train_tfidf,y_train)
y_pred = rf.predict(X_test_tfidf)
accuracy_score(y_test,y_pred)

0.97678916827853

## Using Word2Vec

In [57]:
!pip install gensim
import gensim
from gensim.models import Word2Vec,KeyedVectors

Collecting gensim
  Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl.metadata (8.4 kB)
Downloading gensim-4.4.0-cp312-cp312-manylinux_2_24_x86_64.manylinux_2_28_x86_64.whl (27.9 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m27.9/27.9 MB[0m [31m28.4 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: gensim
Successfully installed gensim-4.4.0


In [58]:
# Import spacy and load the language model
import spacy

# It's good practice to ensure the model is downloaded
# If it's already downloaded, this command will simply confirm it.
!python -m spacy download en_core_web_sm

# Load the English language model
nlp = spacy.load('en_core_web_sm')

Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m97.9 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [59]:
import numpy as np

# Get vectors for X_train
input_arr = []
for text in X_train.values:
    doc = nlp(text)
    input_arr.append(doc.vector)

# Convert to numpy array
X_train_vectors = np.array(input_arr)

print(f"Shape of X_train_vectors: {X_train_vectors.shape}")

Shape of X_train_vectors: (4135, 96)


In [60]:
# Get vectors for X_test
input_test_arr = []
for text in X_test.values:
    doc = nlp(text)
    input_test_arr.append(doc.vector)

# Convert to numpy array
X_test_vectors = np.array(input_test_arr)

print(f"Shape of X_test_vectors: {X_test_vectors.shape}")

Shape of X_test_vectors: (1034, 96)


### Explanation of the Word2Vec process with SpaCy:

1.  **SpaCy Model Loading**: We first ensure the `en_core_web_sm` model is downloaded and then load it. This model provides pre-trained word vectors.
2.  **Text Processing and Vectorization**: For each text in your `X_train` and `X_test` datasets, SpaCy's `nlp()` function processes the text. The `.vector` attribute of the resulting `doc` object gives you the averaged vector representation for the entire text (sentence vector).
3.  **Numpy Array Conversion**: These vectors are then collected into lists and converted into NumPy arrays, which are suitable for training machine learning models.

In [66]:
# Using Gaussian Naive Bayes

from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()

In [62]:
gnb.fit(X_train_vectors, y_train)

In [64]:
y_pred = gnb.predict(X_test_vectors)
accuracy_score(y_test, y_pred)

0.8317214700193424

In [67]:
# Using Random Forest

from sklearn.ensemble import RandomForestClassifier
rf_word2vec = RandomForestClassifier()

rf_word2vec.fit(X_train_vectors, y_train)
y_pred = rf_word2vec.predict(X_test_vectors)

print(f"Accuracy with Word2Vec using RandomForest: {accuracy_score(y_test, y_pred)}")
print(f"Confusion Matrix:\n{confusion_matrix(y_test, y_pred)}")

Accuracy with Word2Vec using RandomForest: 0.9410058027079303
Confusion Matrix:
[[890   4]
 [ 57  83]]
