<a href="https://colab.research.google.com/github/Abdelrahman-Wael-1029/spam-email/blob/main/nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# install nltk

In [167]:
!pip install nltk




In [168]:
import nltk
import pandas as pd
import numpy as np

In [169]:
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [234]:
df = pd.read_csv('email.csv')
df

Unnamed: 0,Category,Message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...
5571,ham,Rofl. Its true to its name


# preprocessing data

In [235]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5573 entries, 0 to 5572
Data columns (total 2 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   Category  5573 non-null   object
 1   Message   5573 non-null   object
dtypes: object(2)
memory usage: 87.2+ KB


In [236]:
df.describe()

Unnamed: 0,Category,Message
count,5573,5573
unique,3,5158
top,ham,"Sorry, I'll call later"
freq,4825,30


In [237]:
df.Category.value_counts()

Category
ham               4825
spam               747
{"mode":"full"       1
Name: count, dtype: int64

In [238]:
df[df['Category'] == '{"mode":"full"'].Message

5572    isActive:false}
Name: Message, dtype: object

In [239]:
df = df.drop(df[df['Category'] == '{"mode":"full"'].index)


## remove  punctuation

In [179]:
import string
def remove_punctuation(text):
  for punctuation in string.punctuation:
    text = text.replace(punctuation, '')
  return text


## tokenize

In [180]:
def tokenize(text):
  tokens = nltk.word_tokenize(text)
  return tokens


## remove stop words

In [181]:
def remove_stopwords(text):
  stopwords = nltk.corpus.stopwords.words('english')
  text = [word for word in text if word not in stopwords]
  return text



## stemming

In [182]:
# prompt: make stemming

from nltk.stem import PorterStemmer

def stemming(text):
  porter_stemmer = PorterStemmer()
  text = [porter_stemmer.stem(word) for word in text]
  return text


## lemmatize

In [183]:

def lemmatize(text):
  lemmatizer = nltk.stem.WordNetLemmatizer()
  text = [lemmatizer.lemmatize(word) for word in text]
  return text



## Create function to apply all preprocessing functions

*   List item
*   List item




In [240]:
def clean_text(text):
  text = remove_punctuation(text.lower())
  text = tokenize(text)
  text = remove_stopwords(text)
  text = stemming(text)
  return ' '.join(text)

df['Clean_Message'] = df['Message'].apply(lambda x : clean_text(x))
df


Unnamed: 0,Category,Message,Clean_Message
0,ham,"Go until jurong point, crazy.. Available only ...",go jurong point crazi avail bugi n great world...
1,ham,Ok lar... Joking wif u oni...,ok lar joke wif u oni
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,free entri 2 wkli comp win fa cup final tkt 21...
3,ham,U dun say so early hor... U c already then say...,u dun say earli hor u c alreadi say
4,ham,"Nah I don't think he goes to usf, he lives aro...",nah dont think goe usf live around though
...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,2nd time tri 2 contact u u £750 pound prize 2 ...
5568,ham,Will ü b going to esplanade fr home?,ü b go esplanad fr home
5569,ham,"Pity, * was in mood for that. So...any other s...",piti mood soani suggest
5570,ham,The guy did some bitching but I acted like i'd...,guy bitch act like id interest buy someth els ...


# feature extraction

In [242]:
df['body_length'] = df['Message'].apply(lambda x: len(x))
df['punc%'] = df['Message'].apply(lambda x: sum(c in string.punctuation for c in x) / len(x) * 100)
df = df.drop('Message', axis = 1)
df.head()


Unnamed: 0,Category,Clean_Message,body_length,punc%
0,ham,go jurong point crazi avail bugi n great world...,111,8.108108
1,ham,ok lar joke wif u oni,29,20.689655
2,spam,free entri 2 wkli comp win fa cup final tkt 21...,155,3.870968
3,ham,u dun say earli hor u c alreadi say,49,12.244898
4,ham,nah dont think goe usf live around though,61,3.278689


## vectorizing

In [243]:
from sklearn.feature_extraction.text import CountVectorizer
sentences =["i am make project i am", "did not like", "i like it"]
def count_vectorize(sentences):
  vectorizer = CountVectorizer()
  features_cv = vectorizer.fit_transform(sentences)
  return features_cv, vectorizer.get_feature_names_out()
  # print(features_cv.shape)
  # print('Sparse Matrix :\n', features_cv)
  # features_cv = pd.DataFrame(features_cv.toarray())
  # features_cv.columns = vectorizer.get_feature_names_out()
  # features_cv
print(count_vectorize(sentences)[0])

  (0, 0)	2
  (0, 4)	1
  (0, 6)	1
  (1, 1)	1
  (1, 5)	1
  (1, 3)	1
  (2, 3)	1
  (2, 2)	1


In [244]:
# prompt: create function n grams count_vectorize

def ngram_count_vectorize(sentences, ngram_range=(1, 2)):
  ngram_vect = CountVectorizer(ngram_range=ngram_range)
  features_ng = ngram_vect.fit_transform(sentences)
  return features_ng, ngram_vect.get_feature_names_out()
  # print(features_ng.shape)
  # print('Sparse Matrix :\n', features_ng)
  # features_ng = pd.DataFrame(features_ng.toarray())
  # features_ng.columns = ngram_vect.get_feature_names_out()
  # features_ng
print(ngram_count_vectorize(sentences)[0])


  (0, 0)	2
  (0, 7)	1
  (0, 11)	1
  (0, 1)	1
  (0, 8)	1
  (0, 12)	1
  (1, 2)	1
  (1, 9)	1
  (1, 5)	1
  (1, 3)	1
  (1, 10)	1
  (2, 5)	1
  (2, 4)	1
  (2, 6)	1


In [245]:
# prompt: create function tf idf

from sklearn.feature_extraction.text import TfidfVectorizer

def tfidf_vectorize(sentences):
  tfidf_vect = TfidfVectorizer()
  features_tfidf = tfidf_vect.fit_transform(sentences)
  return features_tfidf, tfidf_vect.get_feature_names_out()
print((tfidf_vectorize(sentences)[0]))


  (0, 6)	0.40824829046386296
  (0, 4)	0.40824829046386296
  (0, 0)	0.8164965809277259
  (1, 3)	0.4736296010332684
  (1, 5)	0.6227660078332259
  (1, 1)	0.6227660078332259
  (2, 2)	0.7959605415681652
  (2, 3)	0.6053485081062916


## apply the algorithm of vectorize

In [189]:
features_tfidf, feature_names = tfidf_vectorize(df['Clean_Message'].values)
features_tfidf = pd.DataFrame(features_tfidf.toarray())
features_tfidf.columns = feature_names
features_tfidf

Unnamed: 0,008704050406,0089mi,0121,01223585236,01223585334,0125698789,02,020603,0207,02070836089,...,zero,zhong,zindgi,zoe,zogtoriu,zoom,zouk,zyada,üll,〨ud
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5568,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5570,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [212]:
# prompt: make concatonate for matrix x on features_tfidf and df without clean message

concat_df = pd.concat([features_tfidf,df.drop(['Clean_Message'], axis=1)], axis=1)
concat_df


Unnamed: 0,008704050406,0089mi,0121,01223585236,01223585334,0125698789,02,020603,0207,02070836089,...,zoe,zogtoriu,zoom,zouk,zyada,üll,〨ud,Category,body_length,punc%
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ham,0.511185,0.371240
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ham,-0.857270,2.501968
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,spam,1.245478,-0.346334
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ham,-0.523501,1.071820
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ham,-0.323239,-0.446638
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,spam,1.328921,-0.155128
5568,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ham,-0.740451,-0.531469
5569,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ham,-0.389993,1.077883
5570,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,ham,0.744824,-0.866413


# model building

In [213]:
X = concat_df.drop('Category', axis = 1)
y = df['Category']

In [214]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

print('X_train shape:', X_train.shape)
print('y_train shape:', y_train.shape)
print('X_test shape:', X_test.shape)
print('y_test shape:', y_test.shape)


X_train shape: (4457, 8090)
y_train shape: (4457,)
X_test shape: (1115, 8090)
y_test shape: (1115,)


In [215]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score


random_forest = RandomForestClassifier()
random_forest.fit(X_train, y_train)

print("score: ", random_forest.score(X_train, y_train))
y_pred = random_forest.predict(X_test)
print("accuracy:",accuracy_score(y_test, y_pred))

score:  0.9997756338344178
accuracy: 0.9802690582959641


In [261]:


feature_importances = pd.Series(random_forest.feature_importances_, index=X_train.columns)
feature_importances.sort_values(ascending=False).head(15)


body_length    0.051772
free           0.023540
txt            0.021995
claim          0.020731
mobil          0.019917
call           0.017973
prize          0.017970
repli          0.016110
win            0.013373
servic         0.012302
stop           0.011928
tone           0.011683
urgent         0.011310
text           0.010820
cash           0.010269
dtype: float64

In [216]:
from sklearn.svm import SVC
svm = SVC()
svm.fit(X_train, y_train)
y_pred = svm.predict(X_test)
print("accuracy:",accuracy_score(y_test, y_pred))


accuracy: 0.979372197309417


In [262]:
feature_importances = pd.Series(random_forest.feature_importances_, index=X_train.columns)
feature_importances.sort_values(ascending=False).head(15)


body_length    0.051772
free           0.023540
txt            0.021995
claim          0.020731
mobil          0.019917
call           0.017973
prize          0.017970
repli          0.016110
win            0.013373
servic         0.012302
stop           0.011928
tone           0.011683
urgent         0.011310
text           0.010820
cash           0.010269
dtype: float64

# deployment

In [228]:
message ="""
Draw or make interesting and/or creative "visuals" about any of the concepts included in the course.
These "visuals" that you create could be drawings, gif files, or stand-alone animations, PowerPoint animations ,or short videos (less than 30 seconds).
If you use any media as raw material to create your visuals (e.g. use an image to create gif files or short videos):
1- you must have drawn raw media yourself, or
2- or it should have "creative commons" or public domain license, and you have to paste a link to the source of the media, write a citation/attribution to the author, mention the type of license, and how you knew that this was the type of license.

Note that we may use some of the content that you submit in future courses (while mentioning your name as the author).
By submitting the content here, you consent to giving us permission to do so.
""".strip()

In [258]:

text = clean_text(message)
vectorizer = TfidfVectorizer(max_features=features_tfidf.shape[1])

# Fit the vectorizer if you haven't already (using training data)
vectorizer.fit(features_tfidf)

# Transform the new text data
new_text_vector = vectorizer.transform([text])

print(new_text_vector.shape)

(1, 8088)


In [254]:
text_df = pd.DataFrame(new_text_vector.toarray())
text_df.columns = features_tfidf.columns
text_df['body_length'] = len(text)
text_df['punc%'] = sum(c in string.punctuation for c in text) / len(text) * 100
text_df


Unnamed: 0,008704050406,0089mi,0121,01223585236,01223585334,0125698789,02,020603,0207,02070836089,...,zindgi,zoe,zogtoriu,zoom,zouk,zyada,üll,〨ud,body_length,punc%
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4812,0.0


In [255]:
y_pred_new = random_forest.predict(text_df)
print(y_pred_new)


['ham']
