In [15]:
import numpy as np
import pandas as pd
import seaborn as sns
import nltk
import re

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag, word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, confusion_matrix

print("all libraries installed successfully")

all libraries installed successfully


In [16]:
df = pd.read_csv(r'combined_data.csv')

display(df.head())
print(df.info())
display(df.describe)

Unnamed: 0,label,text
0,1,ounce feather bowl hummingbird opec moment ala...
1,1,wulvob get your medircations online qnb ikud v...
2,0,computer connection from cnn com wednesday es...
3,1,university degree obtain a prosperous future m...
4,0,thanks for all your answers guys i know i shou...


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 83448 entries, 0 to 83447
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   label   83448 non-null  int64 
 1   text    83448 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.3+ MB
None


<bound method NDFrame.describe of        label                                               text
0          1  ounce feather bowl hummingbird opec moment ala...
1          1  wulvob get your medircations online qnb ikud v...
2          0   computer connection from cnn com wednesday es...
3          1  university degree obtain a prosperous future m...
4          0  thanks for all your answers guys i know i shou...
...      ...                                                ...
83443      0  hi given a date how do i get the last date of ...
83444      1  now you can order software on cd or download i...
83445      1  dear valued member canadianpharmacy provides a...
83446      0  subscribe change profile contact us long term ...
83447      1  get the most out of life ! viagra has helped m...

[83448 rows x 2 columns]>

In [17]:
# checking for the empty values
print(df.isna().sum())

label    0
text     0
dtype: int64


In [18]:
# preprocessing

# since already in 0 and 1
# df['label'] = df['label'].map({'ham':0, 'spam':1})

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

stop_words = set(stopwords.words('english'))
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

clean_texts = []

for text in df['text']:
    text = text.lower()
    text = re.sub(r'[^a-z\s]', '', text)
    tokens = word_tokenize(text)
    tokens = [t for t in tokens if t not in stop_words]
    tokens = [stemmer.stem(lemmatizer.lemmatize(t)) for t in tokens]
    # removed these lines to keep it simpler and efficient
    # tagged = pos_tag(tokens)
    # clean_texts.append(" ".join([w for w, pos in tagged]))
    clean_texts.append(" ".join(tokens))


df['cleaned_text'] = clean_texts



X = df['cleaned_text']
y = df['label']

Xtrain, Xtest, ytrain, ytest = train_test_split(X, y, test_size=0.2, random_state=42)

vectorizer = CountVectorizer() #simple bag of words
xtrainVec = vectorizer.fit_transform(Xtrain)
xtestVec = vectorizer.transform(Xtest)

model = MultinomialNB()
model.fit(xtrainVec, ytrain)

y_pred = model.predict(xtestVec)

print(classification_report(ytest, y_pred))
print(confusion_matrix(ytest, y_pred))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\azama\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\azama\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\azama\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\azama\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


              precision    recall  f1-score   support

           0       0.96      0.99      0.97      7938
           1       0.99      0.96      0.97      8752

    accuracy                           0.97     16690
   macro avg       0.97      0.97      0.97     16690
weighted avg       0.97      0.97      0.97     16690

[[7826  112]
 [ 326 8426]]


In [25]:
sample = ["thanks for all your answers guys i know i should have checked the rsync manual but i would rather get a escapenumber sure answer from one of you this is my current script bin bash rsync avt \\ exclude alpha \\ exclude arm \\ exclude hppa \\ exclude hurd \\ exclude iaescapenumber \\ exclude mescapenumberk \\ exclude mips \\ exclude mipsel \\ exclude multi arch \\ exclude powerpc \\ exclude sescapenumber \\ exclude sh \\ exclude sparc \\ exclude source \\ ftp de debian org debian cd var www mirror debian cd i know loads of excludes for now will include more distros soon from the rsync manual del an alias for delete during delete delete extraneous files from dest dirs delete before receiver deletes before transfer default delete during receiver deletes during xfer not before delete after receiver deletes after transfer not before delete excluded also delete excluded files from dest dirs which delete would you suggest i use thanks again john escapelong on escapenumber escapenumber escapenumber olleg samoylov wrote jonathan escapelong wrote sorry for the banal question my favourite keys for escapenumber stage rsync rsync verbose recursive links hard links times filter 'r tmp ' delete after delay updates source url destination log file olleg samoylov www escapelong org mirror escapelong org rcrack escapelong org ninux org wireless community rome "]

sampleVec = vectorizer.transform(sample)
prediction = model.predict(sampleVec)
print(f"Sample Prediction: {prediction}")
print(f"Sample Probabilities: {model.predict_proba(sampleVec)}")
print(f"Sample Accuracy: {model.score(sampleVec, [1])}")

Sample Prediction: [0]
Sample Probabilities: [[1.00000000e+00 4.82144509e-36]]
Sample Accuracy: 0.0
