In [None]:
## IMPORT LIBRARIES ##
import re
import pandas as pd
import numpy as np
import nltk
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer
import textblob
from textblob import TextBlob, Word 
import sklearn
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix,accuracy_score

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [None]:
## EXTRACT DATA SAMPLE ##
data = pd.read_csv("Combined_data.csv")
# Take explicit copies of the row ranges. A plain slice is still tied to `data`,
# so adding a column to it later (as clean_text does) makes pandas emit
# SettingWithCopyWarning; an independent copy can be extended safely.
data_train = data[:100].copy()      # first 100 rows -> training sample
data_test = data[100:150].copy()    # next 50 rows   -> test sample
data_train.head()

Unnamed: 0,text,labels
0,"Hey, a guy I know is breathing down my neck to...",0
1,Mr Deeds! am i 87? these numbers mean nothing ...,0
2,Why nothing. Ok anyway give me treat,0
3,no child support.,0
4,why do you want a massage,1


In [None]:
## DATA CLEANING AND LEMMATIZING UTILITY ##
def clean_text(data):
  """Add a 'clean-text' column with a normalised, lemmatized copy of data['text'].

  Per row the text is lower-cased, URL-like substrings are removed, English
  contractions are expanded, and every non-letter character is replaced by a
  space. The remaining words are tokenized with TextBlob, filtered against
  the NLTK English stopword list and lemmatized.

  Args:
    data: DataFrame with a string 'text' column. It is modified in place.

  Returns:
    None. The cleaned strings are written to data['clean-text'].
  """
  stopwords_en = set(stopwords.words('english'))
  list_word_clean = []
  for t in data['text']:
    sent = t.lower()
    # The URL pattern is applied to the raw text so it sees whole URLs rather
    # than tokenizer output.
    sent = re.sub(r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*',
                  '', sent, flags=re.MULTILINE)
    # Contractions must be expanded BEFORE non-letters are stripped: the
    # patterns below all rely on the apostrophe, which the '[^a-zA-Z]'
    # substitution removes. Specific forms come first, generic suffixes after.
    sent = re.sub(r"won\'t", "will not", sent)
    sent = re.sub(r"can\'t", "can not", sent)
    sent = re.sub(r"n\'t", " not", sent)
    sent = re.sub(r"\'re", " are", sent)
    sent = re.sub(r"\'s", " is", sent)
    sent = re.sub(r"\'d", " would", sent)
    sent = re.sub(r"\'ll", " will", sent)
    sent = re.sub(r"\'t", " not", sent)
    sent = re.sub(r"\'ve", " have", sent)
    sent = re.sub(r"\'m", " am", sent)
    # Digits and punctuation become spaces, so no separate isdigit() filter
    # is needed when the words are selected below.
    sent = re.sub('[^a-zA-Z]', ' ', sent)
    sent = " ".join([word.lemmatize() for word in TextBlob(sent).words if word not in stopwords_en])
    list_word_clean.append(sent)
  data['clean-text'] = list_word_clean

# Clean the training sample first, then the test sample (each is modified in place).
for frame in (data_train, data_test):
  clean_text(frame)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
# Show the first five rows of the full frame.
data.head(n=5)

Unnamed: 0,text,labels
0,"Hey, a guy I know is breathing down my neck to...",0
1,Mr Deeds! am i 87? these numbers mean nothing ...,0
2,Why nothing. Ok anyway give me treat,0
3,no child support.,0
4,why do you want a massage,1


In [None]:
## VECTORIZING TEXT ##
# Word-count features: the vocabulary is fitted on the training text only and
# then reused unchanged to encode the test text.
cv = CountVectorizer(max_features=1500)
x_train, x_test = (
  cv.fit_transform(data_train['clean-text']).toarray(),
  cv.transform(data_test['clean-text']).toarray(),
)
y_train, y_test = data_train['labels'], data_test['labels']

In [None]:
## CLASSIFIER - NAIVE BAYES ##
# Fit a Gaussian Naive Bayes model on the lemmatized count features.
classifier = GaussianNB()
classifier.fit(X=x_train, y=y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [None]:
## PREDICTION ##
# Predicted labels for the held-out test rows.
y_pred = classifier.predict(X=x_test)

In [None]:
## CONFUSION MATRIX AND SCORE ##
# Compare the true test labels with the predictions and print both metrics.
cm, sc = confusion_matrix(y_test, y_pred), accuracy_score(y_test, y_pred)
print(f"{cm}, {sc}")

[[16  7]
 [ 8 19]], 0.7


In [None]:
## STEMMING TEXT AND CLASSIFYING ##
def clean_text(data):
  """Add a 'stem-text' column with a normalised, Porter-stemmed copy of data['text'].

  NOTE: this redefines the clean_text from the lemmatizing cell earlier in
  the notebook; once this cell has run, calling clean_text writes
  'stem-text' and the lemmatizing version is no longer reachable by name.

  Per row the text is lower-cased, URL-like substrings are removed, English
  contractions are expanded, and every non-letter character is replaced by a
  space. The remaining words are tokenized with TextBlob, filtered against
  the NLTK English stopword list and stemmed with PorterStemmer.

  Args:
    data: DataFrame with a string 'text' column. It is modified in place.

  Returns:
    None. The stemmed strings are written to data['stem-text'].
  """
  stopwords_en = set(stopwords.words('english'))
  ps = PorterStemmer()  # one stemmer reused for every row
  list_word_clean = []
  for t in data['text']:
    sent = t.lower()
    # The URL pattern is applied to the raw text so it sees whole URLs rather
    # than tokenizer output.
    sent = re.sub(r'((http|https)\:\/\/)?[a-zA-Z0-9\.\/\?\:@\-_=#]+\.([a-zA-Z]){2,6}([a-zA-Z0-9\.\&\/\?\:@\-_=#])*',
                  '', sent, flags=re.MULTILINE)
    # Contractions must be expanded BEFORE non-letters are stripped: the
    # patterns below all rely on the apostrophe, which the '[^a-zA-Z]'
    # substitution removes. Specific forms come first, generic suffixes after.
    sent = re.sub(r"won\'t", "will not", sent)
    sent = re.sub(r"can\'t", "can not", sent)
    sent = re.sub(r"n\'t", " not", sent)
    sent = re.sub(r"\'re", " are", sent)
    sent = re.sub(r"\'s", " is", sent)
    sent = re.sub(r"\'d", " would", sent)
    sent = re.sub(r"\'ll", " will", sent)
    sent = re.sub(r"\'t", " not", sent)
    sent = re.sub(r"\'ve", " have", sent)
    sent = re.sub(r"\'m", " am", sent)
    # Digits and punctuation become spaces, so no separate isdigit() filter
    # is needed when the words are selected below.
    sent = re.sub('[^a-zA-Z]', ' ', sent)
    sent = " ".join([ps.stem(word) for word in TextBlob(sent).words if word not in stopwords_en])
    list_word_clean.append(sent)
  data['stem-text'] = list_word_clean

# Stem the training sample first, then the test sample (each is modified in place).
for frame in (data_train, data_test):
  clean_text(frame)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy


In [None]:
# Show the first five training rows, including the derived text columns.
data_train.head(n=5)

Unnamed: 0,text,labels,clean-text,stem-text
0,"Hey, a guy I know is breathing down my neck to...",0,hey guy know breathing neck get bud anyway d ...,hey guy know breath neck get bud anyway d abl...
1,Mr Deeds! am i 87? these numbers mean nothing ...,0,mr deed number mean nothing fair well mr puffy...,mr deed number mean noth fair well mr puffi ja...
2,Why nothing. Ok anyway give me treat,0,nothing ok anyway give treat,noth ok anyway give treat
3,no child support.,0,child support,child support
4,why do you want a massage,1,want massage,want massag


In [None]:
# Word-count features for the stemmed text: the vocabulary is fitted on the
# training rows only and then reused unchanged to encode the test rows.
cv = CountVectorizer(max_features=1500)
x_train, x_test = (
  cv.fit_transform(data_train['stem-text']).toarray(),
  cv.transform(data_test['stem-text']).toarray(),
)
y_train, y_test = data_train['labels'], data_test['labels']

In [None]:
# Fit a fresh Gaussian Naive Bayes model on the stemmed count features.
classifier = GaussianNB()
classifier.fit(X=x_train, y=y_train)

GaussianNB(priors=None, var_smoothing=1e-09)

In [None]:
# Predicted labels for the held-out test rows (stemmed features).
y_pred = classifier.predict(X=x_test)

In [None]:
# Compare the true test labels with the predictions and print both metrics.
cm, sc = confusion_matrix(y_test, y_pred), accuracy_score(y_test, y_pred)
print(f"{cm}, {sc}")

[[20  3]
 [ 9 18]], 0.76
